/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/plain_table_reader.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
2 | | // Use of this source code is governed by a BSD-style license that can be |
3 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
4 | | // |
5 | | // The following only applies to changes made to this file as part of YugaByte development. |
6 | | // |
7 | | // Portions Copyright (c) YugaByte, Inc. |
8 | | // |
9 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
10 | | // in compliance with the License. You may obtain a copy of the License at |
11 | | // |
12 | | // http://www.apache.org/licenses/LICENSE-2.0 |
13 | | // |
14 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
15 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
16 | | // or implied. See the License for the specific language governing permissions and limitations |
17 | | // under the License. |
18 | | // |
19 | | |
20 | | #ifndef YB_ROCKSDB_TABLE_PLAIN_TABLE_READER_H |
21 | | #define YB_ROCKSDB_TABLE_PLAIN_TABLE_READER_H |
22 | | |
23 | | #ifndef ROCKSDB_LITE |
24 | | #include <stdint.h> |
25 | | #include <unordered_map> |
26 | | #include <memory> |
27 | | #include <vector> |
28 | | #include <string> |
29 | | |
30 | | #include "yb/rocksdb/db/dbformat.h" |
31 | | #include "yb/rocksdb/env.h" |
32 | | #include "yb/rocksdb/iterator.h" |
33 | | #include "yb/rocksdb/slice_transform.h" |
34 | | #include "yb/rocksdb/table.h" |
35 | | #include "yb/rocksdb/table_properties.h" |
36 | | |
37 | | #include "yb/rocksdb/table/format.h" |
38 | | #include "yb/rocksdb/table/table_reader.h" |
39 | | #include "yb/rocksdb/table/plain_table_factory.h" |
40 | | #include "yb/rocksdb/table/plain_table_index.h" |
41 | | |
42 | | #include "yb/rocksdb/util/arena.h" |
43 | | #include "yb/rocksdb/util/dynamic_bloom.h" |
44 | | #include "yb/rocksdb/util/file_reader_writer.h" |
45 | | |
46 | | namespace rocksdb { |
47 | | |
48 | | class Block; |
49 | | struct BlockContents; |
50 | | class BlockHandle; |
51 | | class Footer; |
52 | | struct Options; |
53 | | struct ReadOptions; |
54 | | class TableCache; |
55 | | class TableReader; |
56 | | class InternalKeyComparator; |
57 | | class PlainTableKeyDecoder; |
58 | | class GetContext; |
59 | | class InternalIterator; |
60 | | |
61 | | using std::unique_ptr; |
62 | | using std::unordered_map; |
63 | | using std::vector; |
64 | | extern const uint32_t kPlainTableVariableLength; |
65 | | |
66 | | struct PlainTableReaderFileInfo { |
67 | | bool is_mmap_mode; |
68 | | Slice file_data; |
69 | | uint32_t data_end_offset; |
70 | | unique_ptr<RandomAccessFileReader> file; |
71 | | |
72 | | PlainTableReaderFileInfo(unique_ptr<RandomAccessFileReader>&& _file, |
73 | | const EnvOptions& storage_options, |
74 | | uint32_t _data_size_offset) |
75 | | : is_mmap_mode(storage_options.use_mmap_reads), |
76 | | data_end_offset(_data_size_offset), |
77 | 3.11k | file(std::move(_file)) {} |
78 | | }; |
79 | | |
80 | | // Based on following output file format shown in plain_table_factory.h |
81 | | // When opening the output file, IndexedTableReader creates a hash table |
82 | | // from key prefixes to offset of the output file. IndexedTable will decide |
83 | | // whether it points to the data offset of the first key with the key prefix |
84 | | // or the offset of it. If there are too many keys share this prefix, it will |
85 | | // create a binary search-able index from the suffix to offset on disk. |
86 | | // |
87 | | // The implementation of IndexedTableReader requires output file is mmaped |
88 | | class PlainTableReader: public TableReader { |
89 | | public: |
90 | | static Status Open(const ImmutableCFOptions& ioptions, |
91 | | const EnvOptions& env_options, |
92 | | const InternalKeyComparatorPtr& internal_comparator, |
93 | | unique_ptr<RandomAccessFileReader>&& file, |
94 | | uint64_t file_size, unique_ptr<TableReader>* table, |
95 | | const int bloom_bits_per_key, double hash_table_ratio, |
96 | | size_t index_sparseness, size_t huge_page_tlb_size, |
97 | | bool full_scan_mode); |
98 | | |
99 | 2.34k | bool IsSplitSst() const override { return false; } |
100 | | |
101 | 0 | void SetDataFileReader(unique_ptr<RandomAccessFileReader>&& data_file) override { assert(false); } |
102 | | |
103 | | InternalIterator* NewIterator(const ReadOptions&, Arena* arena = nullptr, |
104 | | bool skip_filters = false) override; |
105 | | |
106 | | void Prepare(const Slice& target) override; |
107 | | |
108 | | Status Get(const ReadOptions&, const Slice& key, GetContext* get_context, |
109 | | bool skip_filters = false) override; |
110 | | |
111 | | uint64_t ApproximateOffsetOf(const Slice& key) override; |
112 | | |
113 | 17.0k | uint32_t GetIndexSize() const { return index_.GetIndexSize(); } |
114 | | void SetupForCompaction() override; |
115 | | |
116 | 1.60k | std::shared_ptr<const TableProperties> GetTableProperties() const override { |
117 | 1.60k | return table_properties_; |
118 | 1.60k | } |
119 | | |
120 | 48 | virtual size_t ApproximateMemoryUsage() const override { |
121 | 48 | return arena_.MemoryAllocatedBytes(); |
122 | 48 | } |
123 | | |
124 | | PlainTableReader(const ImmutableCFOptions& ioptions, |
125 | | unique_ptr<RandomAccessFileReader>&& file, |
126 | | const EnvOptions& env_options, |
127 | | const InternalKeyComparatorPtr& internal_comparator, |
128 | | EncodingType encoding_type, uint64_t file_size, |
129 | | const TableProperties* table_properties); |
130 | | virtual ~PlainTableReader(); |
131 | | |
132 | | protected: |
133 | | // Check bloom filter to see whether it might contain this prefix. |
134 | | // The hash of the prefix is given, since it can be reused for index lookup |
135 | | // too. |
136 | | virtual bool MatchBloom(uint32_t hash) const; |
137 | | |
138 | | // PopulateIndex() builds index of keys. It must be called before any query |
139 | | // to the table. |
140 | | // |
141 | | // props: the table properties object that need to be stored. Ownership of |
142 | | // the object will be passed. |
143 | | // |
144 | | |
145 | | Status PopulateIndex(TableProperties* props, int bloom_bits_per_key, |
146 | | double hash_table_ratio, size_t index_sparseness, |
147 | | size_t huge_page_tlb_size); |
148 | | |
149 | | Status MmapDataIfNeeded(); |
150 | | |
151 | | private: |
152 | | InternalKeyComparatorPtr internal_comparator_; |
153 | | EncodingType encoding_type_; |
154 | | // represents plain table's current status. |
155 | | Status status_; |
156 | | |
157 | | PlainTableIndex index_; |
158 | | bool full_scan_mode_; |
159 | | |
160 | | // data_start_offset_ and data_end_offset_ defines the range of the |
161 | | // sst file that stores data. |
162 | | const uint32_t data_start_offset_ = 0; |
163 | | const uint32_t user_key_len_; |
164 | | const SliceTransform* prefix_extractor_; |
165 | | |
166 | | static const size_t kNumInternalBytes = 8; |
167 | | |
168 | | // Bloom filter is used to rule out non-existent key |
169 | | bool enable_bloom_; |
170 | | DynamicBloom bloom_; |
171 | | PlainTableReaderFileInfo file_info_; |
172 | | Arena arena_; |
173 | | TrackedAllocation index_block_alloc_; |
174 | | TrackedAllocation bloom_block_alloc_; |
175 | | |
176 | | const ImmutableCFOptions& ioptions_; |
177 | | uint64_t file_size_; |
178 | | std::shared_ptr<const TableProperties> table_properties_; |
179 | | std::shared_ptr<yb::MemTracker> mem_tracker_; |
180 | | size_t tracked_consumption_ = 0; |
181 | | |
182 | 0 | bool IsFixedLength() const { |
183 | 0 | return user_key_len_ != kPlainTableVariableLength; |
184 | 0 | } |
185 | | |
186 | 0 | size_t GetFixedInternalKeyLength() const { |
187 | 0 | return user_key_len_ + kNumInternalBytes; |
188 | 0 | } |
189 | | |
190 | 164k | Slice GetPrefix(const Slice& target) const { |
191 | 164k | assert(target.size() >= 8); // target is internal key |
192 | 164k | return GetPrefixFromUserKey(GetUserKey(target)); |
193 | 164k | } |
194 | | |
195 | 2.07M | Slice GetPrefix(const ParsedInternalKey& target) const { |
196 | 2.07M | return GetPrefixFromUserKey(target.user_key); |
197 | 2.07M | } |
198 | | |
199 | 164k | Slice GetUserKey(const Slice& key) const { |
200 | 164k | return Slice(key.data(), key.size() - 8); |
201 | 164k | } |
202 | | |
203 | 2.24M | Slice GetPrefixFromUserKey(const Slice& user_key) const { |
204 | 2.24M | if (!IsTotalOrderMode()) { |
205 | 2.16M | return prefix_extractor_->Transform(user_key); |
206 | 73.7k | } else { |
207 | | // Use empty slice as prefix if prefix_extractor is not set. |
208 | | // In that case, |
209 | | // it falls back to pure binary search and |
210 | | // total iterator seek is supported. |
211 | 73.7k | return Slice(); |
212 | 73.7k | } |
213 | 2.24M | } |
214 | | |
215 | | friend class TableCache; |
216 | | friend class PlainTableIterator; |
217 | | |
218 | | // Internal helper function to generate an IndexRecordList object from all |
219 | | // the rows, which contains index records as a list. |
220 | | // If bloom_ is not null, all the keys' full-key hash will be added to the |
221 | | // bloom filter. |
222 | | Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder, |
223 | | vector<uint32_t>* prefix_hashes); |
224 | | |
225 | | // Internal helper function to allocate memory for bloom filter and fill it |
226 | | void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes, |
227 | | size_t huge_page_tlb_size, |
228 | | vector<uint32_t>* prefix_hashes); |
229 | | |
230 | | void FillBloom(vector<uint32_t>* prefix_hashes); |
231 | | |
232 | | // Read the key and value at `offset` to parameters for keys, the and |
233 | | // `seekable`. |
234 | | // On success, `offset` will be updated as the offset for the next key. |
235 | | // `parsed_key` will be key in parsed format. |
236 | | // if `internal_key` is not empty, it will be filled with key with slice |
237 | | // format. |
238 | | // if `seekable` is not null, it will return whether we can directly read |
239 | | // data using this offset. |
240 | | Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset, |
241 | | ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value, |
242 | | bool* seekable = nullptr) const; |
243 | | // Get file offset for key target. |
244 | | // return value prefix_matched is set to true if the offset is confirmed |
245 | | // for a key with the same prefix as target. |
246 | | Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target, |
247 | | const Slice& prefix, uint32_t prefix_hash, |
248 | | bool* prefix_matched, uint32_t* offset) const; |
249 | | |
250 | 2.38M | bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } |
251 | | |
252 | | // No copying allowed |
253 | | explicit PlainTableReader(const TableReader&) = delete; |
254 | | void operator=(const TableReader&) = delete; |
255 | | }; |
256 | | } // namespace rocksdb |
257 | | #endif // ROCKSDB_LITE |
258 | | |
259 | | #endif // YB_ROCKSDB_TABLE_PLAIN_TABLE_READER_H |