/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/plain_table_key_coding.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | |
21 | | #pragma once |
22 | | #ifndef ROCKSDB_LITE |
23 | | |
24 | | #include <array> |
25 | | #include "yb/util/slice.h" |
26 | | #include "yb/rocksdb/db/dbformat.h" |
27 | | #include "yb/rocksdb/table/plain_table_reader.h" |
28 | | |
29 | | namespace rocksdb { |
30 | | |
31 | | class WritableFile; |
32 | | struct ParsedInternalKey; |
33 | | struct PlainTableReaderFileInfo; |
34 | | enum PlainTableEntryType : unsigned char; |
35 | | |
36 | | // Helper class to write out a key to an output file |
37 | | // Actual data format of the key is documented in plain_table_factory.h |
38 | | class PlainTableKeyEncoder { |
39 | | public: |
40 | | explicit PlainTableKeyEncoder(EncodingType encoding_type, |
41 | | uint32_t user_key_len, |
42 | | const SliceTransform* prefix_extractor, |
43 | | size_t index_sparseness) |
44 | | : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain), |
45 | | fixed_user_key_len_(user_key_len), |
46 | | prefix_extractor_(prefix_extractor), |
47 | | index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), |
48 | 2.06k | key_count_for_prefix_(0) {} |
49 | | // key: the key to write out, in the format of internal key. |
50 | | // file: the output file to write out |
51 | | // offset: offset in the file. Needs to be updated after appending bytes |
52 | | // for the key |
53 | | // meta_bytes_buf: buffer for extra meta bytes |
54 | | // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated |
55 | | // if meta_bytes_buf is updated. |
56 | | Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset, |
57 | | char* meta_bytes_buf, size_t* meta_bytes_buf_size); |
58 | | |
59 | | // Return actual encoding type to be picked |
60 | 2.06k | EncodingType GetEncodingType() { return encoding_type_; } |
61 | | |
62 | | private: |
63 | | EncodingType encoding_type_; |
64 | | uint32_t fixed_user_key_len_; |
65 | | const SliceTransform* prefix_extractor_; |
66 | | const size_t index_sparseness_; |
67 | | size_t key_count_for_prefix_; |
68 | | IterKey pre_prefix_; |
69 | | }; |
70 | | |
71 | | class PlainTableFileReader { |
72 | | public: |
73 | | explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) |
74 | 89.2k | : file_info_(_file_info), num_buf_(0) {} |
75 | | // In mmaped mode, the results point to mmaped area of the file, which |
76 | | // means it is always valid before closing the file. |
77 | | // In non-mmap mode, the results point to an internal buffer. If the caller |
78 | | // makes another read call, the results may not be valid. So callers should |
79 | | // make a copy when needed. |
80 | | // In order to save read calls to files, we keep two internal buffers: |
81 | | // the first read and the most recent read. This is efficient because it |
82 | | // columns these two common use cases: |
83 | | // (1) hash index only identify one location, we read the key to verify |
84 | | // the location, and read key and value if it is the right location. |
85 | | // (2) after hash index checking, we identify two locations (because of |
86 | | // hash bucket conflicts), we binary search the two location to see |
87 | | // which one is what we need and start to read from the location. |
88 | | // These two most common use cases will be covered by the two buffers |
89 | | // so that we don't need to re-read the same location. |
90 | | // Currently we keep a fixed size buffer. If a read doesn't exactly fit |
91 | | // the buffer, we replace the second buffer with the location user reads. |
92 | | // |
93 | | // If return false, status code is stored in status_. |
94 | 12.1M | bool Read(uint32_t file_offset, uint32_t len, Slice* out) { |
95 | 12.1M | if (file_info_->is_mmap_mode) { |
96 | 6.95M | assert(file_offset + len <= file_info_->data_end_offset); |
97 | 6.95M | *out = Slice(file_info_->file_data.data() + file_offset, len); |
98 | 6.95M | return true; |
99 | 5.21M | } else { |
100 | 5.21M | return ReadNonMmap(file_offset, len, out); |
101 | 5.21M | } |
102 | 12.1M | } |
103 | | |
104 | | // If return false, status code is stored in status_. |
105 | | bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output); |
106 | | |
107 | | // *bytes_read = 0 means eof. false means failure and status is saved |
108 | | // in status_. Not directly returning Status to save copying status |
109 | | // object to map previous performance of mmap mode. |
110 | | inline bool ReadVarint32(uint32_t offset, uint32_t* output, |
111 | | uint32_t* bytes_read); |
112 | | |
113 | | bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output, |
114 | | uint32_t* bytes_read); |
115 | | |
116 | 0 | Status status() const { return status_; } |
117 | | |
118 | 4.30M | const PlainTableReaderFileInfo* file_info() { return file_info_; } |
119 | | |
120 | | private: |
121 | | const PlainTableReaderFileInfo* file_info_; |
122 | | |
123 | | struct Buffer { |
124 | 42.7k | Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {} |
125 | | std::unique_ptr<char[]> buf; |
126 | | uint32_t buf_start_offset; |
127 | | uint32_t buf_len; |
128 | | uint32_t buf_capacity; |
129 | | }; |
130 | | |
131 | | // Keep buffers for two recent reads. |
132 | | std::array<unique_ptr<Buffer>, 2> buffers_; |
133 | | uint32_t num_buf_; |
134 | | Status status_; |
135 | | |
136 | | Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len); |
137 | | }; |
138 | | |
139 | | // A helper class to decode keys from input buffer |
140 | | // Actual data format of the key is documented in plain_table_factory.h |
141 | | class PlainTableKeyDecoder { |
142 | | public: |
143 | | explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, |
144 | | EncodingType encoding_type, |
145 | | uint32_t user_key_len, |
146 | | const SliceTransform* prefix_extractor) |
147 | | : file_reader_(file_info), |
148 | | encoding_type_(encoding_type), |
149 | | prefix_len_(0), |
150 | | fixed_user_key_len_(user_key_len), |
151 | | prefix_extractor_(prefix_extractor), |
152 | 89.2k | in_prefix_(false) {} |
153 | | // Find the next key. |
154 | | // start: char array where the key starts. |
155 | | // limit: boundary of the char array |
156 | | // parsed_key: the output of the result key |
157 | | // internal_key: if not null, fill with the output of the result key in |
158 | | // un-parsed format |
159 | | // bytes_read: how many bytes read from start. Output |
160 | | // seekable: whether key can be read from this place. Used when building |
161 | | // indexes. Output. |
162 | | Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key, |
163 | | Slice* internal_key, Slice* value, uint32_t* bytes_read, |
164 | | bool* seekable = nullptr); |
165 | | |
166 | | Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key, |
167 | | Slice* internal_key, uint32_t* bytes_read, |
168 | | bool* seekable = nullptr); |
169 | | |
170 | | PlainTableFileReader file_reader_; |
171 | | EncodingType encoding_type_; |
172 | | uint32_t prefix_len_; |
173 | | uint32_t fixed_user_key_len_; |
174 | | Slice saved_user_key_; |
175 | | IterKey cur_key_; |
176 | | const SliceTransform* prefix_extractor_; |
177 | | bool in_prefix_; |
178 | | |
179 | | private: |
180 | | Status NextPlainEncodingKey(uint32_t start_offset, |
181 | | ParsedInternalKey* parsed_key, |
182 | | Slice* internal_key, uint32_t* bytes_read, |
183 | | bool* seekable = nullptr); |
184 | | Status NextPrefixEncodingKey(uint32_t start_offset, |
185 | | ParsedInternalKey* parsed_key, |
186 | | Slice* internal_key, uint32_t* bytes_read, |
187 | | bool* seekable = nullptr); |
188 | | Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size, |
189 | | ParsedInternalKey* parsed_key, uint32_t* bytes_read, |
190 | | bool* internal_key_valid, Slice* internal_key); |
191 | | inline Status DecodeSize(uint32_t start_offset, |
192 | | PlainTableEntryType* entry_type, uint32_t* key_size, |
193 | | uint32_t* bytes_read); |
194 | | }; |
195 | | |
196 | | } // namespace rocksdb |
197 | | |
198 | | #endif // ROCKSDB_LITE |