/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/format.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | |
24 | | #include "yb/rocksdb/table/format.h" |
25 | | |
26 | | #include <inttypes.h> |
27 | | |
28 | | #include <string> |
29 | | |
30 | | #include "yb/rocksdb/env.h" |
31 | | #include "yb/rocksdb/util/coding.h" |
32 | | #include "yb/rocksdb/util/compression.h" |
33 | | #include "yb/rocksdb/util/crc32c.h" |
34 | | #include "yb/rocksdb/util/file_reader_writer.h" |
35 | | #include "yb/rocksdb/util/perf_context_imp.h" |
36 | | #include "yb/rocksdb/util/xxhash.h" |
37 | | |
38 | | #include "yb/util/debug-util.h" |
39 | | #include "yb/util/env.h" |
40 | | #include "yb/util/mem_tracker.h" |
41 | | #include "yb/util/result.h" |
42 | | #include "yb/util/stats/perf_step_timer.h" |
43 | | #include "yb/util/status_format.h" |
44 | | #include "yb/util/std_util.h" |
45 | | #include "yb/util/string_util.h" |
46 | | |
47 | | using yb::Format; |
48 | | using yb::Result; |
49 | | |
50 | | namespace rocksdb { |
51 | | |
52 | | extern const uint64_t kLegacyBlockBasedTableMagicNumber; |
53 | | extern const uint64_t kBlockBasedTableMagicNumber; |
54 | | |
55 | | #ifndef ROCKSDB_LITE |
56 | | extern const uint64_t kLegacyPlainTableMagicNumber; |
57 | | extern const uint64_t kPlainTableMagicNumber; |
58 | | #else |
59 | | // ROCKSDB_LITE doesn't have plain table |
60 | | const uint64_t kLegacyPlainTableMagicNumber = 0; |
61 | | const uint64_t kPlainTableMagicNumber = 0; |
62 | | #endif |
63 | | const uint32_t DefaultStackBufferSize = 5000; |
64 | | |
65 | 2.75M | void BlockHandle::AppendEncodedTo(std::string* dst) const { |
66 | | // Sanity check that all fields have been set |
67 | 2.75M | DCHECK_NE(offset_, kUint64FieldNotSet); |
68 | 2.75M | DCHECK_NE(size_, kUint64FieldNotSet); |
69 | 2.75M | PutVarint64(dst, offset_); |
70 | 2.75M | PutVarint64(dst, size_); |
71 | 2.75M | } |
72 | | |
73 | 24.6M | Status BlockHandle::DecodeFrom(Slice* input) { |
74 | 24.6M | if (GetVarint64(input, &offset_) && |
75 | 24.6M | GetVarint64(input, &size_)) { |
76 | 24.6M | return Status::OK(); |
77 | 3.03k | } else { |
78 | 3.03k | return STATUS(Corruption, "bad block handle"); |
79 | 3.03k | } |
80 | 24.6M | } |
81 | | |
82 | | // Return a string containing the encoded handle, rendered as uppercase hex when 'hex' is true. |
83 | 6 | std::string BlockHandle::ToString(bool hex) const { |
84 | 6 | std::string handle_str; |
85 | 6 | AppendEncodedTo(&handle_str); |
86 | 6 | if (hex) { |
87 | 6 | std::string result; |
88 | 6 | char buf[10]; |
89 | 24 | for (size_t i = 0; i < handle_str.size(); i++) { |
90 | 18 | snprintf(buf, sizeof(buf), "%02X", |
91 | 18 | static_cast<unsigned char>(handle_str[i])); |
92 | 18 | result += buf; |
93 | 18 | } |
94 | 6 | return result; |
95 | 0 | } else { |
96 | 0 | return handle_str; |
97 | 0 | } |
98 | 6 | } |
99 | | |
100 | 26 | std::string BlockHandle::ToDebugString() const { |
101 | 26 | return Format("BlockHandle { offset: $0 size: $1 }", offset_, size_); |
102 | 26 | } |
103 | | |
104 | | const BlockHandle BlockHandle::kNullBlockHandle(0, 0); |
105 | | |
106 | | namespace { |
107 | 518k | inline bool IsLegacyFooterFormat(uint64_t magic_number) { |
108 | 518k | return magic_number == kLegacyBlockBasedTableMagicNumber || |
109 | 518k | magic_number == kLegacyPlainTableMagicNumber; |
110 | 518k | } |
111 | 10.5k | inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { |
112 | 10.5k | if (magic_number == kLegacyBlockBasedTableMagicNumber) { |
113 | 1 | return kBlockBasedTableMagicNumber; |
114 | 1 | } |
115 | 10.5k | if (magic_number == kLegacyPlainTableMagicNumber) { |
116 | 10.5k | return kPlainTableMagicNumber; |
117 | 10.5k | } |
118 | 0 | assert(false); |
119 | 0 | return 0; |
120 | 0 | } |
121 | | } // namespace |
122 | | |
123 | | // legacy footer format: |
124 | | // metaindex handle (varint64 offset, varint64 size) |
125 | | // index handle (varint64 offset, varint64 size) |
126 | | // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength |
127 | | // table_magic_number (8 bytes) |
128 | | // new footer format: |
129 | | // checksum type (char, 1 byte) |
130 | | // metaindex handle (varint64 offset, varint64 size) |
131 | | // index handle (varint64 offset, varint64 size) |
132 | | // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 |
133 | | // footer version (4 bytes) |
134 | | // table_magic_number (8 bytes) |
135 | 63.4k | void Footer::AppendEncodedTo(std::string* dst) const { |
136 | 63.4k | assert(HasInitializedTableMagicNumber()); |
137 | 63.4k | if (IsLegacyFooterFormat(table_magic_number())) { |
138 | | // has to be default checksum with legacy footer |
139 | 2.06k | assert(checksum_ == kCRC32c); |
140 | 2.06k | const size_t original_size = dst->size(); |
141 | 2.06k | metaindex_handle_.AppendEncodedTo(dst); |
142 | 2.06k | data_index_handle_.AppendEncodedTo(dst); |
143 | 2.06k | dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding |
144 | 2.06k | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); |
145 | 2.06k | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); |
146 | 2.06k | assert(dst->size() == original_size + kVersion0EncodedLength); |
147 | 61.4k | } else { |
148 | 61.4k | const size_t original_size = dst->size(); |
149 | 61.4k | dst->push_back(static_cast<char>(checksum_)); |
150 | 61.4k | metaindex_handle_.AppendEncodedTo(dst); |
151 | 61.4k | data_index_handle_.AppendEncodedTo(dst); |
152 | 61.4k | dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding |
153 | 61.4k | PutFixed32(dst, version()); |
154 | 61.4k | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); |
155 | 61.4k | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); |
156 | 61.4k | assert(dst->size() == original_size + kNewVersionsEncodedLength); |
157 | 61.4k | } |
158 | 63.4k | } |
159 | | |
160 | | Footer::Footer(uint64_t _table_magic_number, uint32_t _version) |
161 | | : version_(_version), |
162 | | checksum_(kCRC32c), |
163 | 348k | table_magic_number_(_table_magic_number) { |
164 | | // This should be guaranteed by constructor callers |
165 | 348k | assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); |
166 | 348k | } |
167 | | |
168 | 106k | Status Footer::DecodeFrom(Slice* input) { |
169 | 106k | RSTATUS_DCHECK( |
170 | 106k | !HasInitializedTableMagicNumber(), IllegalState, "Decoding into the same footer twice"); |
171 | 106k | RSTATUS_DCHECK(input != nullptr, IllegalState, "input can't be null"); |
172 | 106k | RSTATUS_DCHECK_GE(input->size(), kMinEncodedLength, Corruption, "Footer size is too small"); |
173 | | |
174 | 106k | const char *magic_ptr = input->cend() - kMagicNumberLengthByte; |
175 | 106k | const uint32_t magic_lo = DecodeFixed32(magic_ptr); |
176 | 106k | const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); |
177 | 106k | uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) | |
178 | 106k | (static_cast<uint64_t>(magic_lo))); |
179 | | |
180 | | // We check for legacy formats here and silently upconvert them |
181 | 106k | bool legacy = IsLegacyFooterFormat(magic); |
182 | 106k | if (legacy) { |
183 | 10.5k | magic = UpconvertLegacyFooterFormat(magic); |
184 | 10.5k | } |
185 | 106k | set_table_magic_number(magic); |
186 | | |
187 | 106k | if (legacy) { |
188 | | // The size is already asserted to be at least kMinEncodedLength |
189 | | // at the beginning of the function |
190 | 10.5k | input->remove_prefix(input->size() - kVersion0EncodedLength); |
191 | 10.5k | version_ = 0 /* legacy */; |
192 | 10.5k | checksum_ = kCRC32c; |
193 | 95.6k | } else { |
194 | 95.6k | version_ = DecodeFixed32(magic_ptr - 4); |
195 | | // Footer version 1 and higher will always occupy exactly this many bytes. |
196 | | // It consists of the checksum type, two block handles, padding, |
197 | | // a version number, and a magic number |
198 | 95.6k | if (input->size() < kNewVersionsEncodedLength) { |
199 | 0 | return STATUS(Corruption, "input is too short to be an sstable"); |
200 | 95.6k | } else { |
201 | 95.6k | input->remove_prefix(input->size() - kNewVersionsEncodedLength); |
202 | 95.6k | } |
203 | 95.6k | uint32_t chksum; |
204 | 95.6k | if (!GetVarint32(input, &chksum)) { |
205 | 0 | return STATUS(Corruption, "bad checksum type"); |
206 | 0 | } |
207 | 95.6k | checksum_ = static_cast<ChecksumType>(chksum); |
208 | 95.6k | } |
209 | | |
210 | 106k | Status result = metaindex_handle_.DecodeFrom(input); |
211 | 106k | if (result.ok()) { |
212 | 106k | result = data_index_handle_.DecodeFrom(input); |
213 | 106k | } |
214 | 106k | if (result.ok()) { |
215 | | // We skip over any leftover data (just padding for now) in "input" |
216 | 106k | const char* end = magic_ptr + kMagicNumberLengthByte; |
217 | 106k | *input = Slice(end, input->cend() - end); |
218 | 106k | } |
219 | 106k | return result; |
220 | 106k | } |
221 | | |
222 | 3 | std::string Footer::ToString() const { |
223 | 3 | std::string result, handle_; |
224 | 3 | result.reserve(1024); |
225 | | |
226 | 3 | bool legacy = IsLegacyFooterFormat(table_magic_number_); |
227 | 3 | if (legacy) { |
228 | 0 | result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); |
229 | 0 | result.append("data index handle: " + data_index_handle_.ToString() + "\n "); |
230 | 0 | result.append("table_magic_number: " + |
231 | 0 | rocksdb::ToString(table_magic_number_) + "\n "); |
232 | 3 | } else { |
233 | 3 | result.append("checksum: " + rocksdb::ToString(checksum_) + "\n "); |
234 | 3 | result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); |
235 | 3 | result.append("data index handle: " + data_index_handle_.ToString() + "\n "); |
236 | 3 | result.append("footer version: " + rocksdb::ToString(version_) + "\n "); |
237 | 3 | result.append("table_magic_number: " + |
238 | 3 | rocksdb::ToString(table_magic_number_) + "\n "); |
239 | 3 | } |
240 | 3 | return result; |
241 | 3 | } |
242 | | |
243 | 212k | Status CheckSSTableFileSize(RandomAccessFileReader* file, uint64_t file_size) { |
244 | 212k | if (file_size < Footer::kMinEncodedLength) { |
245 | 75 | return STATUS_FORMAT(Corruption, |
246 | 75 | "File is too short to be an SSTable: $0", |
247 | 75 | file->file()->filename()); |
248 | 75 | } |
249 | 212k | return Status::OK(); |
250 | 212k | } |
251 | | |
252 | | Status ReadFooterFromFile( |
253 | | RandomAccessFileReader* file, uint64_t file_size, Footer* footer, |
254 | 106k | uint64_t enforce_table_magic_number) { |
255 | 106k | RETURN_NOT_OK(CheckSSTableFileSize(file, file_size)); |
256 | | |
257 | 106k | char footer_space[Footer::kMaxEncodedLength]; |
258 | 106k | Slice footer_input; |
259 | 106k | size_t read_offset = |
260 | 106k | (file_size > Footer::kMaxEncodedLength) |
261 | 106k | ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength) |
262 | 11 | : 0; |
263 | 106k | const size_t read_size = std::min<size_t>(Footer::kMaxEncodedLength, file_size); |
264 | 106k | struct FooterValidator : public yb::ReadValidator { |
265 | 106k | FooterValidator(RandomAccessFileReader* file_, |
266 | 106k | Footer* footer_, |
267 | 106k | uint64_t enforce_table_magic_number_) |
268 | 106k | : file(file_), |
269 | 106k | footer(footer_), |
270 | 106k | enforce_table_magic_number(enforce_table_magic_number_) {} |
271 | | |
272 | 106k | CHECKED_STATUS Validate(const Slice& read_result) const override { |
273 | | // Check that we actually read the whole footer from the file. It may be that size isn't |
274 | | // correct. |
275 | 106k | RETURN_NOT_OK(CheckSSTableFileSize(file, read_result.size())); |
276 | 106k | Slice mutable_read_result(read_result); |
277 | 106k | *footer = Footer(); |
278 | 106k | RETURN_NOT_OK(footer->DecodeFrom(&mutable_read_result)); |
279 | 106k | if (enforce_table_magic_number != 0 && |
280 | 82.1k | enforce_table_magic_number != footer->table_magic_number()) { |
281 | 4 | return STATUS_FORMAT( |
282 | 4 | Corruption, "Bad table magic number: 0x$0, expected: 0x$1", |
283 | 4 | FastHex64ToString(footer->table_magic_number()), |
284 | 4 | FastHex64ToString(enforce_table_magic_number)); |
285 | 4 | } |
286 | 106k | return Status::OK(); |
287 | 106k | } |
288 | 106k | RandomAccessFileReader* file; |
289 | 106k | Footer* const footer; |
290 | 106k | const uint64_t enforce_table_magic_number; |
291 | 106k | } validator(file, footer, enforce_table_magic_number); |
292 | | |
293 | 106k | return file->ReadAndValidate(read_offset, read_size, &footer_input, footer_space, validator); |
294 | 106k | } |
295 | | |
296 | | // Without anonymous namespace here, we fail the warning -Wmissing-prototypes |
297 | | namespace { |
298 | | |
299 | | struct ChecksumData { |
300 | | uint32_t expected; |
301 | | uint32_t actual; |
302 | | }; |
303 | | |
304 | | Result<ChecksumData> ComputeChecksum( |
305 | | RandomAccessFileReader* file, |
306 | | const Footer& footer, |
307 | | const BlockHandle& handle, |
308 | | const Slice& src_data, |
309 | 3.74M | uint32_t raw_expected_checksum) { |
310 | 3.74M | switch (footer.checksum()) { |
311 | 3.73M | case kCRC32c: |
312 | 3.73M | return ChecksumData { |
313 | 3.73M | .expected = crc32c::Unmask(raw_expected_checksum), |
314 | 3.73M | .actual = crc32c::Value(src_data.data(), src_data.size()) |
315 | 3.73M | }; |
316 | 11.2k | case kxxHash: |
317 | 11.2k | if (yb::std_util::cmp_greater(src_data.size(), std::numeric_limits<int>::max())) { |
318 | 0 | return STATUS_FORMAT( |
319 | 0 | Corruption, "Block too large for xxHash ($0 bytes, but must be $1 or smaller)", |
320 | 0 | src_data.size(), std::numeric_limits<int>::max()); |
321 | 0 | } |
322 | 11.2k | return ChecksumData { |
323 | 11.2k | .expected = raw_expected_checksum, |
324 | 11.2k | .actual = XXH32(src_data.data(), static_cast<int>(src_data.size()), 0) |
325 | 11.2k | }; |
326 | 0 | case kNoChecksum: |
327 | 0 | return ChecksumData { |
328 | 0 | .expected = raw_expected_checksum, |
329 | 0 | .actual = raw_expected_checksum |
330 | 0 | }; |
331 | 0 | } |
332 | 0 | return STATUS_FORMAT( |
333 | 0 | Corruption, "Unknown checksum type in file: $0, block handle: $1", |
334 | 0 | file->file()->filename(), handle.ToDebugString()); |
335 | 0 | } |
336 | | |
337 | | Status VerifyBlockChecksum( |
338 | | RandomAccessFileReader* file, |
339 | | const Footer& footer, |
340 | | const BlockHandle& handle, |
341 | | const char* data, |
342 | 3.74M | const size_t block_size) { |
343 | 3.74M | PERF_TIMER_GUARD(block_checksum_time); |
344 | 3.74M | const uint32_t raw_expected_checksum = DecodeFixed32(data + block_size + 1); |
345 | 3.74M | auto checksum = VERIFY_RESULT( |
346 | 3.74M | ComputeChecksum(file, footer, handle, Slice(data, block_size + 1), raw_expected_checksum)); |
347 | 3.74M | if (checksum.actual != checksum.expected) { |
348 | 26 | return STATUS_FORMAT( |
349 | 26 | Corruption, "Block checksum mismatch in file: $0, block handle: $1", |
350 | 26 | file->file()->filename(), handle.ToDebugString()); |
351 | 26 | } |
352 | 3.74M | return Status::OK(); |
353 | 3.74M | } |
354 | | |
355 | | // Read a block and, if options.verify_checksums is set, verify its checksum. When this function |
356 | | // returns, *contents will contain the result of reading. |
357 | | Status ReadBlock( |
358 | | RandomAccessFileReader* file, const Footer& footer, const ReadOptions& options, |
359 | 3.87M | const BlockHandle& handle, Slice* contents, /* result of reading */ char* buf) { |
360 | 3.87M | *contents = Slice(buf, buf); |
361 | 3.87M | const size_t expected_read_size = static_cast<size_t>(handle.size()) + kBlockTrailerSize; |
362 | 3.87M | Status s; |
363 | 3.87M | { |
364 | 3.87M | PERF_TIMER_GUARD(block_read_time); |
365 | 3.87M | struct BlockChecksumValidator : public yb::ReadValidator { |
366 | 3.87M | BlockChecksumValidator( |
367 | 3.87M | RandomAccessFileReader* file_, const Footer& footer_, const ReadOptions& options_, |
368 | 3.87M | const BlockHandle& handle_, size_t expected_read_size_) |
369 | 3.87M | : file(file_), |
370 | 3.87M | footer(footer_), |
371 | 3.87M | options(options_), |
372 | 3.87M | handle(handle_), |
373 | 3.87M | expected_read_size(expected_read_size_) {} |
374 | | |
375 | 3.87M | CHECKED_STATUS Validate(const Slice& read_result) const override { |
376 | 3.87M | if (read_result.size() != expected_read_size) { |
377 | 0 | return STATUS_FORMAT( |
378 | 0 | Corruption, "Truncated block read in file: $0, block handle: $1, expected size: $2", |
379 | 0 | file->file()->filename(), handle.ToDebugString(), expected_read_size); |
380 | 0 | } |
381 | | |
382 | 3.87M | if (options.verify_checksums) { |
383 | 3.74M | return VerifyBlockChecksum(file, footer, handle, read_result.cdata(), handle.size()); |
384 | 3.74M | } |
385 | 136k | return Status::OK(); |
386 | 136k | }; |
387 | | |
388 | 3.87M | RandomAccessFileReader* file; |
389 | 3.87M | const Footer& footer; |
390 | 3.87M | const ReadOptions& options; |
391 | 3.87M | const BlockHandle& handle; |
392 | 3.87M | const size_t expected_read_size; |
393 | 3.87M | } validator(file, footer, options, handle, expected_read_size); |
394 | | |
395 | 3.87M | s = file->ReadAndValidate(handle.offset(), expected_read_size, contents, buf, validator); |
396 | 3.87M | } |
397 | | |
398 | 3.87M | PERF_COUNTER_ADD(block_read_count, 1); |
399 | 3.87M | PERF_COUNTER_ADD(block_read_byte, expected_read_size); |
400 | | |
401 | 3.87M | return s; |
402 | 3.87M | } |
403 | | |
404 | | } // namespace |
405 | | |
406 | | TrackedAllocation::TrackedAllocation() |
407 | 6.04M | : size_(0) { |
408 | 6.04M | } |
409 | | |
410 | | TrackedAllocation::TrackedAllocation( |
411 | | std::unique_ptr<char[]>&& data, size_t size, yb::MemTrackerPtr mem_tracker) |
412 | 3.88M | : holder_(std::move(data)), size_(size), mem_tracker_(std::move(mem_tracker)) { |
413 | 3.88M | if (holder_ && mem_tracker_) { |
414 | 247k | mem_tracker_->Consume(size_); |
415 | 247k | } |
416 | 3.88M | } |
417 | | |
418 | 13.6M | TrackedAllocation::~TrackedAllocation() { |
419 | 13.6M | if (holder_ && mem_tracker_) { |
420 | 103k | mem_tracker_->Release(size_); |
421 | 103k | } |
422 | 13.6M | } |
423 | | |
424 | 3.89M | TrackedAllocation& TrackedAllocation::operator=(TrackedAllocation&& other) { |
425 | 3.89M | if (holder_ && mem_tracker_) { |
426 | 0 | mem_tracker_->Release(size_); |
427 | 0 | } |
428 | | |
429 | 3.89M | holder_ = std::move(other.holder_); |
430 | 3.89M | size_ = other.size_; |
431 | 3.89M | mem_tracker_ = std::move(other.mem_tracker_); |
432 | | |
433 | 3.89M | return *this; |
434 | 3.89M | } |
435 | | |
436 | | BlockContents::BlockContents( |
437 | | std::unique_ptr<char[]>&& _data, size_t _size, bool _cachable, |
438 | | CompressionType _compression_type, yb::MemTrackerPtr mem_tracker) |
439 | | : data(_data.get(), _size), |
440 | | cachable(_cachable), |
441 | | compression_type(_compression_type), |
442 | 3.88M | allocation(std::move(_data), _size, std::move(mem_tracker)) { |
443 | 3.88M | } |
444 | | |
445 | | Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, |
446 | | const ReadOptions& options, const BlockHandle& handle, |
447 | | BlockContents* contents, Env* env, |
448 | 3.87M | const yb::MemTrackerPtr& mem_tracker, bool decompression_requested) { |
449 | 3.87M | Status status; |
450 | 3.87M | Slice slice; |
451 | 3.87M | size_t n = static_cast<size_t>(handle.size()); |
452 | 3.87M | std::unique_ptr<char[]> heap_buf; |
453 | 3.87M | char stack_buf[DefaultStackBufferSize]; |
454 | 3.87M | char* used_buf = nullptr; |
455 | 3.87M | rocksdb::CompressionType compression_type; |
456 | | |
457 | 3.87M | if (decompression_requested && |
458 | 3.72M | n + kBlockTrailerSize < DefaultStackBufferSize) { |
459 | | // If we've got a small enough hunk of data, read it in to the |
460 | | // trivially allocated stack buffer instead of needing a full malloc() |
461 | 3.37M | used_buf = &stack_buf[0]; |
462 | 499k | } else { |
463 | 499k | heap_buf = std::unique_ptr<char[]>(new char[n + kBlockTrailerSize]); |
464 | 499k | used_buf = heap_buf.get(); |
465 | 499k | } |
466 | | |
467 | 3.87M | status = ReadBlock(file, footer, options, handle, &slice, used_buf); |
468 | | |
469 | 3.87M | if (!status.ok()) { |
470 | 2 | LOG(ERROR) << __func__ << ": " << status << "\n" << yb::GetStackTrace(); |
471 | 2 | return status; |
472 | 2 | } |
473 | | |
474 | 3.87M | PERF_TIMER_GUARD(block_decompress_time); |
475 | | |
476 | 3.87M | compression_type = static_cast<rocksdb::CompressionType>(slice.data()[n]); |
477 | | |
478 | 3.87M | if (decompression_requested && compression_type != kNoCompression) { |
479 | 1.39M | return UncompressBlockContents(slice.cdata(), n, contents, footer.version(), mem_tracker); |
480 | 1.39M | } |
481 | | |
482 | 2.48M | if (slice.cdata() != used_buf) { |
483 | 15.6k | *contents = BlockContents(Slice(slice.data(), n), false, compression_type); |
484 | 15.6k | return status; |
485 | 15.6k | } |
486 | | |
487 | 2.46M | if (used_buf == &stack_buf[0]) { |
488 | 2.19M | heap_buf = std::unique_ptr<char[]>(new char[n]); |
489 | 2.19M | memcpy(heap_buf.get(), stack_buf, n); |
490 | 2.19M | } |
491 | | |
492 | 2.46M | *contents = BlockContents(std::move(heap_buf), n, true, compression_type, mem_tracker); |
493 | 2.46M | return status; |
494 | 2.46M | } |
495 | | |
496 | | // |
497 | | // The 'data' points to the raw block contents that were read in from file. |
498 | | // This method allocates a new heap buffer and the raw block |
499 | | // contents are uncompressed into this buffer. This |
500 | | // buffer is returned via 'contents' and it is up to the caller to |
501 | | // free this buffer. |
502 | | // format_version is the block format as defined in include/rocksdb/table.h |
503 | | Status UncompressBlockContents(const char* data, size_t n, |
504 | | BlockContents* contents, |
505 | | uint32_t format_version, |
506 | 1.40M | const std::shared_ptr<yb::MemTracker>& mem_tracker) { |
507 | 1.40M | std::unique_ptr<char[]> ubuf; |
508 | 1.40M | int decompress_size = 0; |
509 | 1.40M | assert(data[n] != kNoCompression); |
510 | 1.40M | switch (data[n]) { |
511 | 1.35M | case kSnappyCompression: { |
512 | 1.35M | size_t ulength = 0; |
513 | 1.35M | static char snappy_corrupt_msg[] = |
514 | 1.35M | "Snappy not supported or corrupted Snappy compressed block contents"; |
515 | 1.35M | if (!Snappy_GetUncompressedLength(data, n, &ulength)) { |
516 | 0 | return STATUS(Corruption, snappy_corrupt_msg); |
517 | 0 | } |
518 | 1.35M | ubuf = std::unique_ptr<char[]>(new char[ulength]); |
519 | 1.35M | if (!Snappy_Uncompress(data, n, ubuf.get())) { |
520 | 0 | return STATUS(Corruption, snappy_corrupt_msg); |
521 | 0 | } |
522 | 1.35M | *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression, mem_tracker); |
523 | 1.35M | break; |
524 | 1.35M | } |
525 | 16.7k | case kZlibCompression: |
526 | 16.7k | ubuf = std::unique_ptr<char[]>(Zlib_Uncompress( |
527 | 16.7k | data, n, &decompress_size, |
528 | 16.7k | GetCompressFormatForVersion(kZlibCompression, format_version))); |
529 | 16.7k | if (!ubuf) { |
530 | 0 | static char zlib_corrupt_msg[] = |
531 | 0 | "Zlib not supported or corrupted Zlib compressed block contents"; |
532 | 0 | return STATUS(Corruption, zlib_corrupt_msg); |
533 | 0 | } |
534 | 16.7k | *contents = |
535 | 16.7k | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression, mem_tracker); |
536 | 16.7k | break; |
537 | 0 | case kBZip2Compression: |
538 | 0 | ubuf = std::unique_ptr<char[]>(BZip2_Uncompress( |
539 | 0 | data, n, &decompress_size, |
540 | 0 | GetCompressFormatForVersion(kBZip2Compression, format_version))); |
541 | 0 | if (!ubuf) { |
542 | 0 | static char bzip2_corrupt_msg[] = |
543 | 0 | "Bzip2 not supported or corrupted Bzip2 compressed block contents"; |
544 | 0 | return STATUS(Corruption, bzip2_corrupt_msg); |
545 | 0 | } |
546 | 0 | *contents = |
547 | 0 | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression, mem_tracker); |
548 | 0 | break; |
549 | 13.4k | case kLZ4Compression: |
550 | 13.4k | ubuf = std::unique_ptr<char[]>(LZ4_Uncompress( |
551 | 13.4k | data, n, &decompress_size, |
552 | 13.4k | GetCompressFormatForVersion(kLZ4Compression, format_version))); |
553 | 13.4k | if (!ubuf) { |
554 | 0 | static char lz4_corrupt_msg[] = |
555 | 0 | "LZ4 not supported or corrupted LZ4 compressed block contents"; |
556 | 0 | return STATUS(Corruption, lz4_corrupt_msg); |
557 | 0 | } |
558 | 13.4k | *contents = |
559 | 13.4k | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression, mem_tracker); |
560 | 13.4k | break; |
561 | 13.4k | case kLZ4HCCompression: |
562 | 13.4k | ubuf = std::unique_ptr<char[]>(LZ4_Uncompress( |
563 | 13.4k | data, n, &decompress_size, |
564 | 13.4k | GetCompressFormatForVersion(kLZ4HCCompression, format_version))); |
565 | 13.4k | if (!ubuf) { |
566 | 0 | static char lz4hc_corrupt_msg[] = |
567 | 0 | "LZ4HC not supported or corrupted LZ4HC compressed block contents"; |
568 | 0 | return STATUS(Corruption, lz4hc_corrupt_msg); |
569 | 0 | } |
570 | 13.4k | *contents = |
571 | 13.4k | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression, mem_tracker); |
572 | 13.4k | break; |
573 | 0 | case kZSTDNotFinalCompression: |
574 | 0 | ubuf = |
575 | 0 | std::unique_ptr<char[]>(ZSTD_Uncompress(data, n, &decompress_size)); |
576 | 0 | if (!ubuf) { |
577 | 0 | static char zstd_corrupt_msg[] = |
578 | 0 | "ZSTD not supported or corrupted ZSTD compressed block contents"; |
579 | 0 | return STATUS(Corruption, zstd_corrupt_msg); |
580 | 0 | } |
581 | 0 | *contents = |
582 | 0 | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression, mem_tracker); |
583 | 0 | break; |
584 | 0 | default: |
585 | 0 | return STATUS(Corruption, "bad block type"); |
586 | 1.40M | } |
587 | 1.40M | return Status::OK(); |
588 | 1.40M | } |
589 | | |
590 | | } // namespace rocksdb |