/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block_prefix_index.cc

Source (jump to first uncovered line)
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//

#include "yb/rocksdb/table/block_prefix_index.h"

#include <vector>

#include "yb/util/slice.h"
#include "yb/rocksdb/slice_transform.h"
#include "yb/rocksdb/util/arena.h"
#include "yb/rocksdb/util/coding.h"
#include "yb/rocksdb/util/hash.h"

namespace rocksdb {

inline uint32_t Hash(const Slice& s) {
  return rocksdb::Hash(s.data(), s.size(), 0);
}

inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
  return Hash(prefix) % num_buckets;
}

// The prefix block index is simply a bucket array, with each entry pointing to
// the blocks that span the prefixes hashed to this bucket.
//
// To reduce memory footprint, if there is only one block per bucket, the entry
// stores the block id directly. If there are more than one blocks per bucket,
// because of hash collision or a single prefix spanning multiple blocks,
// the entry points to an array of block ids. The block array is an array of
// uint32_t's. The first uint32_t indicates the total number of blocks, followed
// by the block ids.
//
// To differentiate the two cases, the high order bit of the entry indicates
// whether it is a 'pointer' into a separate block array.
// 0x7FFFFFFF is reserved for empty bucket.

const uint32_t kNoneBlock = 0x7FFFFFFF;
const uint32_t kBlockArrayMask = 0x80000000;

inline bool IsNone(uint32_t block_id) {
  return block_id == kNoneBlock;
}

inline bool IsBlockId(uint32_t block_id) {
  return (block_id & kBlockArrayMask) == 0;
}

inline uint32_t DecodeIndex(uint32_t block_id) {
  uint32_t index = block_id ^ kBlockArrayMask;
  assert(index < kBlockArrayMask);
  return index;
}

inline uint32_t EncodeIndex(uint32_t index) {
  assert(index < kBlockArrayMask);
  return index | kBlockArrayMask;
}

// temporary storage for prefix information during index building
struct PrefixRecord {
  Slice prefix;
  uint32_t start_block;
  uint32_t end_block;
  uint32_t num_blocks;
  PrefixRecord* next;
};

class BlockPrefixIndex::Builder {
 public:
  explicit Builder(const SliceTransform* internal_prefix_extractor)
      : internal_prefix_extractor_(internal_prefix_extractor) {}

  void Add(const Slice& key_prefix, uint32_t start_block,
           uint32_t num_blocks) {
    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
      arena_.AllocateAligned(sizeof(PrefixRecord)));
    record->prefix = key_prefix;
    record->start_block = start_block;
    record->end_block = start_block + num_blocks - 1;
    record->num_blocks = num_blocks;
    prefixes_.push_back(record);
  }

  BlockPrefixIndex* Finish() {
    // For now, use roughly 1:1 prefix to bucket ratio.
    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;

    // Collect prefix records that hash to the same bucket, into a single
    // linklist.
    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
    for (PrefixRecord* current : prefixes_) {
      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
      // merge the prefix block span if the first block of this prefix is
      // connected to the last block of the previous prefix.
      PrefixRecord* prev = prefixes_per_bucket[bucket];
      if (prev) {
        assert(current->start_block >= prev->end_block);
        auto distance = current->start_block - prev->end_block;
        if (distance <= 1) {
          prev->end_block = current->end_block;
          prev->num_blocks = prev->end_block - prev->start_block + 1;
          num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
          continue;
        }
      }
      current->next = prev;
      prefixes_per_bucket[bucket] = current;
      num_blocks_per_bucket[bucket] += current->num_blocks;
    }

    // Calculate the block array buffer size
    uint32_t total_block_array_entries = 0;
    for (uint32_t i = 0; i < num_buckets; i++437k) {
      uint32_t num_blocks = num_blocks_per_bucket[i];
      if (num_blocks > 1) {
        total_block_array_entries += (num_blocks + 1);
      }
    }

    // Populate the final prefix block index
    uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
    uint32_t* buckets = new uint32_t[num_buckets];
    uint32_t offset = 0;
    for (uint32_t i = 0; i < num_buckets; i++436k) {
      uint32_t num_blocks = num_blocks_per_bucket[i];
      if (num_blocks == 0) {
        assert(prefixes_per_bucket[i] == nullptr);
        buckets[i] = kNoneBlock;
      } else if (num_blocks == 1) {
        assert(prefixes_per_bucket[i] != nullptr);
        assert(prefixes_per_bucket[i]->next == nullptr);
        buckets[i] = prefixes_per_bucket[i]->start_block;
      } else {
        assert(prefixes_per_bucket[i] != nullptr);
        buckets[i] = EncodeIndex(offset);
        block_array_buffer[offset] = num_blocks;
        uint32_t* last_block = &block_array_buffer[offset + num_blocks];
        auto current = prefixes_per_bucket[i];
        // populate block ids from largest to smallest
        while (current != nullptr) {
          for (uint32_t iter = 0; iter < current->num_blocks; iter++245k) {
            *last_block = current->end_block - iter;
            last_block--;
          }
          current = current->next;
        }
        assert(last_block == &block_array_buffer[offset]);
        offset += (num_blocks + 1);
      }
    }

    assert(offset == total_block_array_entries);

    return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
                                buckets, total_block_array_entries,
                                block_array_buffer);
  }

 private:
  const SliceTransform* internal_prefix_extractor_;

  std::vector<PrefixRecord*> prefixes_;
  Arena arena_;
};


Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
                                const Slice& prefixes, const Slice& prefix_meta,
                                BlockPrefixIndex** prefix_index) {
  uint64_t pos = 0;
  auto meta_pos = prefix_meta;
  Status s;
  Builder builder(internal_prefix_extractor);

  while (!meta_pos.empty()) {
    uint32_t prefix_size = 0;
    uint32_t entry_index = 0;
    uint32_t num_blocks = 0;
    if (!GetVarint32(&meta_pos, &prefix_size) ||
        !GetVarint32(&meta_pos, &entry_index)429k ||
        !GetVarint32(&meta_pos, &num_blocks)428k) {
      s = STATUS(Corruption,
          "Corrupted prefix meta block: unable to read from it.");
      break;
    }
    if (pos + prefix_size > prefixes.size()) {
      s = STATUS(Corruption,
        "Corrupted prefix meta block: size inconsistency.");
      break;
    }
    Slice prefix(prefixes.data() + pos, prefix_size);
    builder.Add(prefix, entry_index, num_blocks);

    pos += prefix_size;
  }

  if (s.ok() && pos != prefixes.size()1.66k) {
    s = STATUS(Corruption, "Corrupted prefix meta block");
  }

  if (s.ok()) {
    *prefix_index = builder.Finish();
  }

  return s;
}

uint32_t BlockPrefixIndex::GetBlocks(const Slice& key,
                                     uint32_t** blocks) {
  Slice prefix = internal_prefix_extractor_->Transform(key);

  uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
  uint32_t block_id = buckets_[bucket];

  if (IsNone(block_id)) {
    return 0;
  } else if (IsBlockId(block_id)) {
    *blocks = &buckets_[bucket];
    return 1;
  } else {
    uint32_t index = DecodeIndex(block_id);
    assert(index < num_block_array_buffer_entries_);
    *blocks = &block_array_buffer_[index+1];
    uint32_t num_blocks = block_array_buffer_[index];
    assert(num_blocks > 1);
    assert(index + num_blocks < num_block_array_buffer_entries_);
    return num_blocks;
  }
}

}  // namespace rocksdb

YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

Line	Count	Source (jump to first uncovered line)
1		// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2		// This source code is licensed under the BSD-style license found in the
3		// LICENSE file in the root directory of this source tree. An additional grant
4		// of patent rights can be found in the PATENTS file in the same directory.
5		//
6		// The following only applies to changes made to this file as part of YugaByte development.
7		//
8		// Portions Copyright (c) YugaByte, Inc.
9		//
10		// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11		// in compliance with the License. You may obtain a copy of the License at
12		//
13		// http://www.apache.org/licenses/LICENSE-2.0
14		//
15		// Unless required by applicable law or agreed to in writing, software distributed under the License
16		// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17		// or implied. See the License for the specific language governing permissions and limitations
18		// under the License.
19		//
20
21		#include "yb/rocksdb/table/block_prefix_index.h"
22
23		#include <vector>
24
25		#include "yb/util/slice.h"
26		#include "yb/rocksdb/slice_transform.h"
27		#include "yb/rocksdb/util/arena.h"
28		#include "yb/rocksdb/util/coding.h"
29		#include "yb/rocksdb/util/hash.h"
30
31		namespace rocksdb {
32
33	531k	inline uint32_t Hash(const Slice& s) {
34	531k	return rocksdb::Hash(s.data(), s.size(), 0);
35	531k	}
36
37	530k	inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
38	530k	return Hash(prefix) % num_buckets;
39	530k	}
40
41		// The prefix block index is simply a bucket array, with each entry pointing to
42		// the blocks that span the prefixes hashed to this bucket.
43		//
44		// To reduce memory footprint, if there is only one block per bucket, the entry
45		// stores the block id directly. If there are more than one blocks per bucket,
46		// because of hash collision or a single prefix spanning multiple blocks,
47		// the entry points to an array of block ids. The block array is an array of
48		// uint32_t's. The first uint32_t indicates the total number of blocks, followed
49		// by the block ids.
50		//
51		// To differentiate the two cases, the high order bit of the entry indicates
52		// whether it is a 'pointer' into a separate block array.
53		// 0x7FFFFFFF is reserved for empty bucket.
54
55		const uint32_t kNoneBlock = 0x7FFFFFFF;
56		const uint32_t kBlockArrayMask = 0x80000000;
57
58	101k	inline bool IsNone(uint32_t block_id) {
59	101k	return block_id == kNoneBlock;
60	101k	}
61
62	99.5k	inline bool IsBlockId(uint32_t block_id) {
63	99.5k	return (block_id & kBlockArrayMask) == 0;
64	99.5k	}
65
66	76.5k	inline uint32_t DecodeIndex(uint32_t block_id) {
67	76.5k	uint32_t index = block_id ^ kBlockArrayMask;
68	76.5k	assert(index < kBlockArrayMask);
69	0	return index;
70	76.5k	}
71
72	101k	inline uint32_t EncodeIndex(uint32_t index) {
73	101k	assert(index < kBlockArrayMask);
74	0	return index \| kBlockArrayMask;
75	101k	}
76
77		// temporary storage for prefix information during index building
78		struct PrefixRecord {
79		Slice prefix;
80		uint32_t start_block;
81		uint32_t end_block;
82		uint32_t num_blocks;
83		PrefixRecord* next;
84		};
85
86		class BlockPrefixIndex::Builder {
87		public:
88		explicit Builder(const SliceTransform* internal_prefix_extractor)
89	1.66k	: internal_prefix_extractor_(internal_prefix_extractor) {}
90
91		void Add(const Slice& key_prefix, uint32_t start_block,
92	430k	uint32_t num_blocks) {
93	430k	PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
94	430k	arena_.AllocateAligned(sizeof(PrefixRecord)));
95	430k	record->prefix = key_prefix;
96	430k	record->start_block = start_block;
97	430k	record->end_block = start_block + num_blocks - 1;
98	430k	record->num_blocks = num_blocks;
99	430k	prefixes_.push_back(record);
100	430k	}
101
102	1.66k	BlockPrefixIndex* Finish() {
103		// For now, use roughly 1:1 prefix to bucket ratio.
104	1.66k	uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
105
106		// Collect prefix records that hash to the same bucket, into a single
107		// linklist.
108	1.66k	std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
109	1.66k	std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
110	429k	for (PrefixRecord* current : prefixes_) {
111	429k	uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
112		// merge the prefix block span if the first block of this prefix is
113		// connected to the last block of the previous prefix.
114	429k	PrefixRecord* prev = prefixes_per_bucket[bucket];
115	429k	if (prev) {
116	158k	assert(current->start_block >= prev->end_block);
117	0	auto distance = current->start_block - prev->end_block;
118	158k	if (distance <= 1) {
119	46.3k	prev->end_block = current->end_block;
120	46.3k	prev->num_blocks = prev->end_block - prev->start_block + 1;
121	46.3k	num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
122	46.3k	continue;
123	46.3k	}
124	158k	}
125	383k	current->next = prev;
126	383k	prefixes_per_bucket[bucket] = current;
127	383k	num_blocks_per_bucket[bucket] += current->num_blocks;
128	383k	}
129
130		// Calculate the block array buffer size
131	1.66k	uint32_t total_block_array_entries = 0;
132	439k	for (uint32_t i = 0; i < num_buckets; i++437k ) {
133	437k	uint32_t num_blocks = num_blocks_per_bucket[i];
134	437k	if (num_blocks > 1) {
135	101k	total_block_array_entries += (num_blocks + 1);
136	101k	}
137	437k	}
138
139		// Populate the final prefix block index
140	1.66k	uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
141	1.66k	uint32_t* buckets = new uint32_t[num_buckets];
142	1.66k	uint32_t offset = 0;
143	438k	for (uint32_t i = 0; i < num_buckets; i++436k ) {
144	436k	uint32_t num_blocks = num_blocks_per_bucket[i];
145	436k	if (num_blocks == 0) {
146	160k	assert(prefixes_per_bucket[i] == nullptr);
147	0	buckets[i] = kNoneBlock;
148	276k	} else if (num_blocks == 1) {
149	174k	assert(prefixes_per_bucket[i] != nullptr);
150	0	assert(prefixes_per_bucket[i]->next == nullptr);
151	0	buckets[i] = prefixes_per_bucket[i]->start_block;
152	174k	} else {
153	101k	assert(prefixes_per_bucket[i] != nullptr);
154	0	buckets[i] = EncodeIndex(offset);
155	101k	block_array_buffer[offset] = num_blocks;
156	101k	uint32_t* last_block = &block_array_buffer[offset + num_blocks];
157	101k	auto current = prefixes_per_bucket[i];
158		// populate block ids from largest to smallest
159	314k	while (current != nullptr) {
160	459k	for (uint32_t iter = 0; iter < current->num_blocks; iter++245k ) {
161	245k	*last_block = current->end_block - iter;
162	245k	last_block--;
163	245k	}
164	213k	current = current->next;
165	213k	}
166	101k	assert(last_block == &block_array_buffer[offset]);
167	0	offset += (num_blocks + 1);
168	101k	}
169	436k	}
170
171	1.66k	assert(offset == total_block_array_entries);
172
173	0	return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
174	1.66k	buckets, total_block_array_entries,
175	1.66k	block_array_buffer);
176	1.66k	}
177
178		private:
179		const SliceTransform* internal_prefix_extractor_;
180
181		std::vector<PrefixRecord*> prefixes_;
182		Arena arena_;
183		};
184
185
186		Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
187		const Slice& prefixes, const Slice& prefix_meta,
188	1.66k	BlockPrefixIndex** prefix_index) {
189	1.66k	uint64_t pos = 0;
190	1.66k	auto meta_pos = prefix_meta;
191	1.66k	Status s;
192	1.66k	Builder builder(internal_prefix_extractor);
193
194	434k	while (!meta_pos.empty()) {
195	432k	uint32_t prefix_size = 0;
196	432k	uint32_t entry_index = 0;
197	432k	uint32_t num_blocks = 0;
198	432k	if (!GetVarint32(&meta_pos, &prefix_size) \|\|
199	432k	!GetVarint32(&meta_pos, &entry_index)429k \|\|
200	432k	!GetVarint32(&meta_pos, &num_blocks)428k ) {
201	0	s = STATUS(Corruption,
202	0	"Corrupted prefix meta block: unable to read from it.");
203	0	break;
204	0	}
205	432k	if (pos + prefix_size > prefixes.size()) {
206	0	s = STATUS(Corruption,
207	0	"Corrupted prefix meta block: size inconsistency.");
208	0	break;
209	0	}
210	432k	Slice prefix(prefixes.data() + pos, prefix_size);
211	432k	builder.Add(prefix, entry_index, num_blocks);
212
213	432k	pos += prefix_size;
214	432k	}
215
216	1.66k	if (s.ok() && pos != prefixes.size()1.66k ) {
217	0	s = STATUS(Corruption, "Corrupted prefix meta block");
218	0	}
219
220	1.66k	if (s.ok()) {
221	1.66k	*prefix_index = builder.Finish();
222	1.66k	}
223
224	1.66k	return s;
225	1.66k	}
226
227		uint32_t BlockPrefixIndex::GetBlocks(const Slice& key,
228	101k	uint32_t** blocks) {
229	101k	Slice prefix = internal_prefix_extractor_->Transform(key);
230
231	101k	uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
232	101k	uint32_t block_id = buckets_[bucket];
233
234	101k	if (IsNone(block_id)) {
235	2.17k	return 0;
236	99.5k	} else if (IsBlockId(block_id)) {
237	23.0k	*blocks = &buckets_[bucket];
238	23.0k	return 1;
239	76.4k	} else {
240	76.4k	uint32_t index = DecodeIndex(block_id);
241	76.4k	assert(index < num_block_array_buffer_entries_);
242	0	*blocks = &block_array_buffer_[index+1];
243	76.4k	uint32_t num_blocks = block_array_buffer_[index];
244	76.4k	assert(num_blocks > 1);
245	0	assert(index + num_blocks < num_block_array_buffer_entries_);
246	0	return num_blocks;
247	76.4k	}
248	101k	}
249
250		} // namespace rocksdb