YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/common/partition.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
#ifndef YB_COMMON_PARTITION_H
33
#define YB_COMMON_PARTITION_H
34
35
#include <algorithm>
36
#include <string>
37
#include <vector>
38
39
#include <boost/optional/optional.hpp>
40
41
#include <google/protobuf/repeated_field.h>
42
43
#include "yb/common/common_fwd.h"
44
#include "yb/common/column_id.h"
45
#include "yb/common/partial_row.h"
46
47
#include "yb/util/status_fwd.h"
48
#include "yb/util/memory/arena_fwd.h"
49
50
namespace yb {
51
52
class ConstContiguousRow;
53
class YBPartialRow;
54
class PartitionSchemaPB;
55
class TypeInfo;
56
57
enum class YBHashSchema {
58
  kMultiColumnHash = 1, // YQL default hashing.
59
  kRedisHash = 2, // Redis default hashing.
60
  kPgsqlHash = 3 // PGSQL default hashing.
61
};
62
63
// A Partition describes the set of rows that a Tablet is responsible for
64
// serving. Each tablet is assigned a single Partition.
65
//
66
// Partitions consist primarily of a start and end partition key. Every row with
67
// a partition key that falls in a Tablet's Partition will be served by that
68
// tablet.
69
//
70
// In addition to the start and end partition keys, a Partition holds metadata
71
// to determine if a scan can prune, or skip, a partition based on the scan's
72
// start and end primary keys, and predicates.
73
class Partition {
74
 public:
75
76
1.68M
  const std::vector<int32_t>& hash_buckets() const {
77
1.68M
    return hash_buckets_;
78
1.68M
  }
79
80
  Slice range_key_start() const;
81
82
  Slice range_key_end() const;
83
84
26.5M
  const std::string& partition_key_start() const {
85
26.5M
    return partition_key_start_;
86
26.5M
  }
87
88
73.5M
  const std::string& partition_key_end() const {
89
73.5M
    return partition_key_end_;
90
73.5M
  }
91
92
75
  void set_partition_key_start(const std::string& partition_key_start) {
93
75
    partition_key_start_ = partition_key_start;
94
75
  }
95
96
75
  void set_partition_key_end(const std::string& partition_key_end) {
97
75
    partition_key_end_ = partition_key_end;
98
75
  }
99
100
  // Serializes a partition into a protobuf message.
101
  void ToPB(PartitionPB* pb) const;
102
103
  // Deserializes a protobuf message into a partition.
104
  //
105
  // The protobuf message is not validated, since partitions are only expected
106
  // to be created by the master process.
107
  static void FromPB(const PartitionPB& pb, Partition* partition);
108
109
23.3M
  bool ContainsKey(const std::string& partition_key) const {
110
23.3M
    return partition_key >= partition_key_start() &&
111
23.3M
        (partition_key_end().empty() || 
partition_key < partition_key_end()14.1M
);
112
23.3M
  }
113
114
  // Checks if this partition is a superset or is exactly the same as another partition.
115
  template <class T>
116
  bool ContainsPartition(const T& other) const {
117
    return other.partition_key_start() >= partition_key_start() &&
118
           (partition_key_end().empty() || (!other.partition_key_end().empty() &&
119
                                            other.partition_key_end() <= partition_key_end()));
120
  }
121
122
  // Checks if this and another partitions have equal bounds.
123
  template <class T>
124
  bool BoundsEqualToPartition(const T& other) const {
125
    return partition_key_start() == other.partition_key_start() &&
126
           partition_key_end() == other.partition_key_end();
127
  }
128
129
  // Checks if this partition is a strict superset of another partition (shouldn't be exactly the
130
  // same).
131
  template <class T>
132
  bool ContainsPartitionStrict(const T& other) const {
133
    return ContainsPartition(other) && !BoundsEqualToPartition(other);
134
  }
135
136
  std::string ToString() const;
137
138
 private:
139
  friend class PartitionSchema;
140
141
  // Helper function for accessing the range key portion of a partition key.
142
  Slice range_key(const std::string& partition_key) const;
143
144
  std::vector<int32_t> hash_buckets_;
145
146
  std::string partition_key_start_;
147
  std::string partition_key_end_;
148
};
149
150
// A partition schema describes how the rows of a table are distributed among
151
// tablets.
152
//
153
// Primarily, a table's partition schema is responsible for translating the
154
// primary key column values of a row into a partition key that can be used to
155
// determine the tablet containing the key.
156
//
157
// The partition schema is made up of zero or more hash bucket components,
158
// followed by a single range component.
159
//
160
// Each hash bucket component includes one or more columns from the primary key
161
// column set, with the restriction that an individual primary key column may
162
// only be included in a single hash component.
163
//
164
// To determine the hash bucket of an individual row, the values of the columns
165
// of the hash component are encoded into bytes (in PK or lexicographic
166
// preserving encoding), then hashed into a u64, then modded into an i32. When
167
// constructing a partition key from a row, the buckets of the row are simply
168
// encoded into the partition key in order (again in PK or lexicographic
169
// preserving encoding).
170
//
171
// The range component contains a (possibly full or empty) subset of the primary
172
// key columns. When encoding the partition key, the columns of the partition
173
// component are encoded in order.
174
//
175
// The above is true of the relationship between rows and partition keys. It
176
// gets trickier with partitions (tablet partition key boundaries), because the
177
// boundaries of tablets do not necessarily align to rows. For instance,
178
// currently the absolute-start and absolute-end primary keys of a table
179
// represented as an empty key, but do not have a corresponding row. Partitions
180
// are similar, but instead of having just one absolute-start and absolute-end,
181
// each component of a partition schema has an absolute-start and absolute-end.
182
// When creating the initial set of partitions during table creation, we deal
183
// with this by "carrying through" absolute-start or absolute-ends into lower
184
// significance components.
185
class PartitionSchema {
186
 public:
187
188
  static constexpr int32_t kPartitionKeySize = 2;
189
  static constexpr int32_t kMaxPartitionKey = std::numeric_limits<uint16_t>::max();
190
191
  // Deserializes a protobuf message into a partition schema.
192
  static CHECKED_STATUS FromPB(const PartitionSchemaPB& pb,
193
                               const Schema& schema,
194
                               PartitionSchema* partition_schema) WARN_UNUSED_RESULT;
195
196
  // Serializes a partition schema into a protobuf message.
197
  void ToPB(PartitionSchemaPB* pb) const;
198
199
  CHECKED_STATUS EncodeRedisKey(const YBPartialRow& row, std::string* buf) const WARN_UNUSED_RESULT;
200
201
  CHECKED_STATUS EncodeRedisKey(const ConstContiguousRow& row,
202
                                std::string* buf) const WARN_UNUSED_RESULT;
203
204
  CHECKED_STATUS EncodeRedisKey(const Slice& slice, std::string* buf) const WARN_UNUSED_RESULT;
205
206
  CHECKED_STATUS EncodeKey(const google::protobuf::RepeatedPtrField<QLExpressionPB>& hash_values,
207
                           std::string* buf) const WARN_UNUSED_RESULT;
208
209
  CHECKED_STATUS EncodeKey(const google::protobuf::RepeatedPtrField<PgsqlExpressionPB>& hash_values,
210
                           std::string* buf) const WARN_UNUSED_RESULT;
211
212
  // Appends the row's encoded partition key into the provided buffer.
213
  // On failure, the buffer may have data partially appended.
214
  CHECKED_STATUS EncodeKey(const YBPartialRow& row, std::string* buf) const WARN_UNUSED_RESULT;
215
216
  // Appends the row's encoded partition key into the provided buffer.
217
  // On failure, the buffer may have data partially appended.
218
  CHECKED_STATUS EncodeKey(const ConstContiguousRow& row, std::string* buf) const
219
    WARN_UNUSED_RESULT;
220
221
  bool IsHashPartitioning() const;
222
223
  YBHashSchema hash_schema() const;
224
225
301M
  bool IsRangePartitioning() const {
226
301M
    return range_schema_.column_ids.size() > 0;
227
301M
  }
228
229
  // Encodes the given uint16 value into a 2 byte string.
230
  static std::string EncodeMultiColumnHashValue(uint16_t hash_value);
231
232
  // Decode the given partition_key to a 2-byte integer.
233
  static uint16_t DecodeMultiColumnHashValue(const std::string& partition_key);
234
235
  // Does [partition_key_start, partition_key_end] form a valid range.
236
  static CHECKED_STATUS IsValidHashPartitionRange(const std::string& partition_key_start,
237
                                                  const std::string& partition_key_end);
238
239
  static bool IsValidHashPartitionKeyBound(const std::string& partition_key);
240
241
  // Encoded (sub)doc keys that belong to partition with partition_key lower bound
242
  // are starting with this prefix or greater than it
243
  static std::string GetEncodedKeyPrefix(
244
    const std::string& partition_key, const PartitionSchemaPB& partition_schema);
245
246
  // YugaByte partition creation
247
  // Creates the set of table partitions using multi column hash schema. In this schema, we divide
248
  // the [0, max_partition_key] range into the requested number of intervals.
249
  // - Inputs are from SQL and CQL.
250
  // - YugaByte methods are used for hash and range partitioning to create tablets.
251
  // 'num_tablets' is just a recommendation for the target number of partitions. The final
252
  // partitions are in the result variable 'partitions', so the final number of tablets is
253
  // 'partitions.size()' value.
254
  //
255
  // TODO(neil) Investigate partitions to support both hash and range schema.
256
  // - First, use range schema to split the table.
257
  // - Second, use hash schema to partition each split.
258
  CHECKED_STATUS CreatePartitions(int32_t num_tablets, std::vector<Partition>* partitions) const;
259
260
  // Kudu partition creation
261
  // NOTE: The following function from Kudu is to support a C++ API instead of SQL or CQL. They
262
  // also create partitions differently. There are code in this function that shouldn't be apply
263
  // to YugaByte's database except for metadata in master, which is using Kudu's DB.
264
  //
265
  // Creates the set of table partitions for a partition schema and collection
266
  // of split rows.
267
  //
268
  // The number of resulting partitions is the product of the number of hash
269
  // buckets for each hash bucket component, multiplied by
270
  // (split_rows.size() + 1).
271
  CHECKED_STATUS CreatePartitions(const std::vector<YBPartialRow>& split_rows,
272
                                  const Schema& schema,
273
                                  std::vector<Partition>* partitions) const WARN_UNUSED_RESULT;
274
275
  // Tests if the partition contains the row.
276
  CHECKED_STATUS PartitionContainsRow(const Partition& partition,
277
                                      const YBPartialRow& row,
278
                                      bool* contains) const WARN_UNUSED_RESULT;
279
280
  // Tests if the partition contains the row.
281
  CHECKED_STATUS PartitionContainsRow(const Partition& partition,
282
                                      const ConstContiguousRow& row,
283
                                      bool* contains) const WARN_UNUSED_RESULT;
284
285
  // Returns a text description of the partition suitable for debug printing.
286
  std::string PartitionDebugString(const Partition& partition, const Schema& schema) const;
287
288
  // Returns a text description of a range partition suitable for debug printing.
289
  std::string RangePartitionDebugString(const Partition& partition, const Schema& schema) const;
290
291
  // Returns a text description of the partial row's partition key suitable for debug printing.
292
  std::string RowDebugString(const YBPartialRow& row) const;
293
294
  // Returns a text description of the row's partition key suitable for debug printing.
295
  std::string RowDebugString(const ConstContiguousRow& row) const;
296
297
  // Returns a text description of the encoded partition key suitable for debug printing.
298
  std::string PartitionKeyDebugString(const std::string& key, const Schema& schema) const;
299
300
  // Returns a text description of this partition schema suitable for debug printing.
301
  std::string DebugString(const Schema& schema) const;
302
303
  // Returns true if the other partition schema is equivalent to this one.
304
  bool Equals(const PartitionSchema& other) const;
305
306
  // Return true if the partitioning scheme simply range-partitions on the full primary key,
307
  // with no bucketing components, etc.
308
  bool IsSimplePKRangePartitioning(const Schema& schema) const;
309
310
 private:
311
312
  struct RangeSplit {
313
2.77k
    explicit RangeSplit(const std::string& bounds) : column_bounds(bounds) {}
314
315
    std::string column_bounds;
316
  };
317
318
  struct RangeSchema {
319
    std::vector<ColumnId> column_ids;
320
    std::vector<RangeSplit> splits;
321
  };
322
323
  struct HashBucketSchema {
324
    std::vector<ColumnId> column_ids;
325
    int32_t num_buckets;
326
    uint32_t seed;
327
  };
328
329
  // Convertion between PB and partition schema.
330
  static CHECKED_STATUS KuduFromPB(const PartitionSchemaPB& pb,
331
                                 const Schema& schema,
332
                                 PartitionSchema* partition_schema);
333
  void KuduToPB(PartitionSchemaPB* pb) const;
334
335
  // Creates the set of table partitions using multi column hash schema. In this schema, we divide
336
  // the [ hash(0), hash(max_partition_key) ] range equally into the requested number of intervals.
337
  CHECKED_STATUS CreateHashPartitions(int32_t num_tablets,
338
                                      std::vector<Partition>* partitions,
339
                                      int32_t max_partition_key = kMaxPartitionKey) const;
340
341
  // Creates the set of table partitions using primary-key range schema. In this schema, we divide
342
  // the table by given ranges in the partitions vector.
343
  CHECKED_STATUS CreateRangePartitions(std::vector<Partition>* partitions) const;
344
345
  // Encodes the specified columns of a row into lexicographic sort-order preserving format.
346
  static CHECKED_STATUS EncodeColumns(const YBPartialRow& row,
347
                                      const std::vector<ColumnId>& column_ids,
348
                                      std::string* buf);
349
350
  // Encodes the specified columns of a row into lexicographic sort-order preserving format.
351
  static CHECKED_STATUS EncodeColumns(const ConstContiguousRow& row,
352
                                      const std::vector<ColumnId>& column_ids,
353
                                      std::string* buf);
354
355
  // Hashes a compound string of all columns into a 16-bit integer.
356
  static uint16_t HashColumnCompoundValue(const std::string& compound);
357
358
  // Encodes the specified columns of a row into 2-byte partition key using the multi column
359
  // hashing scheme.
360
  static CHECKED_STATUS EncodeColumns(const YBPartialRow& row, std::string* buf);
361
362
  // Encodes the specified columns of a row into 2-byte partition key using the multi column
363
  // hashing scheme.
364
  static CHECKED_STATUS EncodeColumns(const ConstContiguousRow& row, std::string* buf);
365
366
  // Assigns the row to a hash bucket according to the hash schema.
367
  template<typename Row>
368
  static CHECKED_STATUS BucketForRow(const Row& row,
369
                                     const HashBucketSchema& hash_bucket_schema,
370
                                     int32_t* bucket);
371
372
  // Private templated helper for PartitionContainsRow.
373
  template<typename Row>
374
  CHECKED_STATUS PartitionContainsRowImpl(const Partition& partition,
375
                                          const Row& row,
376
                                          bool* contains) const;
377
378
  // Appends the stringified range partition components of a partial row to a
379
  // vector.
380
  //
381
  // If any columns of the range partition do not exist in the partial row,
382
  // processing stops and the provided default string piece is appended to the vector.
383
  void AppendRangeDebugStringComponentsOrString(const YBPartialRow& row,
384
                                                GStringPiece default_string,
385
                                                std::vector<std::string>* components) const;
386
387
  // Appends the stringified range partition components of a partial row to a
388
  // vector.
389
  //
390
  // If any columns of the range partition do not exist in the partial row, the
391
  // logical minimum value for that column will be used instead.
392
  void AppendRangeDebugStringComponentsOrMin(const YBPartialRow& row,
393
                                             std::vector<std::string>* components) const;
394
395
  // Decodes a range partition key into a partial row, with variable-length
396
  // fields stored in the arena.
397
  CHECKED_STATUS DecodeRangeKey(Slice* encode_key,
398
                                YBPartialRow* partial_row,
399
                                Arena* arena) const;
400
401
  // Decodes the hash bucket component of a partition key into its buckets.
402
  //
403
  // This should only be called with partition keys created from a row, not with
404
  // partition keys from a partition.
405
  CHECKED_STATUS DecodeHashBuckets(Slice* partition_key, std::vector<int32_t>* buckets) const;
406
407
  // Clears the state of this partition schema.
408
  void Clear();
409
410
  // Validates that this partition schema is valid. Returns OK, or an
411
  // appropriate error code for an invalid partition schema.
412
  CHECKED_STATUS Validate(const Schema& schema) const;
413
414
  std::vector<HashBucketSchema> hash_bucket_schemas_;
415
  RangeSchema range_schema_;
416
  boost::optional<YBHashSchema> hash_schema_; // Defined only for table that is hash-partitioned.
417
};
418
419
} // namespace yb
420
421
#endif // YB_COMMON_PARTITION_H