/Users/deen/code/yugabyte-db/src/yb/common/partition.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | #ifndef YB_COMMON_PARTITION_H |
33 | | #define YB_COMMON_PARTITION_H |
34 | | |
35 | | #include <algorithm> |
36 | | #include <string> |
37 | | #include <vector> |
38 | | |
39 | | #include <boost/optional/optional.hpp> |
40 | | |
41 | | #include <google/protobuf/repeated_field.h> |
42 | | |
43 | | #include "yb/common/common_fwd.h" |
44 | | #include "yb/common/column_id.h" |
45 | | #include "yb/common/partial_row.h" |
46 | | |
47 | | #include "yb/util/status_fwd.h" |
48 | | #include "yb/util/memory/arena_fwd.h" |
49 | | |
50 | | namespace yb { |
51 | | |
52 | | class ConstContiguousRow; |
53 | | class YBPartialRow; |
54 | | class PartitionSchemaPB; |
55 | | class TypeInfo; |
56 | | |
57 | | enum class YBHashSchema { |
58 | | kMultiColumnHash = 1, // YQL default hashing. |
59 | | kRedisHash = 2, // Redis default hashing. |
60 | | kPgsqlHash = 3 // PGSQL default hashing. |
61 | | }; |
62 | | |
63 | | // A Partition describes the set of rows that a Tablet is responsible for |
64 | | // serving. Each tablet is assigned a single Partition. |
65 | | // |
66 | | // Partitions consist primarily of a start and end partition key. Every row with |
67 | | // a partition key that falls in a Tablet's Partition will be served by that |
68 | | // tablet. |
69 | | // |
70 | | // In addition to the start and end partition keys, a Partition holds metadata |
71 | | // to determine if a scan can prune, or skip, a partition based on the scan's |
72 | | // start and end primary keys, and predicates. |
73 | | class Partition { |
74 | | public: |
75 | | |
76 | 905k | const std::vector<int32_t>& hash_buckets() const { |
77 | 905k | return hash_buckets_; |
78 | 905k | } |
79 | | |
80 | | Slice range_key_start() const; |
81 | | |
82 | | Slice range_key_end() const; |
83 | | |
84 | 12.6M | const std::string& partition_key_start() const { |
85 | 12.6M | return partition_key_start_; |
86 | 12.6M | } |
87 | | |
88 | 34.8M | const std::string& partition_key_end() const { |
89 | 34.8M | return partition_key_end_; |
90 | 34.8M | } |
91 | | |
92 | 51 | void set_partition_key_start(const std::string& partition_key_start) { |
93 | 51 | partition_key_start_ = partition_key_start; |
94 | 51 | } |
95 | | |
96 | 51 | void set_partition_key_end(const std::string& partition_key_end) { |
97 | 51 | partition_key_end_ = partition_key_end; |
98 | 51 | } |
99 | | |
100 | | // Serializes a partition into a protobuf message. |
101 | | void ToPB(PartitionPB* pb) const; |
102 | | |
103 | | // Deserializes a protobuf message into a partition. |
104 | | // |
105 | | // The protobuf message is not validated, since partitions are only expected |
106 | | // to be created by the master process. |
107 | | static void FromPB(const PartitionPB& pb, Partition* partition); |
108 | | |
109 | 11.0M | bool ContainsKey(const std::string& partition_key) const { |
110 | 11.0M | return partition_key >= partition_key_start() && |
111 | 11.0M | (partition_key_end().empty() || partition_key < partition_key_end()); |
112 | 11.0M | } |
113 | | |
114 | | // Checks if this partition is a superset or is exactly the same as another partition. |
115 | | template <class T> |
116 | | bool ContainsPartition(const T& other) const { |
117 | | return other.partition_key_start() >= partition_key_start() && |
118 | | (partition_key_end().empty() || (!other.partition_key_end().empty() && |
119 | | other.partition_key_end() <= partition_key_end())); |
120 | | } |
121 | | |
122 | | // Checks if this and another partitions have equal bounds. |
123 | | template <class T> |
124 | 2 | bool BoundsEqualToPartition(const T& other) const { |
125 | 2 | return partition_key_start() == other.partition_key_start() && |
126 | 2 | partition_key_end() == other.partition_key_end(); |
127 | 2 | } |
128 | | |
129 | | // Checks if this partition is a strict superset of another partition (shouldn't be exactly the |
130 | | // same). |
131 | | template <class T> |
132 | | bool ContainsPartitionStrict(const T& other) const { |
133 | | return ContainsPartition(other) && !BoundsEqualToPartition(other); |
134 | | } |
135 | | |
136 | | std::string ToString() const; |
137 | | |
138 | | private: |
139 | | friend class PartitionSchema; |
140 | | |
141 | | // Helper function for accessing the range key portion of a partition key. |
142 | | Slice range_key(const std::string& partition_key) const; |
143 | | |
144 | | std::vector<int32_t> hash_buckets_; |
145 | | |
146 | | std::string partition_key_start_; |
147 | | std::string partition_key_end_; |
148 | | }; |
149 | | |
150 | | // A partition schema describes how the rows of a table are distributed among |
151 | | // tablets. |
152 | | // |
153 | | // Primarily, a table's partition schema is responsible for translating the |
154 | | // primary key column values of a row into a partition key that can be used to |
155 | | // determine the tablet containing the key. |
156 | | // |
157 | | // The partition schema is made up of zero or more hash bucket components, |
158 | | // followed by a single range component. |
159 | | // |
160 | | // Each hash bucket component includes one or more columns from the primary key |
161 | | // column set, with the restriction that an individual primary key column may |
162 | | // only be included in a single hash component. |
163 | | // |
164 | | // To determine the hash bucket of an individual row, the values of the columns |
165 | | // of the hash component are encoded into bytes (in PK or lexicographic |
166 | | // preserving encoding), then hashed into a u64, then modded into an i32. When |
167 | | // constructing a partition key from a row, the buckets of the row are simply |
168 | | // encoded into the partition key in order (again in PK or lexicographic |
169 | | // preserving encoding). |
170 | | // |
171 | | // The range component contains a (possibly full or empty) subset of the primary |
172 | | // key columns. When encoding the partition key, the columns of the partition |
173 | | // component are encoded in order. |
174 | | // |
175 | | // The above is true of the relationship between rows and partition keys. It |
176 | | // gets trickier with partitions (tablet partition key boundaries), because the |
177 | | // boundaries of tablets do not necessarily align to rows. For instance, |
178 | | // currently the absolute-start and absolute-end primary keys of a table |
179 | | // represented as an empty key, but do not have a corresponding row. Partitions |
180 | | // are similar, but instead of having just one absolute-start and absolute-end, |
181 | | // each component of a partition schema has an absolute-start and absolute-end. |
182 | | // When creating the initial set of partitions during table creation, we deal |
183 | | // with this by "carrying through" absolute-start or absolute-ends into lower |
184 | | // significance components. |
185 | | class PartitionSchema { |
186 | | public: |
187 | | |
188 | | static constexpr int32_t kPartitionKeySize = 2; |
189 | | static constexpr int32_t kMaxPartitionKey = std::numeric_limits<uint16_t>::max(); |
190 | | |
191 | | // Deserializes a protobuf message into a partition schema. |
192 | | static CHECKED_STATUS FromPB(const PartitionSchemaPB& pb, |
193 | | const Schema& schema, |
194 | | PartitionSchema* partition_schema) WARN_UNUSED_RESULT; |
195 | | |
196 | | // Serializes a partition schema into a protobuf message. |
197 | | void ToPB(PartitionSchemaPB* pb) const; |
198 | | |
199 | | CHECKED_STATUS EncodeRedisKey(const YBPartialRow& row, std::string* buf) const WARN_UNUSED_RESULT; |
200 | | |
201 | | CHECKED_STATUS EncodeRedisKey(const ConstContiguousRow& row, |
202 | | std::string* buf) const WARN_UNUSED_RESULT; |
203 | | |
204 | | CHECKED_STATUS EncodeRedisKey(const Slice& slice, std::string* buf) const WARN_UNUSED_RESULT; |
205 | | |
206 | | CHECKED_STATUS EncodeKey(const google::protobuf::RepeatedPtrField<QLExpressionPB>& hash_values, |
207 | | std::string* buf) const WARN_UNUSED_RESULT; |
208 | | |
209 | | CHECKED_STATUS EncodeKey(const google::protobuf::RepeatedPtrField<PgsqlExpressionPB>& hash_values, |
210 | | std::string* buf) const WARN_UNUSED_RESULT; |
211 | | |
212 | | // Appends the row's encoded partition key into the provided buffer. |
213 | | // On failure, the buffer may have data partially appended. |
214 | | CHECKED_STATUS EncodeKey(const YBPartialRow& row, std::string* buf) const WARN_UNUSED_RESULT; |
215 | | |
216 | | // Appends the row's encoded partition key into the provided buffer. |
217 | | // On failure, the buffer may have data partially appended. |
218 | | CHECKED_STATUS EncodeKey(const ConstContiguousRow& row, std::string* buf) const |
219 | | WARN_UNUSED_RESULT; |
220 | | |
221 | | bool IsHashPartitioning() const; |
222 | | |
223 | | YBHashSchema hash_schema() const; |
224 | | |
225 | 140M | bool IsRangePartitioning() const { |
226 | 140M | return range_schema_.column_ids.size() > 0; |
227 | 140M | } |
228 | | |
229 | | // Encodes the given uint16 value into a 2 byte string. |
230 | | static std::string EncodeMultiColumnHashValue(uint16_t hash_value); |
231 | | |
232 | | // Decode the given partition_key to a 2-byte integer. |
233 | | static uint16_t DecodeMultiColumnHashValue(const std::string& partition_key); |
234 | | |
235 | | // Does [partition_key_start, partition_key_end] form a valid range. |
236 | | static CHECKED_STATUS IsValidHashPartitionRange(const std::string& partition_key_start, |
237 | | const std::string& partition_key_end); |
238 | | |
239 | | static bool IsValidHashPartitionKeyBound(const std::string& partition_key); |
240 | | |
241 | | // Encoded (sub)doc keys that belong to partition with partition_key lower bound |
242 | | // are starting with this prefix or greater than it |
243 | | static std::string GetEncodedKeyPrefix( |
244 | | const std::string& partition_key, const PartitionSchemaPB& partition_schema); |
245 | | |
246 | | // YugaByte partition creation |
247 | | // Creates the set of table partitions using multi column hash schema. In this schema, we divide |
248 | | // the [0, max_partition_key] range into the requested number of intervals. |
249 | | // - Inputs are from SQL and CQL. |
250 | | // - YugaByte methods are used for hash and range partitioning to create tablets. |
251 | | // 'num_tablets' is just a recommendation for the target number of partitions. The final |
252 | | // partitions are in the result variable 'partitions', so the final number of tablets is |
253 | | // 'partitions.size()' value. |
254 | | // |
255 | | // TODO(neil) Investigate partitions to support both hash and range schema. |
256 | | // - First, use range schema to split the table. |
257 | | // - Second, use hash schema to partition each split. |
258 | | CHECKED_STATUS CreatePartitions(int32_t num_tablets, std::vector<Partition>* partitions) const; |
259 | | |
260 | | // Kudu partition creation |
261 | | // NOTE: The following function from Kudu is to support a C++ API instead of SQL or CQL. They |
262 | | // also create partitions differently. There are code in this function that shouldn't be apply |
263 | | // to YugaByte's database except for metadata in master, which is using Kudu's DB. |
264 | | // |
265 | | // Creates the set of table partitions for a partition schema and collection |
266 | | // of split rows. |
267 | | // |
268 | | // The number of resulting partitions is the product of the number of hash |
269 | | // buckets for each hash bucket component, multiplied by |
270 | | // (split_rows.size() + 1). |
271 | | CHECKED_STATUS CreatePartitions(const std::vector<YBPartialRow>& split_rows, |
272 | | const Schema& schema, |
273 | | std::vector<Partition>* partitions) const WARN_UNUSED_RESULT; |
274 | | |
275 | | // Tests if the partition contains the row. |
276 | | CHECKED_STATUS PartitionContainsRow(const Partition& partition, |
277 | | const YBPartialRow& row, |
278 | | bool* contains) const WARN_UNUSED_RESULT; |
279 | | |
280 | | // Tests if the partition contains the row. |
281 | | CHECKED_STATUS PartitionContainsRow(const Partition& partition, |
282 | | const ConstContiguousRow& row, |
283 | | bool* contains) const WARN_UNUSED_RESULT; |
284 | | |
285 | | // Returns a text description of the partition suitable for debug printing. |
286 | | std::string PartitionDebugString(const Partition& partition, const Schema& schema) const; |
287 | | |
288 | | // Returns a text description of a range partition suitable for debug printing. |
289 | | std::string RangePartitionDebugString(const Partition& partition, const Schema& schema) const; |
290 | | |
291 | | // Returns a text description of the partial row's partition key suitable for debug printing. |
292 | | std::string RowDebugString(const YBPartialRow& row) const; |
293 | | |
294 | | // Returns a text description of the row's partition key suitable for debug printing. |
295 | | std::string RowDebugString(const ConstContiguousRow& row) const; |
296 | | |
297 | | // Returns a text description of the encoded partition key suitable for debug printing. |
298 | | std::string PartitionKeyDebugString(const std::string& key, const Schema& schema) const; |
299 | | |
300 | | // Returns a text description of this partition schema suitable for debug printing. |
301 | | std::string DebugString(const Schema& schema) const; |
302 | | |
303 | | // Returns true if the other partition schema is equivalent to this one. |
304 | | bool Equals(const PartitionSchema& other) const; |
305 | | |
306 | | // Return true if the partitioning scheme simply range-partitions on the full primary key, |
307 | | // with no bucketing components, etc. |
308 | | bool IsSimplePKRangePartitioning(const Schema& schema) const; |
309 | | |
310 | | private: |
311 | | |
312 | | struct RangeSplit { |
313 | 220 | explicit RangeSplit(const std::string& bounds) : column_bounds(bounds) {} |
314 | | |
315 | | std::string column_bounds; |
316 | | }; |
317 | | |
318 | | struct RangeSchema { |
319 | | std::vector<ColumnId> column_ids; |
320 | | std::vector<RangeSplit> splits; |
321 | | }; |
322 | | |
323 | | struct HashBucketSchema { |
324 | | std::vector<ColumnId> column_ids; |
325 | | int32_t num_buckets; |
326 | | uint32_t seed; |
327 | | }; |
328 | | |
329 | | // Convertion between PB and partition schema. |
330 | | static CHECKED_STATUS KuduFromPB(const PartitionSchemaPB& pb, |
331 | | const Schema& schema, |
332 | | PartitionSchema* partition_schema); |
333 | | void KuduToPB(PartitionSchemaPB* pb) const; |
334 | | |
335 | | // Creates the set of table partitions using multi column hash schema. In this schema, we divide |
336 | | // the [ hash(0), hash(max_partition_key) ] range equally into the requested number of intervals. |
337 | | CHECKED_STATUS CreateHashPartitions(int32_t num_tablets, |
338 | | std::vector<Partition>* partitions, |
339 | | int32_t max_partition_key = kMaxPartitionKey) const; |
340 | | |
341 | | // Creates the set of table partitions using primary-key range schema. In this schema, we divide |
342 | | // the table by given ranges in the partitions vector. |
343 | | CHECKED_STATUS CreateRangePartitions(std::vector<Partition>* partitions) const; |
344 | | |
345 | | // Encodes the specified columns of a row into lexicographic sort-order preserving format. |
346 | | static CHECKED_STATUS EncodeColumns(const YBPartialRow& row, |
347 | | const std::vector<ColumnId>& column_ids, |
348 | | std::string* buf); |
349 | | |
350 | | // Encodes the specified columns of a row into lexicographic sort-order preserving format. |
351 | | static CHECKED_STATUS EncodeColumns(const ConstContiguousRow& row, |
352 | | const std::vector<ColumnId>& column_ids, |
353 | | std::string* buf); |
354 | | |
355 | | // Hashes a compound string of all columns into a 16-bit integer. |
356 | | static uint16_t HashColumnCompoundValue(const std::string& compound); |
357 | | |
358 | | // Encodes the specified columns of a row into 2-byte partition key using the multi column |
359 | | // hashing scheme. |
360 | | static CHECKED_STATUS EncodeColumns(const YBPartialRow& row, std::string* buf); |
361 | | |
362 | | // Encodes the specified columns of a row into 2-byte partition key using the multi column |
363 | | // hashing scheme. |
364 | | static CHECKED_STATUS EncodeColumns(const ConstContiguousRow& row, std::string* buf); |
365 | | |
366 | | // Assigns the row to a hash bucket according to the hash schema. |
367 | | template<typename Row> |
368 | | static CHECKED_STATUS BucketForRow(const Row& row, |
369 | | const HashBucketSchema& hash_bucket_schema, |
370 | | int32_t* bucket); |
371 | | |
372 | | // Private templated helper for PartitionContainsRow. |
373 | | template<typename Row> |
374 | | CHECKED_STATUS PartitionContainsRowImpl(const Partition& partition, |
375 | | const Row& row, |
376 | | bool* contains) const; |
377 | | |
378 | | // Appends the stringified range partition components of a partial row to a |
379 | | // vector. |
380 | | // |
381 | | // If any columns of the range partition do not exist in the partial row, |
382 | | // processing stops and the provided default string piece is appended to the vector. |
383 | | void AppendRangeDebugStringComponentsOrString(const YBPartialRow& row, |
384 | | GStringPiece default_string, |
385 | | std::vector<std::string>* components) const; |
386 | | |
387 | | // Appends the stringified range partition components of a partial row to a |
388 | | // vector. |
389 | | // |
390 | | // If any columns of the range partition do not exist in the partial row, the |
391 | | // logical minimum value for that column will be used instead. |
392 | | void AppendRangeDebugStringComponentsOrMin(const YBPartialRow& row, |
393 | | std::vector<std::string>* components) const; |
394 | | |
395 | | // Decodes a range partition key into a partial row, with variable-length |
396 | | // fields stored in the arena. |
397 | | CHECKED_STATUS DecodeRangeKey(Slice* encode_key, |
398 | | YBPartialRow* partial_row, |
399 | | Arena* arena) const; |
400 | | |
401 | | // Decodes the hash bucket component of a partition key into its buckets. |
402 | | // |
403 | | // This should only be called with partition keys created from a row, not with |
404 | | // partition keys from a partition. |
405 | | CHECKED_STATUS DecodeHashBuckets(Slice* partition_key, std::vector<int32_t>* buckets) const; |
406 | | |
407 | | // Clears the state of this partition schema. |
408 | | void Clear(); |
409 | | |
410 | | // Validates that this partition schema is valid. Returns OK, or an |
411 | | // appropriate error code for an invalid partition schema. |
412 | | CHECKED_STATUS Validate(const Schema& schema) const; |
413 | | |
414 | | std::vector<HashBucketSchema> hash_bucket_schemas_; |
415 | | RangeSchema range_schema_; |
416 | | boost::optional<YBHashSchema> hash_schema_; // Defined only for table that is hash-partitioned. |
417 | | }; |
418 | | |
419 | | } // namespace yb |
420 | | |
421 | | #endif // YB_COMMON_PARTITION_H |