/Users/deen/code/yugabyte-db/src/yb/docdb/docdb.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #ifndef YB_DOCDB_DOCDB_H_ |
15 | | #define YB_DOCDB_DOCDB_H_ |
16 | | |
17 | | #include <cstdint> |
18 | | #include <ostream> |
19 | | #include <string> |
20 | | #include <vector> |
21 | | |
22 | | #include <boost/function.hpp> |
23 | | |
24 | | #include "yb/common/doc_hybrid_time.h" |
25 | | #include "yb/common/hybrid_time.h" |
26 | | #include "yb/common/read_hybrid_time.h" |
27 | | #include "yb/common/transaction.h" |
28 | | |
29 | | #include "yb/docdb/docdb_fwd.h" |
30 | | #include "yb/docdb/shared_lock_manager_fwd.h" |
31 | | #include "yb/docdb/doc_path.h" |
32 | | #include "yb/docdb/doc_write_batch.h" |
33 | | #include "yb/docdb/docdb.pb.h" |
34 | | #include "yb/docdb/docdb_types.h" |
35 | | #include "yb/docdb/lock_batch.h" |
36 | | #include "yb/docdb/subdocument.h" |
37 | | #include "yb/docdb/value.h" |
38 | | |
39 | | #include "yb/rocksdb/rocksdb_fwd.h" |
40 | | |
41 | | #include "yb/util/result.h" |
42 | | #include "yb/util/strongly_typed_bool.h" |
43 | | |
44 | | // DocDB mapping on top of the key-value map in RocksDB: |
45 | | // <document_key> <hybrid_time> -> <doc_type> |
46 | | // <document_key> <hybrid_time> <key_a> <gen_ts_a> -> <subdoc_a_type_or_value> |
47 | | // |
48 | | // Assuming the type of subdocument corresponding to key_a in the above example is "object", the |
49 | | // contents of that subdocument are stored in a similar way: |
50 | | // <document_key> <hybrid_time> <key_a> <gen_ts_a> <key_aa> <gen_ts_aa> -> <subdoc_aa_type_or_value> |
51 | | // <document_key> <hybrid_time> <key_a> <gen_ts_a> <key_ab> <gen_ts_ab> -> <subdoc_ab_type_or_value> |
52 | | // ... |
53 | | // |
54 | | // See doc_key.h for the encoding of the <document_key> part. |
55 | | // |
56 | | // <key_a>, <key_aa> are subkeys indicating a path inside a document. |
57 | | // Their encoding is as follows: |
58 | | // <value_type> -- one byte, see the ValueType enum. |
59 | | // <value_specific_encoding> -- e.g. a big-endian 8-byte integer, or a string in a "zero encoded" |
60 | | // format. This is empty for null or true/false values. |
61 | | // |
62 | | // <hybrid_time>, <gen_ts_a>, <gen_ts_ab> are "generation hybrid_times" corresponding to hybrid |
63 | | // clock hybrid_times of the last time a particular top-level document / subdocument was fully |
64 | | // overwritten or deleted. |
65 | | // |
66 | | // <subdoc_a_type_or_value>, <subdoc_aa_type_or_value>, <subdoc_ab_type_or_value> are values of the |
67 | | // following form: |
68 | | // - One-byte value type (see the ValueType enum). |
69 | | // - For primitive values, the encoded value. Note: the value encoding may be different from the |
70 | | // key encoding for the same data type. E.g. we only flip the sign bit for signed 64-bit |
71 | | // integers when encoded as part of a RocksDB key, not value. |
72 | | // |
73 | | // Also see this document for a high-level overview of how we lay out JSON documents on top of |
74 | | // RocksDB: |
75 | | // https://docs.google.com/document/d/1uEOHUqGBVkijw_CGD568FMt8UOJdHtiE3JROUOppYBU/edit |
76 | | |
77 | | namespace yb { |
78 | | |
79 | | class Histogram; |
80 | | |
81 | | namespace docdb { |
82 | | |
83 | | class DocOperation; |
84 | | |
85 | | // This function prepares the transaction by taking locks. The locks taken are returned to the |
86 | | // caller in the lock_batch field of the result (because they need to be saved and unlocked when |
87 | | // the transaction commits). The need_read_snapshot field of the result indicates if any of the |
88 | | // write operations requires a clean read snapshot before being applied (see DocOperation). |
89 | | // |
90 | | // Example: doc_write_ops might consist of the following operations: |
91 | | // a.b = {}, a.b.c = 1, a.b.d = 2, e.d = 3 |
92 | | // We will generate all the lock_prefixes for the keys with lock types |
93 | | // a - shared, a.b - exclusive, a - shared, a.b - shared, a.b.c - exclusive ... |
94 | | // Then we will deduplicate the keys and promote shared locks to exclusive, and sort them. |
95 | | // Finally, the locks taken will be in order: |
96 | | // a - shared, a.b - exclusive, a.b.c - exclusive, a.b.d - exclusive, e - shared, e.d - exclusive. |
97 | | // Then the sorted lock key list will be returned. (Type is not returned because it is not needed |
98 | | // for unlocking) |
99 | | // TODO(akashnil): If a.b is exclusive, we don't need to lock any sub-paths under it. |
100 | | // |
101 | | // Input: doc_write_ops |
102 | | // Context: lock_manager |
103 | | |
104 | | struct PrepareDocWriteOperationResult { |
105 | | LockBatch lock_batch; |
106 | | bool need_read_snapshot = false; |
107 | | }; |
108 | | |
109 | | Result<PrepareDocWriteOperationResult> PrepareDocWriteOperation( |
110 | | const std::vector<std::unique_ptr<DocOperation>>& doc_write_ops, |
111 | | const google::protobuf::RepeatedPtrField<KeyValuePairPB>& read_pairs, |
112 | | const scoped_refptr<Histogram>& write_lock_latency, |
113 | | const IsolationLevel isolation_level, |
114 | | const OperationKind operation_kind, |
115 | | const RowMarkType row_mark_type, |
116 | | bool transactional_table, |
117 | | CoarseTimePoint deadline, |
118 | | PartialRangeKeyIntents partial_range_key_intents, |
119 | | SharedLockManager *lock_manager); |
120 | | |
121 | | // This constructs a DocWriteBatch using the given list of DocOperations, reading the previous |
122 | | // state of data from RocksDB when necessary. |
123 | | // |
124 | | // Input: doc_write_ops, read snapshot hybrid_time if requested in PrepareDocWriteOperation(). |
125 | | // Context: rocksdb |
126 | | // Outputs: write_batch (NOTE(review): restart_read_ht is presumably also an output - confirm) |
127 | | CHECKED_STATUS AssembleDocWriteBatch( |
128 | | const std::vector<std::unique_ptr<DocOperation>>& doc_write_ops, |
129 | | CoarseTimePoint deadline, |
130 | | const ReadHybridTime& read_time, |
131 | | const DocDB& doc_db, |
132 | | KeyValueWriteBatchPB* write_batch, |
133 | | InitMarkerBehavior init_marker_behavior, |
134 | | std::atomic<int64_t>* monotonic_counter, |
135 | | HybridTime* restart_read_ht, |
136 | | const std::string& table_name); |
137 | | /* Mapped value of ExternalTxnApplyState: commit hybrid time and write id of one transaction. */ |
138 | | struct ExternalTxnApplyStateData { |
139 | | HybridTime commit_ht; |
140 | | IntraTxnWriteId write_id = 0; |
141 | | |
142 | 0 | std::string ToString() const { |
143 | 0 | return YB_STRUCT_TO_STRING(commit_ht, write_id); |
144 | 0 | } |
145 | | }; |
146 | | /* Maps a transaction id to its ExternalTxnApplyStateData. */ |
147 | | using ExternalTxnApplyState = std::map<TransactionId, ExternalTxnApplyStateData>; |
148 | | |
149 | | // Adds the external pair kv_pair to the write batch. |
150 | | // Returns true if the add was skipped because the pair is a regular (non external) record. |
151 | | bool AddExternalPairToWriteBatch( |
152 | | const KeyValuePairPB& kv_pair, |
153 | | HybridTime hybrid_time, |
154 | | int write_id, |
155 | | ExternalTxnApplyState* apply_external_transactions, |
156 | | rocksdb::WriteBatch* regular_write_batch, |
157 | | rocksdb::WriteBatch* intents_write_batch); |
158 | | |
159 | | // Prepares the external part of a non-transactional write batch. |
160 | | // The batch could contain intents for external transactions; in this case those intents |
161 | | // will be added to intents_write_batch. |
162 | | // |
163 | | // Returns true if the batch contains regular entries. |
164 | | bool PrepareExternalWriteBatch( |
165 | | const docdb::KeyValueWriteBatchPB& put_batch, |
166 | | HybridTime hybrid_time, |
167 | | rocksdb::DB* intents_db, |
168 | | rocksdb::WriteBatch* regular_write_batch, |
169 | | rocksdb::WriteBatch* intents_write_batch); |
170 | | |
171 | | YB_STRONGLY_TYPED_BOOL(LastKey); |
172 | | |
173 | | // Enumerates intents corresponding to provided key value pairs: a strong intent for each key and |
174 | | // a weak intent for each parent of each key. functor should accept 5 arguments: |
175 | | // intent_kind - kind of intent weak or strong |
176 | | // full_doc_key - whether the intent contains a full document key (see FullDocKey below) |
177 | | // value_slice - value of intent |
178 | | // key - pointer to key in format of SubDocKey (no ht) |
179 | | // last_key - whether it is last strong key in enumeration |
180 | | |
181 | | // Indicates that the intent contains a full document key, i.e. it does not omit any final range |
182 | | // components of the document key. This flag is also true for intents that include subdocument keys. |
183 | | YB_STRONGLY_TYPED_BOOL(FullDocKey); |
184 | | |
185 | | // TODO(dtxn) don't expose this method outside of DocDB if TransactionConflictResolver is moved |
186 | | // inside DocDB. |
187 | | // Callback type accepted by EnumerateIntents. |
188 | | // We deliberately use boost::function rather than std::function. Per |
189 | | // https://stackoverflow.com/a/17278470/461529, as of GCC 4.8.1 the std::function in libstdc++ |
190 | | // optimizes only for pointers to functions and methods, so initializing it from any functor |
191 | | // (lambdas included) triggers heap allocation. boost::function doesn't have such issue: |
192 | | // http://www.boost.org/doc/libs/1_65_1/doc/html/function/misc.html |
193 | | using EnumerateIntentsCallback = |
194 | | boost::function<Status(IntentStrength, FullDocKey, Slice, KeyBytes*, LastKey)>; |
195 | | /* Invokes functor for the intents corresponding to each of the given kv_pairs. */ |
196 | | CHECKED_STATUS EnumerateIntents( |
197 | | const google::protobuf::RepeatedPtrField<yb::docdb::KeyValuePairPB>& kv_pairs, |
198 | | const EnumerateIntentsCallback& functor, PartialRangeKeyIntents partial_range_key_intents); |
199 | | /* Overload for a single key / intent_value pair; last_key defaults to LastKey::kFalse. */ |
200 | | CHECKED_STATUS EnumerateIntents( |
201 | | Slice key, const Slice& intent_value, const EnumerateIntentsCallback& functor, |
202 | | KeyBytes* encoded_key_buffer, PartialRangeKeyIntents partial_range_key_intents, |
203 | | LastKey last_key = LastKey::kFalse); |
204 | | |
205 | | // The format of replicated_batches_state does not matter at this point, because it is just |
206 | | // appended to the appropriate value. |
207 | | void PrepareTransactionWriteBatch( |
208 | | const docdb::KeyValueWriteBatchPB& put_batch, |
209 | | HybridTime hybrid_time, |
210 | | rocksdb::WriteBatch* rocksdb_write_batch, |
211 | | const TransactionId& transaction_id, |
212 | | IsolationLevel isolation_level, |
213 | | PartialRangeKeyIntents partial_range_key_intents, |
214 | | const Slice& replicated_batches_state, |
215 | | IntraTxnWriteId* write_id); |
216 | | |
217 | | /* An intent key/value pair with its reverse index key and write id (CDC use, per the name). */ |
218 | | struct IntentKeyValueForCDC { |
219 | | Slice key; |
220 | | Slice value; |
221 | | std::string key_buf, value_buf; |
222 | | std::string reverse_index_key; |
223 | | IntraTxnWriteId write_id = 0; |
224 | | |
225 | | std::string ToString() const; |
226 | | /* Stores key, value, reverse_index_key and write_id into pb through its setters. */ |
227 | | template <class PB> |
228 | | void ToPB(PB* pb) const { |
229 | | pb->set_key(key); |
230 | | pb->set_value(value); |
231 | | pb->set_reverse_index_key(reverse_index_key); |
232 | | pb->set_write_id(write_id); |
233 | | } |
234 | | /* Builds from pb; key_buf/value_buf are not set. NOTE(review): key/value presumably alias pb - confirm lifetime. */ |
235 | | template <class PB> |
236 | | static IntentKeyValueForCDC FromPB(const PB& pb) { |
237 | | return IntentKeyValueForCDC { |
238 | | .key = pb.key(), |
239 | | .value = pb.value(), |
240 | | .reverse_index_key = pb.reverse_index_key(), |
241 | | .write_id = pb.write_id(), |
242 | | }; |
243 | | } |
244 | | }; |
245 | | |
246 | | // See ApplyTransactionStatePB for details. The state is active while key is non-empty. |
247 | | struct ApplyTransactionState { |
248 | | std::string key; |
249 | | IntraTxnWriteId write_id = 0; |
250 | | AbortedSubTransactionSet aborted; |
251 | | |
252 | 1.70M | bool active() const { |
253 | 1.70M | return !key.empty(); |
254 | 1.70M | } |
255 | | |
256 | | std::string ToString() const; |
257 | | /* Stores key, write_id and the aborted set into pb. */ |
258 | | template <class PB> |
259 | 1 | void ToPB(PB* pb) const { |
260 | 1 | pb->set_key(key); |
261 | 1 | pb->set_write_id(write_id); |
262 | 1 | aborted.ToPB(pb->mutable_aborted()->mutable_set()); |
263 | 1 | } |
264 | | /* Builds the state from pb. VERIFY_RESULT presumably returns the error if parsing pb.aborted() fails. */ |
265 | | template <class PB> |
266 | 0 | static Result<ApplyTransactionState> FromPB(const PB& pb) { |
267 | 0 | return ApplyTransactionState { |
268 | 0 | .key = pb.key(), |
269 | 0 | .write_id = pb.write_id(), |
270 | 0 | .aborted = VERIFY_RESULT(AbortedSubTransactionSet::FromPB(pb.aborted().set())), |
271 | 0 | }; |
272 | 0 | } |
273 | | }; |
274 | | /* Presumably loads a batch of transaction_id's intents from intents_db into keyValueIntents - confirm. */ |
275 | | Result<ApplyTransactionState> GetIntentsBatch( |
276 | | const TransactionId& transaction_id, |
277 | | const KeyBounds* key_bounds, |
278 | | const ApplyTransactionState* stream_state, |
279 | | rocksdb::DB* intents_db, |
280 | | std::vector<IntentKeyValueForCDC>* keyValueIntents); |
281 | | /* Presumably appends the key prefix for transaction_id to out - see the definition for the encoding. */ |
282 | | void AppendTransactionKeyPrefix(const TransactionId& transaction_id, docdb::KeyBytes* out); |
283 | | |
284 | | // Interface used while combining external intents into a single key value pair. |
285 | | class ExternalIntentsProvider { |
286 | | public: |
287 | | // Set output key. |
288 | | virtual void SetKey(const Slice& slice) = 0; |
289 | | |
290 | | // Set output value. |
291 | | virtual void SetValue(const Slice& slice) = 0; |
292 | | |
293 | | // Get next external intent, returns an empty optional when there are no more intents. |
294 | | virtual boost::optional<std::pair<Slice, Slice>> Next() = 0; |
295 | | |
296 | | virtual const Uuid& InvolvedTablet() = 0; |
297 | | |
298 | 0 | virtual ~ExternalIntentsProvider() = default; |
299 | | }; |
300 | | |
301 | | // Combines the external intents supplied by provider into a single key value pair. |
302 | | void CombineExternalIntents( |
303 | | const TransactionId& txn_id, |
304 | | ExternalIntentsProvider* provider); |
305 | | |
306 | | } // namespace docdb |
307 | | } // namespace yb |
308 | | |
309 | | #endif // YB_DOCDB_DOCDB_H_ |