/Users/deen/code/yugabyte-db/src/yb/docdb/subdoc_reader.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #include "yb/docdb/subdoc_reader.h" |
15 | | |
16 | | #include <cstddef> |
17 | | #include <memory> |
18 | | #include <string> |
19 | | #include <vector> |
20 | | |
21 | | #include <glog/logging.h> |
22 | | |
23 | | #include "yb/common/doc_hybrid_time.h" |
24 | | #include "yb/common/hybrid_time.h" |
25 | | #include "yb/common/transaction.h" |
26 | | #include "yb/common/typedefs.h" |
27 | | |
28 | | #include "yb/docdb/deadline_info.h" |
29 | | #include "yb/docdb/doc_key.h" |
30 | | #include "yb/docdb/doc_ttl_util.h" |
31 | | #include "yb/docdb/expiration.h" |
32 | | #include "yb/docdb/intent_aware_iterator.h" |
33 | | #include "yb/docdb/key_bytes.h" |
34 | | #include "yb/docdb/primitive_value.h" |
35 | | #include "yb/docdb/subdocument.h" |
36 | | #include "yb/docdb/value.h" |
37 | | #include "yb/docdb/value_type.h" |
38 | | |
39 | | #include "yb/gutil/integral_types.h" |
40 | | #include "yb/gutil/macros.h" |
41 | | |
42 | | #include "yb/server/hybrid_clock.h" |
43 | | |
44 | | #include "yb/util/monotime.h" |
45 | | #include "yb/util/result.h" |
46 | | #include "yb/util/status.h" |
47 | | #include "yb/util/status_format.h" |
48 | | |
49 | | using std::vector; |
50 | | |
51 | | using yb::HybridTime; |
52 | | |
53 | | namespace yb { |
54 | | namespace docdb { |
55 | | |
56 | | namespace { |
57 | | |
58 | | Expiration GetNewExpiration( |
59 | | const Expiration& parent_exp, const MonoDelta& ttl, |
60 | 69.7M | const DocHybridTime& new_write_time) { |
61 | 69.7M | Expiration new_exp = parent_exp; |
62 | | // We may need to update the TTL in individual columns. |
63 | 69.7M | if (new_write_time.hybrid_time() >= new_exp.write_ht) { |
64 | | // We want to keep the default TTL otherwise. |
65 | 69.7M | if (ttl != ValueControlFields::kMaxTtl) { |
66 | 98.1k | new_exp.write_ht = new_write_time.hybrid_time(); |
67 | 98.1k | new_exp.ttl = ttl; |
68 | 69.6M | } else if (new_exp.ttl.IsNegative()) { |
69 | 0 | new_exp.ttl = -new_exp.ttl; |
70 | 0 | } |
71 | 69.7M | } |
72 | | |
73 | | // If the hybrid time is kMin, then we must be using default TTL. |
74 | 69.7M | if (new_exp.write_ht == HybridTime::kMin) { |
75 | 69.5M | new_exp.write_ht = new_write_time.hybrid_time(); |
76 | 69.5M | } |
77 | | |
78 | 69.7M | return new_exp; |
79 | 69.7M | } |
80 | | |
81 | | } // namespace |
82 | | |
83 | | ObsolescenceTracker::ObsolescenceTracker(DocHybridTime write_time_watermark): |
84 | 740M | write_time_watermark_(write_time_watermark) {} |
85 | | |
86 | | ObsolescenceTracker::ObsolescenceTracker( |
87 | | const ReadHybridTime& read_time, DocHybridTime write_time_watermark, Expiration expiration): |
88 | 97.9M | write_time_watermark_(write_time_watermark), read_time_(read_time), expiration_(expiration) {} |
89 | | |
90 | 91.3M | const DocHybridTime& ObsolescenceTracker::GetHighWriteTime() { return write_time_watermark_; } |
91 | | |
92 | 674M | bool ObsolescenceTracker::IsObsolete(const DocHybridTime& write_time) const { |
93 | 674M | if (expiration_.has_value()) { |
94 | 65.4M | DCHECK(read_time_.has_value()); |
95 | 65.4M | if (HasExpiredTTL(expiration_.value().write_ht, expiration_.value().ttl, |
96 | 65.4M | read_time_.value().read)) { |
97 | 17.7k | return true; |
98 | 17.7k | } |
99 | 65.4M | } |
100 | 674M | return write_time < write_time_watermark_; |
101 | 674M | } |
102 | | |
103 | | ObsolescenceTracker ObsolescenceTracker::Child( |
104 | 745M | const DocHybridTime& write_time, const MonoDelta& ttl) const { |
105 | 745M | auto new_write_time_watermark = std::max(write_time, write_time_watermark_); |
106 | 745M | if (expiration_.has_value()) { |
107 | 69.7M | DCHECK(read_time_.has_value()); |
108 | 69.7M | return ObsolescenceTracker( |
109 | 69.7M | read_time_.value(), new_write_time_watermark, |
110 | 69.7M | GetNewExpiration(expiration_.value(), ttl, write_time)); |
111 | 69.7M | } |
112 | 675M | return ObsolescenceTracker(new_write_time_watermark); |
113 | 745M | } |
114 | | |
115 | 82.8M | ObsolescenceTracker ObsolescenceTracker::Child(const DocHybridTime& write_time) const { |
116 | 82.8M | auto new_write_time_watermark = std::max(write_time, write_time_watermark_); |
117 | 82.8M | if (expiration_.has_value()) { |
118 | 20.7M | return ObsolescenceTracker(read_time_.value(), new_write_time_watermark, expiration_.get()); |
119 | 20.7M | } |
120 | 62.1M | return ObsolescenceTracker(new_write_time_watermark); |
121 | 82.8M | } |
122 | | |
123 | | boost::optional<uint64_t> ObsolescenceTracker::GetTtlRemainingSeconds( |
124 | 613M | const HybridTime& ttl_write_time) const { |
125 | 613M | if (!expiration_.has_value()) { |
126 | 548M | return boost::none; |
127 | 548M | } |
128 | | |
129 | 65.2M | DCHECK(read_time_.has_value()); |
130 | | |
131 | 65.2M | auto ttl_value = expiration_.value().ttl; |
132 | 65.2M | auto doc_read_time = read_time_.value(); |
133 | | |
134 | 65.2M | if (ttl_value == ValueControlFields::kMaxTtl) { |
135 | 41.0M | return -1; |
136 | 41.0M | } |
137 | 24.2M | int64_t time_since_ttl_write_seconds = ( |
138 | 24.2M | server::HybridClock::GetPhysicalValueMicros(doc_read_time.read) - |
139 | 24.2M | server::HybridClock::GetPhysicalValueMicros(ttl_write_time)) / |
140 | 24.2M | MonoTime::kMicrosecondsPerSecond; |
141 | 24.2M | int64_t ttl_value_seconds = ttl_value.ToMilliseconds() / MonoTime::kMillisecondsPerSecond; |
142 | 24.2M | int64_t ttl_remaining_seconds = ttl_value_seconds - time_since_ttl_write_seconds; |
143 | 24.2M | return std::max(static_cast<int64_t>(0), ttl_remaining_seconds); |
144 | 65.2M | } |
145 | | |
146 | | namespace { |
147 | | |
148 | | // This class wraps access to a SubDocument instance specified in one of two ways: |
149 | | // (1) -- From a pointer to an already existing instance of a SubDocument |
150 | | // (2) -- From a pointer to an existing instance of LazySubDocumentHolder which represents the |
151 | | // "parent" of the SubDocument accessed via this LazySubDocumentHolder. |
152 | | // |
153 | | // Using this class, a SubDocument which is *specified* via (2) above will not be constructed unless |
154 | | // it is accessed, either directly or via a child LazySubDocumentHolder instance. This class allows |
155 | | // us to provide a unified interface to the reading/constructing code which may determine if a |
156 | | // particular SubDocument is live based on deferred knowledge of whether any of its children are |
157 | | // live. |
158 | | class LazySubDocumentHolder { |
159 | | public: |
160 | | // Specifies the SubDocument "target" provided. The pointer must be valid for the lifetime of this |
161 | | // instance. The provided key Slice must be valid and the underlying data should not change during |
162 | | // the lifetime of this instance. |
163 | 697M | LazySubDocumentHolder(SubDocument* target, Slice key) : target_(target), key_(key) { |
164 | 697M | DCHECK(target_); |
165 | 697M | } |
166 | | |
167 | | // Specifies a SubDocument with key "key" lazily constructed from "parent". Provided parent must |
168 | | // be valid and remain at the same address for the lifetime of this instance. The provided key |
169 | | // Slice must be valid and the underlying data should not change during the lifetime of this |
170 | | // instance. |
171 | 51.1M | LazySubDocumentHolder(LazySubDocumentHolder* parent, Slice key) : parent_(parent), key_(key) { |
172 | 51.1M | DCHECK(parent_); |
173 | 51.1M | } |
174 | | |
175 | | // Returns true if the SubDocument specified by this instance exists. |
176 | 178M | bool IsConstructed() const { return target_; } |
177 | | |
178 | | // Get a pointer to the SubDocument specified by this instance. Construct that SubDocument first |
179 | | // if it has not yet been constructed. |
180 | | Result<SubDocument*> Get(); |
181 | | |
182 | | private: |
183 | | // The constructed SubDocument specified by this instance. |
184 | | SubDocument* target_ = nullptr; |
185 | | |
186 | | // A pointer to the parent_ of the specified SubDocument. This must be non-null unless this |
187 | | // instance was constructed via a concrete SubDocument*. |
188 | | LazySubDocumentHolder* const parent_ = nullptr; |
189 | | |
190 | | // The key of the specified SubDocument. |
191 | | Slice key_; |
192 | | }; |
193 | | |
194 | 711M | Result<SubDocument*> LazySubDocumentHolder::Get() { |
195 | | // If target_ is not null, just return it. Otherwise, we must construct the SubDocument from its |
196 | | // parent. |
197 | 711M | if (target_) { |
198 | 705M | return target_; |
199 | 705M | } |
200 | | |
201 | | // Presumably, the parent_ key is a prefix of key_, otherwise it's not a valid parent. |
202 | 18.4E | DCHECK(key_.starts_with(parent_->key_)) |
203 | 18.4E | << "Attempting to construct SubDocument for key: " << SubDocKey::DebugSliceToString(key_) |
204 | 18.4E | << " from parent whose key: " << SubDocKey::DebugSliceToString(parent_->key_) |
205 | 18.4E | << " is not a prefix of child key"; |
206 | | // This code takes each subdoc key part after parent_ in key_ and traverses from the parent_ |
207 | | // SubDocument to a SubDocument created via successive calls to GetOrAddChild. The SubDocument |
208 | | // returned is owned by and borrowed from parent_. |
209 | 5.89M | SubDocument* current = VERIFY_RESULT(parent_->Get()); |
210 | 0 | Slice temp = key_; |
211 | 5.89M | temp.remove_prefix(parent_->key_.size()); |
212 | 6.07M | for (;;) { |
213 | 6.07M | PrimitiveValue child_key_part; |
214 | 6.07M | RETURN_NOT_OK(child_key_part.DecodeFromKey(&temp)); |
215 | 6.07M | current = current->GetOrAddChild(child_key_part).first; |
216 | 6.07M | if (temp.empty()) { |
217 | 6.07M | target_ = current; |
218 | 6.07M | return target_; |
219 | 6.07M | } |
220 | 6.07M | } |
221 | 18.4E | return STATUS( |
222 | 5.89M | InternalError, |
223 | 5.89M | "We return this status at the end of a function with a terminal infinite loop. We should " |
224 | 5.89M | "never get here."); |
225 | 5.89M | } |
226 | | |
227 | | // This class provides a wrapper to access data corresponding to a RocksDB row. |
228 | | class DocDbRowData { |
229 | | public: |
230 | | DocDbRowData(const Slice& key, const DocHybridTime& write_time, Value&& value); |
231 | | |
232 | | static Result<std::unique_ptr<DocDbRowData>> CurrentRow(IntentAwareIterator* iter); |
233 | | |
234 | 1.53G | const KeyBytes& key() const { return target_key_; } |
235 | | |
236 | 1.35G | const Value& value() const { return value_; } |
237 | | |
238 | 2.63G | const DocHybridTime& write_time() const { return write_time_; } |
239 | | |
240 | 746M | bool IsTombstone() const { return value_.value_type() == ValueType::kTombstone; } |
241 | | |
242 | 619M | bool IsCollection() const { return IsCollectionType(value_.value_type()); } |
243 | | |
244 | 741M | bool IsPrimitiveValue() const { return IsPrimitiveValueType(value_.value_type()); } |
245 | | |
246 | 679M | PrimitiveValue* mutable_primitive_value() { return value_.mutable_primitive_value(); } |
247 | | |
248 | | private: |
249 | | const KeyBytes target_key_; |
250 | | const DocHybridTime write_time_; |
251 | | Value value_; |
252 | | |
253 | | DISALLOW_COPY_AND_ASSIGN(DocDbRowData); |
254 | | }; |
255 | | |
256 | | DocDbRowData::DocDbRowData( |
257 | | const Slice& key, const DocHybridTime& write_time, Value&& value): |
258 | 741M | target_key_(std::move(key)), write_time_(std::move(write_time)), value_(std::move(value)) {} |
259 | | |
260 | 746M | Result<std::unique_ptr<DocDbRowData>> DocDbRowData::CurrentRow(IntentAwareIterator* iter) { |
261 | 746M | auto key_data = VERIFY_RESULT(iter->FetchKey()); |
262 | 852k | DCHECK(key_data.same_transaction || |
263 | 852k | iter->read_time().global_limit >= key_data.write_time.hybrid_time()) |
264 | 852k | << "Bad key: " << SubDocKey::DebugSliceToString(key_data.key) |
265 | 852k | << ", global limit: " << iter->read_time().global_limit |
266 | 852k | << ", write time: " << key_data.write_time.hybrid_time(); |
267 | 746M | Value value; |
268 | | // TODO -- we could optimize by decoding directly into a SubDocument instance on the heap which |
269 | | // could be later bound to our result SubDocument. This could work if e.g. Value could be |
270 | | // initialized with a PrimitiveValue*. |
271 | 746M | RETURN_NOT_OK(value.Decode(iter->value())); |
272 | | |
273 | 746M | if (key_data.write_time == DocHybridTime::kMin) { |
274 | 0 | return STATUS(Corruption, "No hybrid timestamp found on entry"); |
275 | 0 | } |
276 | | |
277 | 746M | return std::make_unique<DocDbRowData>(key_data.key, key_data.write_time, std::move(value)); |
278 | 746M | } |
279 | | |
280 | | // This class provides a convenience handle for modifying a SubDocument specified by a provided |
281 | | // LazySubDocumentHolder. Importantly, it is responsible for the semantics of when a |
282 | | // LazySubDocumentHolder should be realized. Notably, it does *not* construct the specified |
283 | | // SubDocument if it is instructed to store a tombstone, and it will only modify the SubDocument |
284 | | // value if it already has been constructed. |
285 | | class DocDbRowAssembler { |
286 | | public: |
287 | 694M | DocDbRowAssembler(SubDocument* target, Slice key) : root_(target, key) {} |
288 | | |
289 | | DocDbRowAssembler(DocDbRowAssembler* parent_assembler, Slice key): |
290 | 51.1M | root_(&parent_assembler->root_, key) {} |
291 | | |
292 | | CHECKED_STATUS SetEmptyCollection(); |
293 | | |
294 | | CHECKED_STATUS SetTombstone(); |
295 | | |
296 | | CHECKED_STATUS SetPrimitiveValue(DocDbRowData* row); |
297 | | |
298 | | Result<bool> HasStoredValue(); |
299 | | |
300 | | private: |
301 | | LazySubDocumentHolder root_; |
302 | | bool is_tombstoned_ = false; |
303 | | |
304 | | DISALLOW_COPY_AND_ASSIGN(DocDbRowAssembler); |
305 | | }; |
306 | | |
307 | 4.17M | Status DocDbRowAssembler::SetEmptyCollection() { |
308 | 4.17M | auto* subdoc = VERIFY_RESULT(root_.Get()); |
309 | 0 | *subdoc = SubDocument(); |
310 | 4.17M | return Status::OK(); |
311 | 4.17M | } |
312 | | |
313 | 127M | Status DocDbRowAssembler::SetTombstone() { |
314 | 127M | if (!root_.IsConstructed()) { |
315 | | // Do not construct a child subdocument from the parent if it is not constructed, since this is |
316 | | // a tombstone. |
317 | 45.1M | return Status::OK(); |
318 | 45.1M | } |
319 | 82.4M | auto* subdoc = VERIFY_RESULT(root_.Get()); |
320 | 0 | *subdoc = SubDocument(ValueType::kTombstone); |
321 | 82.4M | is_tombstoned_ = true; |
322 | 82.4M | return Status::OK(); |
323 | 82.4M | } |
324 | | |
325 | 613M | Status DocDbRowAssembler::SetPrimitiveValue(DocDbRowData* row) { |
326 | | // TODO -- this interface with a non-const row pointer is not ideal. It's awkward to allow the |
327 | | // DocDbRowAssembler to modify the DocDbRowData's state. In the future, it might make more |
328 | | // sense to have ScopedDocDbRowContext orchestrate coordination between these classes. |
329 | | // TODO -- we currently modify the DocDbRowData's mutable primitive_value, and then make |
330 | | // a copy here to store onto the SubDocument. This should be made more efficient, especially |
331 | | // since it's on the critical path of all reads. |
332 | 613M | auto* subdoc = VERIFY_RESULT(root_.Get()); |
333 | | |
334 | 0 | auto* mutable_primitive_value = row->mutable_primitive_value(); |
335 | | |
336 | 613M | if (row->value().has_user_timestamp()) { |
337 | 87 | mutable_primitive_value->SetWriteTime(row->value().user_timestamp()); |
338 | 613M | } else { |
339 | 613M | mutable_primitive_value->SetWriteTime(row->write_time().hybrid_time().GetPhysicalValueMicros()); |
340 | 613M | } |
341 | 613M | *subdoc = SubDocument(*mutable_primitive_value); |
342 | 613M | return Status::OK(); |
343 | 613M | } |
344 | | |
345 | 51.1M | Result<bool> DocDbRowAssembler::HasStoredValue() { |
346 | 51.1M | if (!root_.IsConstructed()) { |
347 | 45.1M | return false; |
348 | 45.1M | } |
349 | 6.07M | auto* subdoc = VERIFY_RESULT(root_.Get()); |
350 | 6.07M | return subdoc->value_type() != ValueType::kInvalid |
351 | 6.07M | && subdoc->value_type() != ValueType::kTombstone6.07M ; |
352 | 6.07M | } |
353 | | |
354 | | class ScopedDocDbRowContext; |
355 | | class ScopedDocDbRowContextWithData; |
356 | | |
357 | | // This class represents a collection of ScopedDocDbRowContext instances corresponding to a DocDB |
358 | | // collection. The user of this class can optionally call SetFirstChild on an owned |
359 | | // ScopedDocDbRowContext in case it has already read the first row of this collection, and that will |
360 | | // be returned before reading subsequent rows in the collection. Note this may only be done before |
361 | | // the ScopedDocDbCollectionContext instance has read any other rows. |
362 | | class ScopedDocDbCollectionContext { |
363 | | public: |
364 | | explicit ScopedDocDbCollectionContext(ScopedDocDbRowContext* parent); |
365 | | |
366 | | CHECKED_STATUS SetFirstChild(std::unique_ptr<DocDbRowData> first_row); |
367 | | |
368 | | Result<ScopedDocDbRowContextWithData*> GetNextChild(); |
369 | | |
370 | | private: |
371 | | void SetNextChild(std::unique_ptr<DocDbRowData> child_row); |
372 | | |
373 | | ScopedDocDbRowContext* const parent_ = nullptr; |
374 | | std::unique_ptr<ScopedDocDbRowContextWithData> current_child_ = nullptr; |
375 | | ScopedDocDbRowContextWithData* next_child_ = nullptr; |
376 | | }; |
377 | | |
378 | | // This class encapsulates all relevant context for reading the RocksDB state corresponding to a |
379 | | // particular key and constructing a SubDocument instance which reflects that state. This context is |
380 | | // used by control-flow functions at the bottom of this file. The context for a key also |
381 | | // encapsulates a mechanism to collect the context for any children of the same key via |
382 | | // ScopedDocDbCollectionContext, which is similarly used by control-flow functions at the bottom of |
383 | | // this file. |
384 | | class ScopedDocDbRowContext { |
385 | | public: |
386 | | ScopedDocDbRowContext( |
387 | | IntentAwareIterator* iter, |
388 | | DeadlineInfo* deadline_info, |
389 | | Slice key, |
390 | | SubDocument* assembly_target, |
391 | | ObsolescenceTracker obsolescence_tracker); |
392 | | |
393 | | ScopedDocDbRowContext( |
394 | | IntentAwareIterator* iter, |
395 | | DeadlineInfo* deadline_info, |
396 | | Slice key, |
397 | | DocDbRowAssembler* ancestor_assembler, |
398 | | ObsolescenceTracker obsolescence_tracker); |
399 | | |
400 | 942M | DocDbRowAssembler* mutable_assembler() { return &assembler_; } |
401 | | |
402 | 745M | const ObsolescenceTracker* obsolescence_tracker() const { return &obsolescence_tracker_; } |
403 | | |
404 | | ScopedDocDbCollectionContext* collection(); |
405 | | |
406 | | CHECKED_STATUS CheckDeadline(); |
407 | | |
408 | | protected: |
409 | | IntentAwareIterator* const iter_; |
410 | | DeadlineInfo* const deadline_info_; |
411 | | const Slice key_; |
412 | | const IntentAwareIteratorPrefixScope prefix_scope_; |
413 | | DocDbRowAssembler assembler_; |
414 | | ObsolescenceTracker obsolescence_tracker_; |
415 | | boost::optional<ScopedDocDbCollectionContext> collection_ = boost::none; |
416 | | |
417 | | private: |
418 | | friend class ScopedDocDbCollectionContext; |
419 | | |
420 | | DISALLOW_COPY_AND_ASSIGN(ScopedDocDbRowContext); |
421 | | }; |
422 | | |
423 | | ScopedDocDbRowContext::ScopedDocDbRowContext( |
424 | | IntentAwareIterator* iter, |
425 | | DeadlineInfo* deadline_info, |
426 | | Slice key, |
427 | | SubDocument* assembly_target, |
428 | | ObsolescenceTracker obsolescence_tracker): |
429 | | iter_(iter), |
430 | | deadline_info_(deadline_info), |
431 | | key_(key), |
432 | | prefix_scope_(key_, iter_), |
433 | | assembler_(assembly_target, key_), |
434 | 695M | obsolescence_tracker_(obsolescence_tracker) {} |
435 | | |
436 | | ScopedDocDbRowContext::ScopedDocDbRowContext( |
437 | | IntentAwareIterator* iter, |
438 | | DeadlineInfo* deadline_info, |
439 | | Slice key, |
440 | | DocDbRowAssembler* ancestor_assembler, |
441 | | ObsolescenceTracker obsolescence_tracker): |
442 | | iter_(iter), |
443 | | deadline_info_(deadline_info), |
444 | | key_(key), |
445 | | prefix_scope_(key_, iter_), |
446 | | assembler_(ancestor_assembler, key_), |
447 | 51.1M | obsolescence_tracker_(obsolescence_tracker) {} |
448 | | |
449 | 94.5M | ScopedDocDbCollectionContext* ScopedDocDbRowContext::collection() { |
450 | 94.5M | if (collection_ == boost::none94.5M ) { |
451 | 94.5M | iter_->SeekPastSubKey(key_); |
452 | 94.5M | collection_.emplace(this); |
453 | 94.5M | } |
454 | 94.5M | return &*collection_; |
455 | 94.5M | } |
456 | | |
457 | 749M | Status ScopedDocDbRowContext::CheckDeadline() { |
458 | 749M | if (deadline_info_749M && deadline_info_->CheckAndSetDeadlinePassed()) { |
459 | 0 | return STATUS(Expired, "Deadline for query passed."); |
460 | 0 | } |
461 | 749M | return Status::OK(); |
462 | 749M | } |
463 | | |
464 | | class ScopedDocDbRowContextWithData : public ScopedDocDbRowContext { |
465 | | public: |
466 | | ScopedDocDbRowContextWithData( |
467 | | std::unique_ptr<DocDbRowData> row, |
468 | | IntentAwareIterator* iter, |
469 | | DeadlineInfo* deadline_info, |
470 | | SubDocument* assembly_target, |
471 | | ObsolescenceTracker ancestor_obsolescence_tracker); |
472 | | |
473 | | ScopedDocDbRowContextWithData( |
474 | | std::unique_ptr<DocDbRowData> row, |
475 | | IntentAwareIterator* iter, |
476 | | DeadlineInfo* deadline_info, |
477 | | DocDbRowAssembler* ancestor_assembler, |
478 | | ObsolescenceTracker ancestor_obsolescence_tracker); |
479 | | |
480 | 793M | DocDbRowData* data() const { return data_.get(); } |
481 | | |
482 | | void SeekOutOfPrefix(); |
483 | | |
484 | | private: |
485 | | std::unique_ptr<DocDbRowData> data_; |
486 | | |
487 | | DISALLOW_COPY_AND_ASSIGN(ScopedDocDbRowContextWithData); |
488 | | }; |
489 | | |
490 | | ScopedDocDbRowContextWithData::ScopedDocDbRowContextWithData( |
491 | | std::unique_ptr<DocDbRowData> row, |
492 | | IntentAwareIterator* iter, |
493 | | DeadlineInfo* deadline_info, |
494 | | SubDocument* assembly_target, |
495 | | ObsolescenceTracker ancestor_obsolescence_tracker): |
496 | | ScopedDocDbRowContext( |
497 | | iter, deadline_info, row->key(), assembly_target, |
498 | | ancestor_obsolescence_tracker.Child(row->write_time(), row->value().ttl())), |
499 | 694M | data_(std::move(row)) {} |
500 | | |
501 | | ScopedDocDbRowContextWithData::ScopedDocDbRowContextWithData( |
502 | | std::unique_ptr<DocDbRowData> row, |
503 | | IntentAwareIterator* iter, |
504 | | DeadlineInfo* deadline_info, |
505 | | DocDbRowAssembler* ancestor_assembler, |
506 | | ObsolescenceTracker ancestor_obsolescence_tracker): |
507 | | ScopedDocDbRowContext( |
508 | | iter, deadline_info, row->key(), ancestor_assembler, |
509 | | ancestor_obsolescence_tracker.Child(row->write_time(), row->value().ttl())), |
510 | 51.1M | data_(std::move(row)) {} |
511 | | |
512 | 51.1M | void ScopedDocDbRowContextWithData::SeekOutOfPrefix() { |
513 | 51.1M | iter_->SeekOutOfSubDoc(data_->key()); |
514 | 51.1M | } |
515 | | |
516 | | ScopedDocDbCollectionContext::ScopedDocDbCollectionContext(ScopedDocDbRowContext* parent): |
517 | 94.6M | parent_(parent) {} |
518 | | |
519 | 1.26k | Status ScopedDocDbCollectionContext::SetFirstChild(std::unique_ptr<DocDbRowData> first_row) { |
520 | 1.26k | if (next_child_) { |
521 | 0 | return STATUS(IllegalState, "Cannot set first_child if already set."); |
522 | 0 | } |
523 | 1.26k | if (current_child_) { |
524 | 0 | return STATUS(IllegalState, "Cannot set first_child if a child has already been read."); |
525 | 0 | } |
526 | 1.26k | SetNextChild(std::move(first_row)); |
527 | 1.26k | next_child_ = current_child_.get(); |
528 | 1.26k | return Status::OK(); |
529 | 1.26k | } |
530 | | |
531 | 145M | Result<ScopedDocDbRowContextWithData*> ScopedDocDbCollectionContext::GetNextChild() { |
532 | 145M | if (next_child_) { |
533 | | // If there is a next_child_, then we've already stored a row which we read before. Serve that, |
534 | | // and reset it to resume normal operation next time. |
535 | 1.26k | next_child_ = nullptr; |
536 | 145M | } else { |
537 | 145M | if (current_child_) { |
538 | | // current_child_ points to the row we served last, which we are now done with, so we should |
539 | | // seek out of the scope of it and reset its state before proceeding to read the next row. |
540 | | // Note -- we currently seek away from the previous row only when a new row is requested by |
541 | | // the caller. It might be better from a perf perspective to do this before instead. |
542 | 51.1M | current_child_->SeekOutOfPrefix(); |
543 | | |
544 | | // Reset current child to eliminate the IntentAwareIteratorPrefixScope it was holding. |
545 | 51.1M | current_child_.reset(); |
546 | 51.1M | } |
547 | 145M | if (parent_->iter_->valid()) { |
548 | 51.1M | SetNextChild(VERIFY_RESULT(DocDbRowData::CurrentRow(parent_->iter_))); |
549 | 51.1M | } |
550 | 145M | } |
551 | 145M | DCHECK( |
552 | 32.6k | !current_child_ || |
553 | 32.6k | !parent_ || |
554 | 32.6k | current_child_->data()->key().AsSlice().starts_with(parent_->key_)) |
555 | 32.6k | << "Child key " << SubDocKey::DebugSliceToString(current_child_->data()->key().AsSlice()) |
556 | 32.6k | << " does not include parent key " << SubDocKey::DebugSliceToString(parent_->key_) |
557 | 32.6k | << " as prefix."; |
558 | 145M | return current_child_.get(); |
559 | 145M | } |
560 | | |
561 | 51.2M | void ScopedDocDbCollectionContext::SetNextChild(std::unique_ptr<DocDbRowData> child_row) { |
562 | 51.2M | current_child_ = std::make_unique<ScopedDocDbRowContextWithData>( |
563 | 51.2M | std::move(child_row), parent_->iter_, parent_->deadline_info_, parent_->mutable_assembler(), |
564 | 51.2M | parent_->obsolescence_tracker_); |
565 | 51.2M | } |
566 | | |
567 | | CHECKED_STATUS ProcessSubDocument(ScopedDocDbRowContextWithData* scope); |
568 | | |
569 | 94.6M | Result<uint32_t> ProcessChildren(ScopedDocDbCollectionContext* collection) { |
570 | 94.6M | uint32_t num_children = 0; |
571 | 94.6M | if (collection94.6M ) { |
572 | 145M | while (ScopedDocDbRowContextWithData* child = VERIFY_RESULT(collection->GetNextChild())) { |
573 | 51.1M | RETURN_NOT_OK(ProcessSubDocument(child)); |
574 | 51.1M | if (VERIFY_RESULT(child->mutable_assembler()->HasStoredValue())) { |
575 | 6.07M | ++num_children; |
576 | 6.07M | } |
577 | 51.1M | } |
578 | 94.6M | } |
579 | 94.6M | return num_children; |
580 | 94.6M | } |
581 | | |
582 | 4.17M | Status ProcessCollection(ScopedDocDbRowContextWithData* scope) { |
583 | | // Set this row to an empty collection since it is alive/valid before processing its children. |
584 | 4.17M | RETURN_NOT_OK(scope->mutable_assembler()->SetEmptyCollection()); |
585 | 4.17M | RETURN_NOT_OK(ProcessChildren(scope->collection())); |
586 | 4.17M | return Status::OK(); |
587 | 4.17M | } |
588 | | |
589 | 90.2M | Status MaybeReviveCollection(ScopedDocDbRowContextWithData* scope) { |
590 | 90.2M | auto num_children = VERIFY_RESULT(ProcessChildren(scope->collection())); |
591 | 90.4M | if (num_children == 090.2M ) { |
592 | 90.4M | return scope->mutable_assembler()->SetTombstone(); |
593 | 90.4M | } |
594 | 18.4E | return Status::OK(); |
595 | 90.2M | } |
596 | | |
597 | 749M | Status ProcessSubDocument(ScopedDocDbRowContextWithData* scope) { |
598 | 749M | RETURN_NOT_OK(scope->CheckDeadline()); |
599 | | |
600 | 749M | auto data = scope->data(); |
601 | 749M | auto assembler = scope->mutable_assembler(); |
602 | 749M | auto obsolescence_tracker = scope->obsolescence_tracker(); |
603 | | |
604 | 749M | if (data->IsTombstone() || obsolescence_tracker->IsObsolete(data->write_time())674M ) { |
605 | 127M | if (data->IsPrimitiveValue()) { |
606 | 18.4E | VLOG(4) << "Discarding overwritten or expired primitive value"; |
607 | 37.0M | return assembler->SetTombstone(); |
608 | 37.0M | } |
609 | | // If the latest written value is a tombstone or the record is expired at a top level, only |
610 | | // surface a subdocument if it has a valid (unexpired, non-tombstoned) child which overwrites |
611 | | // this record. Note: these semantics are only relevant to CQL reads. |
612 | 90.4M | return MaybeReviveCollection(scope); |
613 | 127M | } |
614 | | |
615 | 622M | if (data->IsCollection()) { |
616 | 4.17M | return ProcessCollection(scope); |
617 | 4.17M | } |
618 | | |
619 | 617M | if (data->IsPrimitiveValue()) { |
620 | 614M | auto ttl_opt = obsolescence_tracker->GetTtlRemainingSeconds(data->write_time().hybrid_time()); |
621 | 614M | if (ttl_opt) { |
622 | 65.2M | data->mutable_primitive_value()->SetTtl(*ttl_opt); |
623 | 65.2M | } |
624 | 614M | return assembler->SetPrimitiveValue(data); |
625 | 614M | } |
626 | | |
627 | 3.69M | return STATUS_FORMAT( |
628 | 617M | Corruption, |
629 | 617M | "Expected primitive value type, collection, or tobmstone. Got $0", |
630 | 617M | data->value().value_type()); |
631 | 617M | } |
632 | | |
633 | | } // namespace |
634 | | |
635 | | SubDocumentReader::SubDocumentReader( |
636 | | const KeyBytes& target_subdocument_key, |
637 | | IntentAwareIterator* iter, |
638 | | DeadlineInfo* deadline_info, |
639 | | const ObsolescenceTracker& ancestor_obsolescence_tracker): |
640 | | target_subdocument_key_(target_subdocument_key), iter_(iter), deadline_info_(deadline_info), |
641 | 719M | ancestor_obsolescence_tracker_(ancestor_obsolescence_tracker) {} |
642 | | |
643 | 722M | Status SubDocumentReader::Get(SubDocument* result) { |
644 | 722M | IntentAwareIteratorPrefixScope target_scope(target_subdocument_key_, iter_); |
645 | 722M | if (!iter_->valid()) { |
646 | 25.8M | *result = SubDocument(ValueType::kInvalid); |
647 | 25.8M | return Status::OK(); |
648 | 25.8M | } |
649 | 696M | auto first_row = VERIFY_RESULT(DocDbRowData::CurrentRow(iter_)); |
650 | 0 | auto current_key = first_row->key(); |
651 | | |
652 | 696M | if (current_key == target_subdocument_key_) { |
653 | 696M | ScopedDocDbRowContextWithData context( |
654 | 696M | std::move(first_row), iter_, deadline_info_, result, ancestor_obsolescence_tracker_); |
655 | 696M | return ProcessSubDocument(&context); |
656 | 696M | } |
657 | | // If the currently-pointed-to key is not equal to our target, but we are still in a valid state, |
658 | | // then that key must have the target key as a prefix, meaning we are pointing to a child of our |
659 | | // target. We should therefore process the rows as if we're already in a collection, rooted at the |
660 | | // target key. |
661 | 562k | ScopedDocDbRowContext context( |
662 | 562k | iter_, deadline_info_, target_subdocument_key_, result, ancestor_obsolescence_tracker_); |
663 | 562k | ScopedDocDbCollectionContext collection(&context); |
664 | 562k | RETURN_NOT_OK(collection.SetFirstChild(std::move(first_row))); |
665 | 562k | auto num_children = VERIFY_RESULT(ProcessChildren(&collection)); |
666 | 562k | if (num_children == 0) { |
667 | 1.02k | *result = SubDocument(ValueType::kTombstone); |
668 | 1.02k | } |
669 | 562k | return Status::OK(); |
670 | 562k | } |
671 | | |
672 | | SubDocumentReaderBuilder::SubDocumentReaderBuilder( |
673 | | IntentAwareIterator* iter, DeadlineInfo* deadline_info) |
674 | 14.2M | : iter_(iter), deadline_info_(deadline_info) {} |
675 | | |
676 | | Result<std::unique_ptr<SubDocumentReader>> SubDocumentReaderBuilder::Build( |
677 | 721M | const KeyBytes& sub_doc_key) { |
678 | 721M | return std::make_unique<SubDocumentReader>( |
679 | 721M | sub_doc_key, iter_, deadline_info_, parent_obsolescence_tracker_); |
680 | 721M | } |
681 | | |
682 | | Status SubDocumentReaderBuilder::InitObsolescenceInfo( |
683 | | const ObsolescenceTracker& table_obsolescence_tracker, |
684 | 83.6M | const Slice& root_doc_key, const Slice& target_subdocument_key) { |
685 | 83.6M | parent_obsolescence_tracker_ = table_obsolescence_tracker; |
686 | | |
687 | | // Look at ancestors to collect ttl/write-time metadata. |
688 | 83.6M | IntentAwareIteratorPrefixScope prefix_scope(root_doc_key, iter_); |
689 | 83.6M | Slice temp_key = target_subdocument_key; |
690 | 83.6M | Slice prev_iter_key = temp_key.Prefix(root_doc_key.size()); |
691 | 83.6M | temp_key.remove_prefix(root_doc_key.size()); |
692 | 83.8M | for (;;) { |
693 | | // for each iteration of this loop, we consume another piece of the subdoc key path |
694 | 83.8M | auto decode_result = VERIFY_RESULT(SubDocKey::DecodeSubkey(&temp_key)); |
695 | 83.8M | if (!decode_result) { |
696 | | // Stop once temp_key has consumed all subdoc keys and UpdateWithParentWriteInfo has been |
697 | | // called for every key prefix except the full target key. |
698 | 83.5M | break; |
699 | 83.5M | } |
700 | 295k | RETURN_NOT_OK(UpdateWithParentWriteInfo(prev_iter_key)); |
701 | 295k | prev_iter_key = Slice(prev_iter_key.data(), temp_key.data() - prev_iter_key.data()); |
702 | 295k | } |
703 | 83.6M | DCHECK_EQ(prev_iter_key, target_subdocument_key); |
704 | 83.6M | return UpdateWithParentWriteInfo(target_subdocument_key); |
705 | 83.6M | } |
706 | | |
707 | | Status SubDocumentReaderBuilder::UpdateWithParentWriteInfo( |
708 | 83.7M | const Slice& parent_key_without_ht) { |
709 | 83.7M | Slice value; |
710 | 83.7M | DocHybridTime doc_ht = parent_obsolescence_tracker_.GetHighWriteTime(); |
711 | 83.7M | RETURN_NOT_OK(iter_->FindLatestRecord(parent_key_without_ht, &doc_ht, &value)); |
712 | | |
713 | 83.7M | if (!iter_->valid()) { |
714 | 905k | return Status::OK(); |
715 | 905k | } |
716 | | |
717 | 82.8M | parent_obsolescence_tracker_ = parent_obsolescence_tracker_.Child(doc_ht); |
718 | 82.8M | return Status::OK(); |
719 | 83.7M | } |
720 | | |
721 | | } // namespace docdb |
722 | | } // namespace yb |