YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/docdb/subdoc_reader.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include "yb/docdb/subdoc_reader.h"
15
16
#include <cstddef>
17
#include <memory>
18
#include <string>
19
#include <vector>
20
21
#include <glog/logging.h>
22
23
#include "yb/common/doc_hybrid_time.h"
24
#include "yb/common/hybrid_time.h"
25
#include "yb/common/transaction.h"
26
#include "yb/common/typedefs.h"
27
28
#include "yb/docdb/deadline_info.h"
29
#include "yb/docdb/doc_key.h"
30
#include "yb/docdb/doc_ttl_util.h"
31
#include "yb/docdb/expiration.h"
32
#include "yb/docdb/intent_aware_iterator.h"
33
#include "yb/docdb/key_bytes.h"
34
#include "yb/docdb/primitive_value.h"
35
#include "yb/docdb/subdocument.h"
36
#include "yb/docdb/value.h"
37
#include "yb/docdb/value_type.h"
38
39
#include "yb/gutil/integral_types.h"
40
#include "yb/gutil/macros.h"
41
42
#include "yb/server/hybrid_clock.h"
43
44
#include "yb/util/monotime.h"
45
#include "yb/util/result.h"
46
#include "yb/util/status.h"
47
#include "yb/util/status_format.h"
48
49
using std::vector;
50
51
using yb::HybridTime;
52
53
namespace yb {
54
namespace docdb {
55
56
namespace {
57
58
// Derives the expiration that applies to a record written at new_write_time with the given ttl,
// starting from the expiration inherited from its parent.
//
// - If the record is at least as new as the inherited write_ht and carries an explicit TTL (i.e.
//   ttl is not ValueControlFields::kMaxTtl), the record's own write time and TTL take over.
// - If the record is at least as new but carries kMaxTtl, the inherited TTL is kept, except that a
//   negative inherited TTL is negated.
// - Finally, an inherited write_ht of HybridTime::kMin is replaced with the record's write time.
Expiration GetNewExpiration(
    const Expiration& parent_exp, const MonoDelta& ttl,
    const DocHybridTime& new_write_time) {
  const auto record_ht = new_write_time.hybrid_time();
  Expiration result = parent_exp;

  const bool not_older_than_parent = record_ht >= result.write_ht;
  if (not_older_than_parent && ttl != ValueControlFields::kMaxTtl) {
    // The record overrides the TTL for itself.
    result.write_ht = record_ht;
    result.ttl = ttl;
  } else if (not_older_than_parent && result.ttl.IsNegative()) {
    // Keep the inherited (default) TTL, but with a non-negative sign.
    result.ttl = -result.ttl;
  }

  // A write_ht of kMin means that the default TTL is in use; anchor it at this record.
  if (result.write_ht == HybridTime::kMin) {
    result.write_ht = record_ht;
  }
  return result;
}
80
81
} // namespace
82
83
// Creates a tracker that carries only an overwrite watermark. This constructor does not set
// read_time_ or expiration_.
ObsolescenceTracker::ObsolescenceTracker(DocHybridTime write_time_watermark)
    : write_time_watermark_(write_time_watermark) {}
85
86
// Creates a tracker that carries an overwrite watermark together with the read time and the
// expiration used for TTL checks.
ObsolescenceTracker::ObsolescenceTracker(
    const ReadHybridTime& read_time, DocHybridTime write_time_watermark, Expiration expiration)
    : write_time_watermark_(write_time_watermark),
      read_time_(read_time),
      expiration_(expiration) {}
89
90
91.3M
// Returns the write-time watermark held by this tracker.
const DocHybridTime& ObsolescenceTracker::GetHighWriteTime() {
  return write_time_watermark_;
}
91
92
674M
bool ObsolescenceTracker::IsObsolete(const DocHybridTime& write_time) const {
93
674M
  if (expiration_.has_value()) {
94
65.4M
    DCHECK(read_time_.has_value());
95
65.4M
    if (HasExpiredTTL(expiration_.value().write_ht, expiration_.value().ttl,
96
65.4M
                      read_time_.value().read)) {
97
17.7k
      return true;
98
17.7k
    }
99
65.4M
  }
100
674M
  return write_time < write_time_watermark_;
101
674M
}
102
103
// Builds the tracker for a record written at write_time with the given ttl. The child's watermark
// is the larger of write_time and this tracker's watermark. When this tracker carries an
// expiration, the child's expiration is derived through GetNewExpiration; otherwise the child
// carries a watermark only.
ObsolescenceTracker ObsolescenceTracker::Child(
    const DocHybridTime& write_time, const MonoDelta& ttl) const {
  const auto child_watermark = std::max(write_time, write_time_watermark_);
  if (!expiration_.has_value()) {
    return ObsolescenceTracker(child_watermark);
  }
  DCHECK(read_time_.has_value());
  return ObsolescenceTracker(
      read_time_.value(), child_watermark,
      GetNewExpiration(expiration_.value(), ttl, write_time));
}
114
115
82.8M
// Builds the tracker for a record written at write_time, without changing TTL information. The
// child's watermark is the larger of write_time and this tracker's watermark; if this tracker
// carries an expiration, it is passed on to the child unchanged.
ObsolescenceTracker ObsolescenceTracker::Child(const DocHybridTime& write_time) const {
  auto new_write_time_watermark = std::max(write_time, write_time_watermark_);
  if (expiration_.has_value()) {
    // Same invariant as the (write_time, ttl) overload: an expiration is only meaningful together
    // with a read time, and read_time_ is dereferenced right below.
    DCHECK(read_time_.has_value());
    return ObsolescenceTracker(
        read_time_.value(), new_write_time_watermark, expiration_.value());
  }
  return ObsolescenceTracker(new_write_time_watermark);
}
122
123
// Computes how many whole seconds of TTL remain, as of the read time, for a value whose TTL clock
// started at ttl_write_time.
//
// Returns:
//   - boost::none when this tracker carries no expiration;
//   - the literal -1 (converted to the unsigned return type) when the TTL is
//     ValueControlFields::kMaxTtl;
//   - otherwise the remaining seconds, clamped at zero.
boost::optional<uint64_t> ObsolescenceTracker::GetTtlRemainingSeconds(
    const HybridTime& ttl_write_time) const {
  if (!expiration_.has_value()) {
    return boost::none;
  }

  DCHECK(read_time_.has_value());

  const auto& ttl = expiration_.value().ttl;
  if (ttl == ValueControlFields::kMaxTtl) {
    return -1;
  }

  const auto read_micros = server::HybridClock::GetPhysicalValueMicros(read_time_.value().read);
  const auto write_micros = server::HybridClock::GetPhysicalValueMicros(ttl_write_time);
  const int64_t elapsed_seconds =
      (read_micros - write_micros) / MonoTime::kMicrosecondsPerSecond;
  const int64_t ttl_seconds = ttl.ToMilliseconds() / MonoTime::kMillisecondsPerSecond;
  const int64_t remaining_seconds = ttl_seconds - elapsed_seconds;
  return std::max<int64_t>(0, remaining_seconds);
}
145
146
namespace {
147
148
// This class wraps access to a SubDocument instance specified in one of two ways:
149
// (1) -- From a pointer to an already existing instance of a SubDocument
150
// (2) -- From a pointer to an existing instance of LazySubDocumentHolder which represents the
151
//        "parent" of the SubDocument accessed via this LazySubDocumentHolder.
152
//
153
// Using this class, a SubDocument which is *specified* via (2) above will not be constructed unless
154
// it is accessed, either directly or via a child LazySubDocumentHolder instance. This class allows
155
// us to provide a unified interface to the reading/constructing code which may determine if a
156
// particular SubDocument is live based on deferred knowledge of whether any of it's children are
157
// live.
158
class LazySubDocumentHolder {
159
 public:
160
  // Specifies the SubDocument "target" provided. The pointer must be valid for the lifetime of this
161
  // instance. The provided key Slice must be valid and the underlying data should not change during
162
  // the lifetime of this instance.
163
697M
  LazySubDocumentHolder(SubDocument* target, Slice key) : target_(target), key_(key) {
164
697M
    DCHECK(target_);
165
697M
  }
166
167
  // Specifies a SubDocument with key "key" lazily constructed from "parent". Provided parent must
168
  // be valid and remain at the same address for the lifetime of this instance. The provided key
169
  // Slice must be valid and the underlying data should not change during the lifetime of this
170
  // instance.
171
51.1M
  LazySubDocumentHolder(LazySubDocumentHolder* parent, Slice key) : parent_(parent), key_(key) {
172
51.1M
    DCHECK(parent_);
173
51.1M
  }
174
175
  // Returns true if the SubDocument specified by this instance exists.
176
178M
  bool IsConstructed() const { return target_; }
177
178
  // Get a pointer to the SubDocument specified by this instance. Construct that SubDocument first
179
  // if it has not yet been constructed.
180
  Result<SubDocument*> Get();
181
182
 private:
183
  // The constructed SubDocument specified by this instance.
184
  SubDocument* target_ = nullptr;
185
186
  // A pointer to the parent_ of the specified SubDocument. This must be non-null unless this
187
  // instance was constructed via a concrete SubDocument*.
188
  LazySubDocumentHolder* const parent_ = nullptr;
189
190
  // The key of the specified SubDocument.
191
  Slice key_;
192
};
193
194
711M
// Returns the SubDocument specified by this holder. If it has not been constructed yet, it is
// created inside the parent's SubDocument by decoding every key part of key_ that follows the
// parent's key and descending through GetOrAddChild for each part. The result is cached in
// target_, so later calls return immediately.
//
// Returns a non-OK status if the parent cannot be obtained or a key part fails to decode.
Result<SubDocument*> LazySubDocumentHolder::Get() {
  if (target_) {
    return target_;
  }

  // Presumably, the parent_ key is a prefix of key_, otherwise it's not a valid parent.
  DCHECK(key_.starts_with(parent_->key_))
      << "Attempting to construct SubDocument for key: " << SubDocKey::DebugSliceToString(key_)
      << " from parent whose key: " << SubDocKey::DebugSliceToString(parent_->key_)
      << " is not a prefix of child key";

  // The SubDocument reached below is owned by, and borrowed from, parent_.
  SubDocument* current = VERIFY_RESULT(parent_->Get());
  Slice remaining = key_;
  remaining.remove_prefix(parent_->key_.size());

  // At least one key part is always decoded, even when nothing remains after the parent's key; in
  // that case the decode call decides the outcome, exactly as before.
  do {
    PrimitiveValue child_key_part;
    RETURN_NOT_OK(child_key_part.DecodeFromKey(&remaining));
    current = current->GetOrAddChild(child_key_part).first;
  } while (!remaining.empty());

  target_ = current;
  return target_;
}
226
227
// This class provides a wrapper to access data corresponding to a RocksDB row.
228
class DocDbRowData {
229
 public:
230
  DocDbRowData(const Slice& key, const DocHybridTime& write_time, Value&& value);
231
232
  static Result<std::unique_ptr<DocDbRowData>> CurrentRow(IntentAwareIterator* iter);
233
234
1.53G
  const KeyBytes& key() const { return target_key_; }
235
236
1.35G
  const Value& value() const { return value_; }
237
238
2.63G
  const DocHybridTime& write_time() const { return write_time_; }
239
240
746M
  bool IsTombstone() const { return value_.value_type() == ValueType::kTombstone; }
241
242
619M
  bool IsCollection() const { return IsCollectionType(value_.value_type()); }
243
244
741M
  bool IsPrimitiveValue() const { return IsPrimitiveValueType(value_.value_type()); }
245
246
679M
  PrimitiveValue* mutable_primitive_value() { return value_.mutable_primitive_value(); }
247
248
 private:
249
  const KeyBytes target_key_;
250
  const DocHybridTime write_time_;
251
  Value value_;
252
253
  DISALLOW_COPY_AND_ASSIGN(DocDbRowData);
254
};
255
256
// Stores a copy of the key bytes and of the write time, and takes over the decoded value.
//
// key and write_time are const references, so they are copied: std::move on them would still
// select the copying overloads and only suggest a move that never happens. Only value, which is
// an rvalue reference, is actually moved.
DocDbRowData::DocDbRowData(
    const Slice& key, const DocHybridTime& write_time, Value&& value)
    : target_key_(key), write_time_(write_time), value_(std::move(value)) {}
259
260
746M
// Builds a DocDbRowData from the entry the iterator currently points to: the key is fetched from
// the iterator, then the value is decoded from the iterator's current value.
//
// Returns a non-OK status if fetching the key or decoding the value fails, and a Corruption
// status if the entry's write time is DocHybridTime::kMin.
Result<std::unique_ptr<DocDbRowData>> DocDbRowData::CurrentRow(IntentAwareIterator* iter) {
  auto fetched = VERIFY_RESULT(iter->FetchKey());
  DCHECK(fetched.same_transaction ||
      iter->read_time().global_limit >= fetched.write_time.hybrid_time())
      << "Bad key: " << SubDocKey::DebugSliceToString(fetched.key)
      << ", global limit: " << iter->read_time().global_limit
      << ", write time: " << fetched.write_time.hybrid_time();

  // TODO -- we could optimize by decoding directly into a SubDocument instance on the heap which
  // could be later bound to our result SubDocument. This could work if e.g. Value could be
  // initialized with a PrimitiveValue*.
  Value decoded;
  RETURN_NOT_OK(decoded.Decode(iter->value()));

  const bool missing_timestamp = fetched.write_time == DocHybridTime::kMin;
  if (missing_timestamp) {
    return STATUS(Corruption, "No hybrid timestamp found on entry");
  }

  return std::make_unique<DocDbRowData>(fetched.key, fetched.write_time, std::move(decoded));
}
279
280
// This class provides a convenience handle for modifying a SubDocument specified by a provided
281
// LazySubDocumentHolder. Importantly, it is responsible for the semantics of when a
282
// LazySubDocumentHolder should be realized. Notably, it does *not* construct the specified
283
// SubDocument if it is instructed to store a tombstone, and it will only modify the SubDocument
284
// value if it already has been constructed.
285
class DocDbRowAssembler {
286
 public:
287
694M
  DocDbRowAssembler(SubDocument* target, Slice key) : root_(target, key) {}
288
289
  DocDbRowAssembler(DocDbRowAssembler* parent_assembler, Slice key):
290
51.1M
      root_(&parent_assembler->root_, key) {}
291
292
  CHECKED_STATUS SetEmptyCollection();
293
294
  CHECKED_STATUS SetTombstone();
295
296
  CHECKED_STATUS SetPrimitiveValue(DocDbRowData* row);
297
298
  Result<bool> HasStoredValue();
299
300
 private:
301
  LazySubDocumentHolder root_;
302
  bool is_tombstoned_ = false;
303
304
  DISALLOW_COPY_AND_ASSIGN(DocDbRowAssembler);
305
};
306
307
4.17M
// Realizes the target SubDocument (constructing it if needed) and overwrites it with a
// default-constructed SubDocument.
Status DocDbRowAssembler::SetEmptyCollection() {
  SubDocument* const destination = VERIFY_RESULT(root_.Get());
  *destination = SubDocument();
  return Status::OK();
}
312
313
127M
// Overwrites the target SubDocument with a tombstone, but only if it has already been constructed.
// A SubDocument that was never constructed is deliberately left that way: there is no point in
// creating a child under its parent just to mark it as deleted.
Status DocDbRowAssembler::SetTombstone() {
  if (root_.IsConstructed()) {
    SubDocument* const destination = VERIFY_RESULT(root_.Get());
    *destination = SubDocument(ValueType::kTombstone);
    is_tombstoned_ = true;
  }
  return Status::OK();
}
324
325
613M
// Realizes the target SubDocument and stores the row's primitive value into it. Before the copy,
// the write time of the row's primitive value is set to the row's user timestamp when one is
// present, and to the physical microseconds of the row's write time otherwise.
//
// TODO -- taking a non-const row pointer is not ideal: it lets the DocDbRowAssembler modify the
// DocDbRowData's state. In the future it might make more sense to have ScopedDocDbRowContext
// coordinate these two classes.
// TODO -- the row's mutable primitive value is modified and then copied into the SubDocument.
// This should be made more efficient, especially since it is on the critical path of all reads.
Status DocDbRowAssembler::SetPrimitiveValue(DocDbRowData* row) {
  SubDocument* const destination = VERIFY_RESULT(root_.Get());
  PrimitiveValue* const primitive = row->mutable_primitive_value();

  const Value& row_value = row->value();
  if (!row_value.has_user_timestamp()) {
    primitive->SetWriteTime(row->write_time().hybrid_time().GetPhysicalValueMicros());
  } else {
    primitive->SetWriteTime(row_value.user_timestamp());
  }

  *destination = SubDocument(*primitive);
  return Status::OK();
}
344
345
51.1M
// Returns true if the target SubDocument has been constructed and its value type is neither
// kInvalid nor kTombstone. A SubDocument that was never constructed counts as "no stored value"
// and is not constructed by this call.
Result<bool> DocDbRowAssembler::HasStoredValue() {
  if (!root_.IsConstructed()) {
    return false;
  }
  SubDocument* const stored = VERIFY_RESULT(root_.Get());
  const auto stored_type = stored->value_type();
  const bool is_live =
      stored_type != ValueType::kInvalid && stored_type != ValueType::kTombstone;
  return is_live;
}
353
354
class ScopedDocDbRowContext;
355
class ScopedDocDbRowContextWithData;
356
357
// This class represents a collection of ScopedDocDbRowContext instances corresponding to a DocDB
358
// collection. The user of this class can optionally call SetFirstChild on an owned
359
// ScopedDocDbRowContext in case it has already read the first row of this collection, and that will
360
// be returned before reading subsequent rows in the collection. Note this may only be done before
361
// the ScopedDocDbCollectionContext instance has read any other rows.
362
class ScopedDocDbCollectionContext {
363
 public:
364
  explicit ScopedDocDbCollectionContext(ScopedDocDbRowContext* parent);
365
366
  CHECKED_STATUS SetFirstChild(std::unique_ptr<DocDbRowData> first_row);
367
368
  Result<ScopedDocDbRowContextWithData*> GetNextChild();
369
370
 private:
371
  void SetNextChild(std::unique_ptr<DocDbRowData> child_row);
372
373
  ScopedDocDbRowContext* const parent_ = nullptr;
374
  std::unique_ptr<ScopedDocDbRowContextWithData> current_child_ = nullptr;
375
  ScopedDocDbRowContextWithData* next_child_ = nullptr;
376
};
377
378
// This class encapsulates all relevant context for reading the RocksDB state corresponding to a
379
// particular key and constructing a SubDocument instance which reflects that state. This context is
380
// used by control-flow functions at the bottom of this file. The context for a key also
381
// encapsulates a mechanism to collect the context for any children of the same key via
382
// ScopedDocDbCollectionContext, which is similarly used by control-flow functions at the bottom of
383
// this file.
384
class ScopedDocDbRowContext {
385
 public:
386
  ScopedDocDbRowContext(
387
      IntentAwareIterator* iter,
388
      DeadlineInfo* deadline_info,
389
      Slice key,
390
      SubDocument* assembly_target,
391
      ObsolescenceTracker obsolescence_tracker);
392
393
  ScopedDocDbRowContext(
394
      IntentAwareIterator* iter,
395
      DeadlineInfo* deadline_info,
396
      Slice key,
397
      DocDbRowAssembler* ancestor_assembler,
398
      ObsolescenceTracker obsolescence_tracker);
399
400
942M
  DocDbRowAssembler* mutable_assembler() { return &assembler_; }
401
402
745M
  const ObsolescenceTracker* obsolescence_tracker() const { return &obsolescence_tracker_; }
403
404
  ScopedDocDbCollectionContext* collection();
405
406
  CHECKED_STATUS CheckDeadline();
407
408
 protected:
409
  IntentAwareIterator* const iter_;
410
  DeadlineInfo* const deadline_info_;
411
  const Slice key_;
412
  const IntentAwareIteratorPrefixScope prefix_scope_;
413
  DocDbRowAssembler assembler_;
414
  ObsolescenceTracker obsolescence_tracker_;
415
  boost::optional<ScopedDocDbCollectionContext> collection_ = boost::none;
416
417
 private:
418
  friend class ScopedDocDbCollectionContext;
419
420
  DISALLOW_COPY_AND_ASSIGN(ScopedDocDbRowContext);
421
};
422
423
// Creates a context for key whose result is written directly into assembly_target. The prefix
// scope and the assembler are both set up from the stored copy of the key.
ScopedDocDbRowContext::ScopedDocDbRowContext(
    IntentAwareIterator* iter,
    DeadlineInfo* deadline_info,
    Slice key,
    SubDocument* assembly_target,
    ObsolescenceTracker obsolescence_tracker)
    : iter_(iter),
      deadline_info_(deadline_info),
      key_(key),
      prefix_scope_(key_, iter_),
      assembler_(assembly_target, key_),
      obsolescence_tracker_(obsolescence_tracker) {}
435
436
// Creates a context for key whose result is assembled lazily underneath ancestor_assembler. The
// prefix scope and the assembler are both set up from the stored copy of the key.
ScopedDocDbRowContext::ScopedDocDbRowContext(
    IntentAwareIterator* iter,
    DeadlineInfo* deadline_info,
    Slice key,
    DocDbRowAssembler* ancestor_assembler,
    ObsolescenceTracker obsolescence_tracker)
    : iter_(iter),
      deadline_info_(deadline_info),
      key_(key),
      prefix_scope_(key_, iter_),
      assembler_(ancestor_assembler, key_),
      obsolescence_tracker_(obsolescence_tracker) {}
448
449
94.5M
// Returns the collection context for the children of this row. On the first call the iterator is
// moved with SeekPastSubKey(key_) and the collection context is created; later calls return the
// same instance without touching the iterator.
ScopedDocDbCollectionContext* ScopedDocDbRowContext::collection() {
  if (!collection_) {
    iter_->SeekPastSubKey(key_);
    collection_.emplace(this);
  }
  return &collection_.value();
}
456
457
749M
// Returns an Expired status if deadline info was provided and it reports that the deadline has
// passed; returns OK otherwise, including when no deadline info was provided.
Status ScopedDocDbRowContext::CheckDeadline() {
  if (deadline_info_ == nullptr || !deadline_info_->CheckAndSetDeadlinePassed()) {
    return Status::OK();
  }
  return STATUS(Expired, "Deadline for query passed.");
}
463
464
class ScopedDocDbRowContextWithData : public ScopedDocDbRowContext {
465
 public:
466
  ScopedDocDbRowContextWithData(
467
      std::unique_ptr<DocDbRowData> row,
468
      IntentAwareIterator* iter,
469
      DeadlineInfo* deadline_info,
470
      SubDocument* assembly_target,
471
      ObsolescenceTracker ancestor_obsolescence_tracker);
472
473
  ScopedDocDbRowContextWithData(
474
      std::unique_ptr<DocDbRowData> row,
475
      IntentAwareIterator* iter,
476
      DeadlineInfo* deadline_info,
477
      DocDbRowAssembler* ancestor_assembler,
478
      ObsolescenceTracker ancestor_obsolescence_tracker);
479
480
793M
  DocDbRowData* data() const { return data_.get(); }
481
482
  void SeekOutOfPrefix();
483
484
 private:
485
  std::unique_ptr<DocDbRowData> data_;
486
487
  DISALLOW_COPY_AND_ASSIGN(ScopedDocDbRowContextWithData);
488
};
489
490
// Creates the context for row, assembling into assembly_target. The base class is initialized
// from the row's key, write time and TTL before ownership of the row moves into data_, so the row
// is still intact when those are read.
ScopedDocDbRowContextWithData::ScopedDocDbRowContextWithData(
    std::unique_ptr<DocDbRowData> row,
    IntentAwareIterator* iter,
    DeadlineInfo* deadline_info,
    SubDocument* assembly_target,
    ObsolescenceTracker ancestor_obsolescence_tracker)
    : ScopedDocDbRowContext(
          iter, deadline_info, row->key(), assembly_target,
          ancestor_obsolescence_tracker.Child(row->write_time(), row->value().ttl())),
      data_(std::move(row)) {}
500
501
// Creates the context for row, assembling lazily underneath ancestor_assembler. The base class is
// initialized from the row's key, write time and TTL before ownership of the row moves into
// data_, so the row is still intact when those are read.
ScopedDocDbRowContextWithData::ScopedDocDbRowContextWithData(
    std::unique_ptr<DocDbRowData> row,
    IntentAwareIterator* iter,
    DeadlineInfo* deadline_info,
    DocDbRowAssembler* ancestor_assembler,
    ObsolescenceTracker ancestor_obsolescence_tracker)
    : ScopedDocDbRowContext(
          iter, deadline_info, row->key(), ancestor_assembler,
          ancestor_obsolescence_tracker.Child(row->write_time(), row->value().ttl())),
      data_(std::move(row)) {}
511
512
51.1M
void ScopedDocDbRowContextWithData::SeekOutOfPrefix() {
513
51.1M
  iter_->SeekOutOfSubDoc(data_->key());
514
51.1M
}
515
516
// Creates a collection context that iterates over the children of parent.
ScopedDocDbCollectionContext::ScopedDocDbCollectionContext(ScopedDocDbRowContext* parent)
    : parent_(parent) {}
518
519
1.26k
// Hands over a row that has already been read so that the next GetNextChild() call serves it
// instead of reading from the iterator.
//
// Returns IllegalState if a first child is already pending or if any child has been created
// before.
Status ScopedDocDbCollectionContext::SetFirstChild(std::unique_ptr<DocDbRowData> first_row) {
  if (next_child_ != nullptr) {
    return STATUS(IllegalState, "Cannot set first_child if already set.");
  }
  if (current_child_ != nullptr) {
    return STATUS(IllegalState, "Cannot set first_child if a child has already been read.");
  }

  SetNextChild(std::move(first_row));
  // Mark the freshly created child as pending so that GetNextChild() serves it as is.
  next_child_ = current_child_.get();
  return Status::OK();
}
530
531
145M
// Returns the context of the next child row, or nullptr when no further child is available.
//
// If a child was handed over through SetFirstChild() and has not been served yet, it is served
// now. Otherwise the previously served child (if any) is released and, if the iterator is still
// valid, a new child is created from the iterator's current row.
Result<ScopedDocDbRowContextWithData*> ScopedDocDbCollectionContext::GetNextChild() {
  const bool has_pending_first_child = next_child_ != nullptr;
  if (has_pending_first_child) {
    // current_child_ already holds the row that was read earlier. Serve it, and clear the marker
    // so that the next call resumes normal operation.
    next_child_ = nullptr;
  } else {
    if (current_child_) {
      // current_child_ is the row served last, which the caller is now done with: seek out of its
      // scope before reading the next row.
      // Note -- we currently seek away from the previous row only when a new row is requested by
      // the caller. It might be better from a perf perspective to do this before instead.
      current_child_->SeekOutOfPrefix();

      // Destroy the child to eliminate the IntentAwareIteratorPrefixScope it was holding.
      current_child_.reset();
    }
    if (parent_->iter_->valid()) {
      SetNextChild(VERIFY_RESULT(DocDbRowData::CurrentRow(parent_->iter_)));
    }
  }

  DCHECK(
      !current_child_ ||
      !parent_ ||
      current_child_->data()->key().AsSlice().starts_with(parent_->key_))
      << "Child key " << SubDocKey::DebugSliceToString(current_child_->data()->key().AsSlice())
      << " does not include parent key " << SubDocKey::DebugSliceToString(parent_->key_)
      << " as prefix.";
  return current_child_.get();
}
560
561
51.2M
void ScopedDocDbCollectionContext::SetNextChild(std::unique_ptr<DocDbRowData> child_row) {
562
51.2M
  current_child_ = std::make_unique<ScopedDocDbRowContextWithData>(
563
51.2M
      std::move(child_row), parent_->iter_, parent_->deadline_info_, parent_->mutable_assembler(),
564
51.2M
      parent_->obsolescence_tracker_);
565
51.2M
}
566
567
CHECKED_STATUS ProcessSubDocument(ScopedDocDbRowContextWithData* scope);
568
569
94.6M
// Processes every child row of the given collection and returns how many of them ended up with a
// stored (non-invalid, non-tombstone) value. A null collection yields zero.
//
// Returns a non-OK status as soon as fetching, processing or inspecting a child fails.
Result<uint32_t> ProcessChildren(ScopedDocDbCollectionContext* collection) {
  uint32_t live_children = 0;
  if (collection == nullptr) {
    return live_children;
  }

  for (;;) {
    ScopedDocDbRowContextWithData* const child = VERIFY_RESULT(collection->GetNextChild());
    if (child == nullptr) {
      break;
    }
    RETURN_NOT_OK(ProcessSubDocument(child));
    const bool has_value = VERIFY_RESULT(child->mutable_assembler()->HasStoredValue());
    if (has_value) {
      ++live_children;
    }
  }
  return live_children;
}
581
582
4.17M
// Handles a live collection row: the target is first set to an empty collection (the row itself
// is alive/valid regardless of its children), and then all of its children are processed.
Status ProcessCollection(ScopedDocDbRowContextWithData* scope) {
  DocDbRowAssembler* const assembler = scope->mutable_assembler();
  RETURN_NOT_OK(assembler->SetEmptyCollection());

  // Requested only after the empty collection is stored: the first collection() call moves the
  // iterator.
  ScopedDocDbCollectionContext* const children = scope->collection();
  RETURN_NOT_OK(ProcessChildren(children));
  return Status::OK();
}
588
589
90.2M
// Processes the children of a row that is itself tombstoned or obsolete. If at least one child
// ended up with a stored value, the sub-document built from those children is kept; otherwise the
// target is marked as a tombstone.
Status MaybeReviveCollection(ScopedDocDbRowContextWithData* scope) {
  const auto live_children = VERIFY_RESULT(ProcessChildren(scope->collection()));
  if (live_children != 0) {
    return Status::OK();
  }
  return scope->mutable_assembler()->SetTombstone();
}
596
597
749M
// Processes the row held by scope and records the outcome through the scope's assembler.
//
// - Tombstoned or obsolete row: a primitive value is replaced by a tombstone; anything else is
//   handed to MaybeReviveCollection, which keeps the sub-document only if a child is live.
// - Live collection row: handled by ProcessCollection.
// - Live primitive row: the remaining TTL (if the tracker provides one) is attached to the value,
//   which is then stored.
//
// Returns an Expired status if the deadline has passed, a Corruption status if the value type
// fits none of the cases above, and otherwise whatever the selected handler returns.
Status ProcessSubDocument(ScopedDocDbRowContextWithData* scope) {
  RETURN_NOT_OK(scope->CheckDeadline());

  auto data = scope->data();
  auto assembler = scope->mutable_assembler();
  auto obsolescence_tracker = scope->obsolescence_tracker();

  if (data->IsTombstone() || obsolescence_tracker->IsObsolete(data->write_time())) {
    if (data->IsPrimitiveValue()) {
      VLOG(4) << "Discarding overwritten or expired primitive value";
      return assembler->SetTombstone();
    }
    // If the latest written value is a tombstone or the record is expired at a top level, only
    // surface a subdocument if it has a valid (unexpired, non-tombstoned) child which overwrites
    // this record. Note: these semantics are only relevant to CQL reads.
    return MaybeReviveCollection(scope);
  }

  if (data->IsCollection()) {
    return ProcessCollection(scope);
  }

  if (data->IsPrimitiveValue()) {
    auto ttl_opt = obsolescence_tracker->GetTtlRemainingSeconds(data->write_time().hybrid_time());
    if (ttl_opt) {
      data->mutable_primitive_value()->SetTtl(*ttl_opt);
    }
    return assembler->SetPrimitiveValue(data);
  }

  return STATUS_FORMAT(
      Corruption,
      "Expected primitive value type, collection, or tombstone. Got $0",
      data->value().value_type());
}
632
633
}  // namespace
634
635
// Creates a reader for the sub-document at target_subdocument_key. The iterator and deadline info
// pointers are stored as given; the key and the ancestor obsolescence tracker are stored in
// members initialized from the arguments.
SubDocumentReader::SubDocumentReader(
    const KeyBytes& target_subdocument_key,
    IntentAwareIterator* iter,
    DeadlineInfo* deadline_info,
    const ObsolescenceTracker& ancestor_obsolescence_tracker)
    : target_subdocument_key_(target_subdocument_key),
      iter_(iter),
      deadline_info_(deadline_info),
      ancestor_obsolescence_tracker_(ancestor_obsolescence_tracker) {}
642
643
722M
// Reads the sub-document at the target key into *result.
//
// - If the iterator has no entry under the target key, *result is set to a kInvalid SubDocument.
// - If the first entry is the target key itself, that row is processed as the root.
// - Otherwise the first entry is treated as a child of the target, and the entries are processed
//   as the children of a collection rooted at the target key; if none of them ends up with a
//   stored value, *result is set to a tombstone.
//
// Returns a non-OK status if reading or processing any row fails.
Status SubDocumentReader::Get(SubDocument* result) {
  IntentAwareIteratorPrefixScope target_scope(target_subdocument_key_, iter_);
  if (!iter_->valid()) {
    *result = SubDocument(ValueType::kInvalid);
    return Status::OK();
  }
  auto first_row = VERIFY_RESULT(DocDbRowData::CurrentRow(iter_));

  // Compare against the row's key in place. Copying the key into a local first would cost a
  // KeyBytes copy on every call for a value that is only needed for this one comparison.
  if (first_row->key() == target_subdocument_key_) {
    ScopedDocDbRowContextWithData context(
        std::move(first_row), iter_, deadline_info_, result, ancestor_obsolescence_tracker_);
    return ProcessSubDocument(&context);
  }

  // If the currently-pointed-to key is not equal to our target, but we are still in a valid state,
  // then that key must have the target key as a prefix, meaning we are pointing to a child of our
  // target. We should therefore process the rows as if we're already in a collection, rooted at the
  // target key.
  ScopedDocDbRowContext context(
      iter_, deadline_info_, target_subdocument_key_, result, ancestor_obsolescence_tracker_);
  ScopedDocDbCollectionContext collection(&context);
  RETURN_NOT_OK(collection.SetFirstChild(std::move(first_row)));
  auto num_children = VERIFY_RESULT(ProcessChildren(&collection));
  if (num_children == 0) {
    *result = SubDocument(ValueType::kTombstone);
  }
  return Status::OK();
}
671
672
// Creates a builder whose readers will all share the given iterator and deadline info.
SubDocumentReaderBuilder::SubDocumentReaderBuilder(
    IntentAwareIterator* iter, DeadlineInfo* deadline_info)
    : iter_(iter),
      deadline_info_(deadline_info) {}
675
676
// Creates a reader for sub_doc_key that uses this builder's iterator and deadline info, and whose
// ancestor obsolescence tracker is the builder's current parent tracker.
Result<std::unique_ptr<SubDocumentReader>> SubDocumentReaderBuilder::Build(
    const KeyBytes& sub_doc_key) {
  return std::make_unique<SubDocumentReader>(
      sub_doc_key,
      iter_,
      deadline_info_,
      parent_obsolescence_tracker_);
}
681
682
// Initializes the parent obsolescence tracker for target_subdocument_key: starting from the
// table-level tracker, it is updated with the write info found for every ancestor key of the
// target (beginning with root_doc_key and extending by one subkey at a time) and finally for the
// target key itself.
//
// Returns a non-OK status if a subkey cannot be decoded or an update fails.
Status SubDocumentReaderBuilder::InitObsolescenceInfo(
    const ObsolescenceTracker& table_obsolescence_tracker,
    const Slice& root_doc_key, const Slice& target_subdocument_key) {
  parent_obsolescence_tracker_ = table_obsolescence_tracker;

  // Look at ancestors to collect ttl/write-time metadata.
  IntentAwareIteratorPrefixScope prefix_scope(root_doc_key, iter_);
  const size_t root_size = root_doc_key.size();
  Slice unparsed = target_subdocument_key;
  Slice ancestor_key = unparsed.Prefix(root_size);
  unparsed.remove_prefix(root_size);

  // Each iteration consumes one more subkey of the target's path. The loop ends once no further
  // subkey can be decoded, at which point every proper ancestor has been visited.
  while (VERIFY_RESULT(SubDocKey::DecodeSubkey(&unparsed))) {
    RETURN_NOT_OK(UpdateWithParentWriteInfo(ancestor_key));
    // Extend the ancestor key up to the start of the part that has not been parsed yet.
    ancestor_key = Slice(ancestor_key.data(), unparsed.data() - ancestor_key.data());
  }

  DCHECK_EQ(prev_iter_key, target_subdocument_key);
  return UpdateWithParentWriteInfo(target_subdocument_key);
}
706
707
// Asks the iterator for the latest record of parent_key_without_ht, seeding the lookup with the
// current watermark. If the iterator is still valid afterwards, the parent tracker is replaced by
// its child for the resulting write time; otherwise the tracker is left unchanged.
//
// Returns a non-OK status if the lookup fails.
Status SubDocumentReaderBuilder::UpdateWithParentWriteInfo(
    const Slice& parent_key_without_ht) {
  Slice latest_value;
  DocHybridTime latest_write_time = parent_obsolescence_tracker_.GetHighWriteTime();
  RETURN_NOT_OK(
      iter_->FindLatestRecord(parent_key_without_ht, &latest_write_time, &latest_value));

  if (iter_->valid()) {
    parent_obsolescence_tracker_ = parent_obsolescence_tracker_.Child(latest_write_time);
  }
  return Status::OK();
}
720
721
}  // namespace docdb
722
}  // namespace yb