/Users/deen/code/yugabyte-db/src/yb/docdb/doc_reader.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #include "yb/docdb/doc_reader.h" |
15 | | |
16 | | #include <string> |
17 | | #include <vector> |
18 | | |
19 | | #include "yb/common/doc_hybrid_time.h" |
20 | | #include "yb/common/hybrid_time.h" |
21 | | #include "yb/common/transaction.h" |
22 | | |
23 | | #include "yb/docdb/docdb_fwd.h" |
24 | | #include "yb/docdb/shared_lock_manager_fwd.h" |
25 | | #include "yb/docdb/doc_key.h" |
26 | | #include "yb/docdb/doc_ttl_util.h" |
27 | | #include "yb/docdb/docdb-internal.h" |
28 | | #include "yb/docdb/docdb_rocksdb_util.h" |
29 | | #include "yb/docdb/intent_aware_iterator.h" |
30 | | #include "yb/docdb/subdoc_reader.h" |
31 | | #include "yb/docdb/subdocument.h" |
32 | | #include "yb/docdb/value.h" |
33 | | #include "yb/docdb/value_type.h" |
34 | | |
35 | | #include "yb/util/monotime.h" |
36 | | #include "yb/util/result.h" |
37 | | #include "yb/util/status.h" |
38 | | |
39 | | using std::vector; |
40 | | |
41 | | using yb::HybridTime; |
42 | | |
43 | | namespace yb { |
44 | | namespace docdb { |
45 | | |
46 | | |
47 | | // TODO(dtxn) scan through all involved transactions first to cache statuses in a batch, |
48 | | // so during building subdocument we don't need to request them one by one. |
49 | | // TODO(dtxn) we need to restart read with scan_ht = commit_ht if some transaction was committed |
50 | | // at time commit_ht within [scan_ht; read_request_time + max_clock_skew). Also we need |
51 | | // to wait until time scan_ht = commit_ht passed. |
52 | | // TODO(dtxn) for each scanned key (and its subkeys) we need to avoid *new* values committed at |
53 | | // ht <= scan_ht (or just ht < scan_ht?) |
54 | | // Question: what will break if we allow later commit at ht <= scan_ht ? Need to write down |
55 | | // detailed example. |
56 | | |
57 | | Result<boost::optional<SubDocument>> TEST_GetSubDocument( |
58 | | const Slice& sub_doc_key, |
59 | | const DocDB& doc_db, |
60 | | const rocksdb::QueryId query_id, |
61 | | const TransactionOperationContext& txn_op_context, |
62 | | CoarseTimePoint deadline, |
63 | | const ReadHybridTime& read_time, |
64 | 0 | const std::vector<PrimitiveValue>* projection) { |
65 | 0 | auto iter = CreateIntentAwareIterator( |
66 | 0 | doc_db, BloomFilterMode::USE_BLOOM_FILTER, sub_doc_key, query_id, |
67 | 0 | txn_op_context, deadline, read_time); |
68 | 0 | DOCDB_DEBUG_LOG("GetSubDocument for key $0 @ $1", sub_doc_key.ToDebugHexString(), |
69 | 0 | iter->read_time().ToString()); |
70 | 0 | iter->SeekToLastDocKey(); |
71 | 0 | DocDBTableReader doc_reader(iter.get(), deadline); |
72 | 0 | RETURN_NOT_OK(doc_reader.UpdateTableTombstoneTime(sub_doc_key)); |
73 | |
|
74 | 0 | SubDocument result; |
75 | 0 | if (VERIFY_RESULT(doc_reader.Get(sub_doc_key, projection, &result))) { |
76 | 0 | return result; |
77 | 0 | } |
78 | 0 | return boost::none; |
79 | 0 | } |
80 | | |
81 | | DocDBTableReader::DocDBTableReader(IntentAwareIterator* iter, CoarseTimePoint deadline) |
82 | | : iter_(iter), |
83 | | deadline_info_(deadline), |
84 | 6.01M | subdoc_reader_builder_(iter_, &deadline_info_) {} |
85 | | |
86 | 3.75M | void DocDBTableReader::SetTableTtl(const Schema& table_schema) { |
87 | 3.75M | Expiration table_ttl(TableTTL(table_schema)); |
88 | 3.75M | table_obsolescence_tracker_ = ObsolescenceTracker( |
89 | 3.75M | iter_->read_time(), table_obsolescence_tracker_.GetHighWriteTime(), table_ttl); |
90 | 3.75M | } |
91 | | |
92 | 6.02M | Status DocDBTableReader::UpdateTableTombstoneTime(const Slice& root_doc_key) { |
93 | 6.02M | if (root_doc_key[0] == ValueTypeAsChar::kPgTableOid) { |
94 | | // Update table_tombstone_time based on what is written to RocksDB if its not already set. |
95 | | // Otherwise, just accept its value. |
96 | | // TODO -- this is a bit of a hack to allow DocRowwiseIterator to pass along the table tombstone |
97 | | // time read at a previous invocation of this same code. If instead the DocRowwiseIterator owned |
98 | | // an instance of SubDocumentReaderBuilder, and this method call was hoisted up to that level, |
99 | | // passing around this table_tombstone_time would no longer be necessary. |
100 | 58 | DocKey table_id; |
101 | 58 | RETURN_NOT_OK(table_id.DecodeFrom(root_doc_key, DocKeyPart::kUpToId)); |
102 | 58 | iter_->Seek(table_id); |
103 | | |
104 | 58 | Slice value; |
105 | 58 | auto table_id_encoded = table_id.Encode(); |
106 | 58 | DocHybridTime doc_ht = DocHybridTime::kMin; |
107 | | |
108 | 58 | RETURN_NOT_OK(iter_->FindLatestRecord(table_id_encoded, &doc_ht, &value)); |
109 | 58 | ValueType value_type; |
110 | 58 | RETURN_NOT_OK(Value::DecodePrimitiveValueType(value, &value_type)); |
111 | 58 | if (value_type == ValueType::kTombstone) { |
112 | 23 | SCHECK_NE(doc_ht, DocHybridTime::kInvalid, Corruption, |
113 | 23 | "Invalid hybrid time for table tombstone"); |
114 | 23 | table_obsolescence_tracker_ = table_obsolescence_tracker_.Child(doc_ht, MonoDelta::kMax); |
115 | 23 | } |
116 | 58 | } |
117 | 6.02M | return Status::OK();; |
118 | 0 | } |
119 | | |
120 | 34.1M | CHECKED_STATUS DocDBTableReader::InitForKey(const Slice& sub_doc_key) { |
121 | 34.1M | auto dockey_size = |
122 | 34.1M | VERIFY_RESULT(DocKey::EncodedSize(sub_doc_key, DocKeyPart::kWholeDocKey)); |
123 | 34.1M | const Slice root_doc_key(sub_doc_key.data(), dockey_size); |
124 | 34.1M | iter_->SeekForward(root_doc_key); |
125 | 34.1M | RETURN_NOT_OK(subdoc_reader_builder_.InitObsolescenceInfo( |
126 | 34.1M | table_obsolescence_tracker_, root_doc_key, sub_doc_key)); |
127 | 34.1M | return Status::OK(); |
128 | 34.1M | } |
129 | | |
130 | | Result<bool> DocDBTableReader::Get( |
131 | 34.1M | const Slice& root_doc_key, const vector<PrimitiveValue>* projection, SubDocument* result) { |
132 | 34.1M | RETURN_NOT_OK(InitForKey(root_doc_key)); |
133 | | // Seed key_bytes with the subdocument key. For each subkey in the projection, build subdocument |
134 | | // and reuse key_bytes while appending the subkey. |
135 | 34.1M | KeyBytes key_bytes; |
136 | | // Preallocate some extra space to avoid allocation for small subkeys. |
137 | 34.1M | key_bytes.Reserve(root_doc_key.size() + kMaxBytesPerEncodedHybridTime + 32); |
138 | 34.1M | key_bytes.AppendRawBytes(root_doc_key); |
139 | 34.1M | if (projection != nullptr) { |
140 | 34.1M | bool doc_found = false; |
141 | 34.1M | const size_t subdocument_key_size = key_bytes.size(); |
142 | 256M | for (const PrimitiveValue& subkey : *projection) { |
143 | | // Append subkey to subdocument key. Reserve extra kMaxBytesPerEncodedHybridTime + 1 bytes in |
144 | | // key_bytes to avoid the internal buffer from getting reallocated and moved by SeekForward() |
145 | | // appending the hybrid time, thereby invalidating the buffer pointer saved by prefix_scope. |
146 | 256M | subkey.AppendToKey(&key_bytes); |
147 | 256M | key_bytes.Reserve(key_bytes.size() + kMaxBytesPerEncodedHybridTime + 1); |
148 | | // This seek is to initialize the iterator for BuildSubDocument call. |
149 | 256M | iter_->SeekForward(&key_bytes); |
150 | 256M | SubDocument descendant; |
151 | 256M | auto reader = VERIFY_RESULT(subdoc_reader_builder_.Build(key_bytes)); |
152 | 256M | RETURN_NOT_OK(reader->Get(&descendant)); |
153 | 256M | doc_found = doc_found || ( |
154 | 38.3M | descendant.value_type() != ValueType::kInvalid |
155 | 38.0M | && descendant.value_type() != ValueType::kTombstone); |
156 | 256M | result->SetChild(subkey, std::move(descendant)); |
157 | | |
158 | | // Restore subdocument key by truncating the appended subkey. |
159 | 256M | key_bytes.Truncate(subdocument_key_size); |
160 | 256M | } |
161 | 34.1M | if (doc_found) { |
162 | 33.8M | iter_->SeekOutOfSubDoc(root_doc_key); |
163 | 33.8M | return true; |
164 | 33.8M | } |
165 | 351k | } |
166 | | |
167 | | // If doc is not found, decide if some non-projection column exists. |
168 | | // Currently we read the whole doc here, |
169 | | // may be optimized by exiting on the first column in future. |
170 | | // TODO -- is resetting *result = SubDocument() needed here? |
171 | | // TODO -- Add some metrics to understand: |
172 | | // (a) how often we scan back |
173 | | // (b) how often it's useful |
174 | | // Also maybe in debug mode add some every-n logging of the rocksdb values for which it is |
175 | | // useful |
176 | 351k | iter_->Seek(key_bytes); |
177 | 351k | auto reader = VERIFY_RESULT(subdoc_reader_builder_.Build(key_bytes)); |
178 | 351k | RETURN_NOT_OK(reader->Get(result)); |
179 | 351k | return result->value_type() != ValueType::kInvalid |
180 | 366k | && result->value_type() != ValueType::kTombstone; |
181 | 351k | } |
182 | | |
183 | | } // namespace docdb |
184 | | } // namespace yb |