/Users/deen/code/yugabyte-db/src/yb/tablet/tablet.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | #ifndef YB_TABLET_TABLET_H_ |
33 | | #define YB_TABLET_TABLET_H_ |
34 | | |
35 | | #include <boost/intrusive/list.hpp> |
36 | | |
37 | | #include "yb/common/common_fwd.h" |
38 | | #include "yb/common/read_hybrid_time.h" |
39 | | #include "yb/common/snapshot.h" |
40 | | #include "yb/common/transaction.h" |
41 | | |
42 | | #include "yb/consensus/consensus_fwd.h" |
43 | | #include "yb/consensus/consensus_types.pb.h" |
44 | | |
45 | | #include "yb/docdb/docdb_fwd.h" |
46 | | #include "yb/docdb/docdb_types.h" |
47 | | #include "yb/docdb/key_bounds.h" |
48 | | #include "yb/docdb/shared_lock_manager.h" |
49 | | |
50 | | #include "yb/gutil/ref_counted.h" |
51 | | |
52 | | #include "yb/rocksdb/rocksdb_fwd.h" |
53 | | #include "yb/rocksdb/options.h" |
54 | | #include "yb/rocksdb/table.h" |
55 | | #include "yb/rocksdb/types.h" |
56 | | |
57 | | #include "yb/tablet/tablet_fwd.h" |
58 | | #include "yb/tablet/abstract_tablet.h" |
59 | | #include "yb/tablet/mvcc.h" |
60 | | #include "yb/tablet/operation_filter.h" |
61 | | #include "yb/tablet/tablet_options.h" |
62 | | #include "yb/tablet/transaction_intent_applier.h" |
63 | | |
64 | | #include "yb/util/status_fwd.h" |
65 | | #include "yb/util/enums.h" |
66 | | #include "yb/util/locks.h" |
67 | | #include "yb/util/net/net_fwd.h" |
68 | | #include "yb/util/operation_counter.h" |
69 | | #include "yb/util/strongly_typed_bool.h" |
70 | | #include "yb/util/threadpool.h" |
71 | | |
72 | | namespace yb { |
73 | | |
74 | | class FsManager; |
75 | | class MemTracker; |
76 | | class MetricEntity; |
77 | | class RowChangeList; |
78 | | |
79 | | namespace server { |
80 | | class Clock; |
81 | | } |
82 | | |
83 | | namespace tablet { |
84 | | |
85 | | YB_STRONGLY_TYPED_BOOL(IncludeIntents); |
86 | | YB_STRONGLY_TYPED_BOOL(Abortable); |
87 | | |
88 | 171k | inline FlushFlags operator|(FlushFlags lhs, FlushFlags rhs) { |
89 | 171k | return static_cast<FlushFlags>(to_underlying(lhs) | to_underlying(rhs)); |
90 | 171k | } |
91 | | |
92 | 438k | inline FlushFlags operator&(FlushFlags lhs, FlushFlags rhs) { |
93 | 438k | return static_cast<FlushFlags>(to_underlying(lhs) & to_underlying(rhs)); |
94 | 438k | } |
95 | | |
96 | 438k | inline bool HasFlags(FlushFlags lhs, FlushFlags rhs) { |
97 | 438k | return (lhs & rhs) != FlushFlags::kNone; |
98 | 438k | } |
99 | | |
100 | | class WriteOperation; |
101 | | |
102 | | using AddTableListener = std::function<Status(const TableInfo&)>; |
103 | | |
104 | | class TabletScopedIf : public RefCountedThreadSafe<TabletScopedIf> { |
105 | | public: |
106 | | virtual std::string Key() const = 0; |
107 | | protected: |
108 | | friend class RefCountedThreadSafe<TabletScopedIf>; |
109 | 0 | virtual ~TabletScopedIf() { } |
110 | | }; |
111 | | |
112 | | YB_STRONGLY_TYPED_BOOL(AllowBootstrappingState); |
113 | | YB_STRONGLY_TYPED_BOOL(ResetSplit); |
114 | | |
115 | | struct TabletScopedRWOperationPauses { |
116 | | ScopedRWOperationPause abortable; |
117 | | ScopedRWOperationPause non_abortable; |
118 | | |
119 | 249k | std::array<ScopedRWOperationPause*, 2> AsArray() { |
120 | 249k | return {&abortable, &non_abortable}; |
121 | 249k | } |
122 | | }; |
123 | | |
124 | | class Tablet : public AbstractTablet, public TransactionIntentApplier { |
125 | | public: |
126 | | class CompactionFaultHooks; |
127 | | class FlushCompactCommonHooks; |
128 | | class FlushFaultHooks; |
129 | | |
130 | | // A function that returns the current majority-replicated hybrid time leader lease, or waits |
131 | | // until a hybrid time leader lease with at least the given hybrid time is acquired |
132 | | // (first argument), or a timeout occurs (second argument). HybridTime::kInvalid is returned |
133 | | // in case of a timeout. |
134 | | using HybridTimeLeaseProvider = std::function<Result<FixedHybridTimeLease>( |
135 | | HybridTime, CoarseTimePoint)>; |
136 | | using TransactionIdSet = std::unordered_set<TransactionId, TransactionIdHash>; |
137 | | |
138 | | // Create a new tablet. |
139 | | // |
140 | | // If 'metric_registry' is non-nullptr, then this tablet will create a 'tablet' entity |
141 | | // within the provided registry. Otherwise, no metrics are collected. |
142 | | explicit Tablet(const TabletInitData& data); |
143 | | |
144 | | ~Tablet(); |
145 | | |
146 | | // Open the tablet. |
147 | | // Upon completion, the tablet enters the kBootstrapping state. |
148 | | CHECKED_STATUS Open(); |
149 | | |
150 | | CHECKED_STATUS EnableCompactions(ScopedRWOperationPause* non_abortable_ops_pause); |
151 | | |
152 | | // Performs backfill for the key range beginning from the row immediately after |
153 | | // <backfill_from>, until either it reaches the end of the tablet |
154 | | // or the current time is past deadline. |
155 | | // *<number_of_rows_processed> will be set to the number of rows backfilled. |
156 | | // <backfilled_until> will be set to the first row that was not backfilled, so that the |
157 | | // next API call can resume from where the backfill was left off. |
158 | | // Note that <backfilled_until> only applies to the non-failing indexes. |
159 | | CHECKED_STATUS BackfillIndexesForYsql( |
160 | | const std::vector<IndexInfo>& indexes, |
161 | | const std::string& backfill_from, |
162 | | const CoarseTimePoint deadline, |
163 | | const HybridTime read_time, |
164 | | const HostPort& pgsql_proxy_bind_address, |
165 | | const std::string& database_name, |
166 | | const uint64_t postgres_auth_key, |
167 | | size_t* number_of_rows_processed, |
168 | | std::string* backfilled_until); |
169 | | |
170 | | CHECKED_STATUS VerifyIndexTableConsistencyForCQL( |
171 | | const std::vector<IndexInfo>& indexes, |
172 | | const std::string& start_key, |
173 | | const int num_rows, |
174 | | const CoarseTimePoint deadline, |
175 | | const HybridTime read_time, |
176 | | std::unordered_map<TableId, uint64>* consistency_stats, |
177 | | std::string* verified_until); |
178 | | |
179 | | CHECKED_STATUS VerifyMainTableConsistencyForCQL( |
180 | | const TableId& main_table_id, |
181 | | const std::string& start_key, |
182 | | const int num_rows, |
183 | | const CoarseTimePoint deadline, |
184 | | const HybridTime read_time, |
185 | | std::unordered_map<TableId, uint64>* consistency_stats, |
186 | | std::string* verified_until); |
187 | | |
188 | | CHECKED_STATUS VerifyTableConsistencyForCQL( |
189 | | const std::vector<TableId>& table_ids, |
190 | | const std::vector<yb::ColumnSchema>& columns, |
191 | | const std::string& start_key, |
192 | | const int num_rows, |
193 | | const CoarseTimePoint deadline, |
194 | | const HybridTime read_time, |
195 | | const bool is_main_table, |
196 | | std::unordered_map<TableId, uint64>* consistency_stats, |
197 | | std::string* verified_until); |
198 | | |
199 | | CHECKED_STATUS VerifyTableInBatches( |
200 | | const QLTableRow& row, |
201 | | const std::vector<TableId>& table_ids, |
202 | | const HybridTime read_time, |
203 | | const CoarseTimePoint deadline, |
204 | | const bool is_main_table, |
205 | | std::vector<std::pair<const TableId, QLReadRequestPB>>* requests, |
206 | | CoarseTimePoint* last_flushed_at, |
207 | | std::unordered_set<TableId>* failed_indexes, |
208 | | std::unordered_map<TableId, uint64>* consistency_stats); |
209 | | |
210 | | CHECKED_STATUS FlushVerifyBatchIfRequired( |
211 | | const HybridTime read_time, |
212 | | const CoarseTimePoint deadline, |
213 | | std::vector<std::pair<const TableId, QLReadRequestPB>>* requests, |
214 | | CoarseTimePoint* last_flushed_at, |
215 | | std::unordered_set<TableId>* failed_indexes, |
216 | | std::unordered_map<TableId, uint64>* index_consistency_states); |
217 | | CHECKED_STATUS FlushVerifyBatch( |
218 | | const HybridTime read_time, |
219 | | const CoarseTimePoint deadline, |
220 | | std::vector<std::pair<const TableId, QLReadRequestPB>>* requests, |
221 | | CoarseTimePoint* last_flushed_at, |
222 | | std::unordered_set<TableId>* failed_indexes, |
223 | | std::unordered_map<TableId, uint64>* index_consistency_states); |
224 | | |
225 | | // Performs backfill for the key range beginning from the row <backfill_from>, |
226 | | // until either it reaches the end of the tablet |
227 | | // or the current time is past deadline. |
228 | | // *<number_of_rows_processed> will be set to the number of rows backfilled. |
229 | | // <backfilled_until> will be set to the first row that was not backfilled, so that the |
230 | | // next API call can resume from where the backfill was left off. |
231 | | // Note that <backfilled_until> only applies to the non-failing indexes. |
232 | | // <failed_indexes> will be updated with the collection of index-ids for which any errors |
233 | | // were encountered. |
234 | | CHECKED_STATUS BackfillIndexes( |
235 | | const std::vector<IndexInfo>& indexes, |
236 | | const std::string& backfill_from, |
237 | | const CoarseTimePoint deadline, |
238 | | const HybridTime read_time, |
239 | | size_t* number_of_rows_processed, |
240 | | std::string* backfilled_until, |
241 | | std::unordered_set<TableId>* failed_indexes); |
242 | | |
243 | | CHECKED_STATUS UpdateIndexInBatches( |
244 | | const QLTableRow& row, |
245 | | const std::vector<IndexInfo>& indexes, |
246 | | const HybridTime write_time, |
247 | | const CoarseTimePoint deadline, |
248 | | std::vector<std::pair<const IndexInfo*, QLWriteRequestPB>>* index_requests, |
249 | | std::unordered_set<TableId>* failed_indexes); |
250 | | |
251 | | Result<std::shared_ptr<client::YBSession>> GetSessionForVerifyOrBackfill( |
252 | | const CoarseTimePoint deadline); |
253 | | |
254 | | CHECKED_STATUS FlushWriteIndexBatchIfRequired( |
255 | | const HybridTime write_time, |
256 | | const CoarseTimePoint deadline, |
257 | | std::vector<std::pair<const IndexInfo*, QLWriteRequestPB>>* index_requests, |
258 | | std::unordered_set<TableId>* failed_indexes); |
259 | | CHECKED_STATUS FlushWriteIndexBatch( |
260 | | const HybridTime write_time, |
261 | | const CoarseTimePoint deadline, |
262 | | std::vector<std::pair<const IndexInfo*, QLWriteRequestPB>>* index_requests, |
263 | | std::unordered_set<TableId>* failed_indexes); |
264 | | |
265 | | template <typename SomeYBqlOp> |
266 | | CHECKED_STATUS FlushWithRetries( |
267 | | std::shared_ptr<client::YBSession> session, |
268 | | const std::vector<std::shared_ptr<SomeYBqlOp>>& index_ops, |
269 | | int num_retries, |
270 | | std::unordered_set<TableId>* failed_indexes); |
271 | | |
272 | | // Mark that the tablet has finished bootstrapping. |
273 | | // This transitions from kBootstrapping to kOpen state. |
274 | | void MarkFinishedBootstrapping(); |
275 | | |
276 | | // This can be called to proactively prevent new operations from being handled, even before |
277 | | // Shutdown() is called. |
278 | | // Returns true if it was the first call to StartShutdown. |
279 | | bool StartShutdown(IsDropTable is_drop_table = IsDropTable::kFalse); |
280 | 171k | bool IsShutdownRequested() const { |
281 | 171k | return shutdown_requested_.load(std::memory_order::memory_order_acquire); |
282 | 171k | } |
283 | | |
284 | | void CompleteShutdown(IsDropTable is_drop_table = IsDropTable::kFalse); |
285 | | |
286 | | CHECKED_STATUS ImportData(const std::string& source_dir); |
287 | | |
288 | | Result<docdb::ApplyTransactionState> ApplyIntents(const TransactionApplyData& data) override; |
289 | | |
290 | | CHECKED_STATUS RemoveIntents(const RemoveIntentsData& data, const TransactionId& id) override; |
291 | | |
292 | | CHECKED_STATUS RemoveIntents( |
293 | | const RemoveIntentsData& data, const TransactionIdSet& transactions) override; |
294 | | |
295 | | CHECKED_STATUS GetIntents( |
296 | | const TransactionId& id, std::vector<docdb::IntentKeyValueForCDC>* keyValueIntents, |
297 | | docdb::ApplyTransactionState* stream_state); |
298 | | |
299 | | // Apply all of the row operations associated with this transaction. |
300 | | CHECKED_STATUS ApplyRowOperations( |
301 | | WriteOperation* operation, |
302 | | AlreadyAppliedToRegularDB already_applied_to_regular_db = AlreadyAppliedToRegularDB::kFalse); |
303 | | |
304 | | CHECKED_STATUS ApplyOperation( |
305 | | const Operation& operation, int64_t batch_idx, |
306 | | const docdb::KeyValueWriteBatchPB& write_batch, |
307 | | AlreadyAppliedToRegularDB already_applied_to_regular_db = AlreadyAppliedToRegularDB::kFalse); |
308 | | |
309 | | // Apply a set of RocksDB row operations. |
310 | | // If rocksdb_write_batch is specified it could contain preencoded RocksDB operations. |
311 | | CHECKED_STATUS ApplyKeyValueRowOperations( |
312 | | int64_t batch_idx, // index of this batch in its transaction |
313 | | const docdb::KeyValueWriteBatchPB& put_batch, |
314 | | const rocksdb::UserFrontiers* frontiers, |
315 | | HybridTime hybrid_time, |
316 | | AlreadyAppliedToRegularDB already_applied_to_regular_db = AlreadyAppliedToRegularDB::kFalse); |
317 | | |
318 | | void WriteToRocksDB( |
319 | | const rocksdb::UserFrontiers* frontiers, |
320 | | rocksdb::WriteBatch* write_batch, |
321 | | docdb::StorageDbType storage_db_type); |
322 | | |
323 | | //------------------------------------------------------------------------------------------------ |
324 | | // Redis Request Processing. |
325 | | // Takes a Redis WriteRequestPB as input with its redis_write_batch. |
326 | | // Constructs a WriteRequestPB containing a serialized WriteBatch that will be |
327 | | // replicated by Raft. (Makes a copy, it is caller's responsibility to deallocate |
328 | | // write_request afterwards if it is no longer needed). |
329 | | // The operation acquires the necessary locks required to correctly serialize concurrent write |
330 | | // operations to same/conflicting part of the key/sub-key space. The locks acquired are returned |
331 | | // via the 'keys_locked' vector, so that they may be unlocked later when the operation has been |
332 | | // committed. |
333 | | void KeyValueBatchFromRedisWriteBatch(std::unique_ptr<WriteQuery> query); |
334 | | |
335 | | CHECKED_STATUS HandleRedisReadRequest( |
336 | | CoarseTimePoint deadline, |
337 | | const ReadHybridTime& read_time, |
338 | | const RedisReadRequestPB& redis_read_request, |
339 | | RedisResponsePB* response) override; |
340 | | |
341 | | //------------------------------------------------------------------------------------------------ |
342 | | // CQL Request Processing. |
343 | | CHECKED_STATUS HandleQLReadRequest( |
344 | | CoarseTimePoint deadline, |
345 | | const ReadHybridTime& read_time, |
346 | | const QLReadRequestPB& ql_read_request, |
347 | | const TransactionMetadataPB& transaction_metadata, |
348 | | QLReadRequestResult* result) override; |
349 | | |
350 | | CHECKED_STATUS CreatePagingStateForRead( |
351 | | const QLReadRequestPB& ql_read_request, const size_t row_count, |
352 | | QLResponsePB* response) const override; |
353 | | |
354 | | // The QL equivalent of KeyValueBatchFromRedisWriteBatch, works similarly. |
355 | | void KeyValueBatchFromQLWriteBatch(std::unique_ptr<WriteQuery> query); |
356 | | |
357 | | //------------------------------------------------------------------------------------------------ |
358 | | // Postgres Request Processing. |
359 | | CHECKED_STATUS HandlePgsqlReadRequest( |
360 | | CoarseTimePoint deadline, |
361 | | const ReadHybridTime& read_time, |
362 | | bool is_explicit_request_read_time, |
363 | | const PgsqlReadRequestPB& pgsql_read_request, |
364 | | const TransactionMetadataPB& transaction_metadata, |
365 | | const SubTransactionMetadataPB& subtransaction_metadata, |
366 | | PgsqlReadRequestResult* result, |
367 | | size_t* num_rows_read) override; |
368 | | |
369 | | CHECKED_STATUS CreatePagingStateForRead( |
370 | | const PgsqlReadRequestPB& pgsql_read_request, const size_t row_count, |
371 | | PgsqlResponsePB* response) const override; |
372 | | |
373 | | CHECKED_STATUS PreparePgsqlWriteOperations(WriteQuery* query); |
374 | | void KeyValueBatchFromPgsqlWriteBatch(std::unique_ptr<WriteQuery> query); |
375 | | |
376 | | // Create a new row iterator which yields the rows as of the current MVCC |
377 | | // state of this tablet. |
378 | | // The returned iterator is not initialized. |
379 | | Result<std::unique_ptr<docdb::YQLRowwiseIteratorIf>> NewRowIterator( |
380 | | const Schema& projection, |
381 | | const ReadHybridTime read_hybrid_time = {}, |
382 | | const TableId& table_id = "", |
383 | | CoarseTimePoint deadline = CoarseTimePoint::max(), |
384 | | AllowBootstrappingState allow_bootstrapping_state = AllowBootstrappingState::kFalse, |
385 | | const Slice& sub_doc_key = Slice()) const; |
386 | | |
387 | | Result<std::unique_ptr<docdb::YQLRowwiseIteratorIf>> NewRowIterator( |
388 | | const TableId& table_id) const; |
389 | | |
390 | | Result<std::unique_ptr<docdb::YQLRowwiseIteratorIf>> CreateCDCSnapshotIterator( |
391 | | const Schema& projection, |
392 | | const ReadHybridTime& time, |
393 | | const string& next_key); |
394 | | //------------------------------------------------------------------------------------------------ |
395 | | // Makes RocksDB Flush. |
396 | | CHECKED_STATUS Flush(FlushMode mode, |
397 | | FlushFlags flags = FlushFlags::kAllDbs, |
398 | | int64_t ignore_if_flushed_after_tick = rocksdb::FlushOptions::kNeverIgnore); |
399 | | |
400 | | CHECKED_STATUS WaitForFlush(); |
401 | | |
402 | | // Prepares the transaction context for the alter schema operation. |
403 | | // An error will be returned if the specified schema is invalid (e.g. |
404 | | // key mismatch, or missing IDs) |
405 | | CHECKED_STATUS CreatePreparedChangeMetadata( |
406 | | ChangeMetadataOperation* operation, |
407 | | const Schema* schema); |
408 | | |
409 | | // Apply the Schema of the specified operation. |
410 | | CHECKED_STATUS AlterSchema(ChangeMetadataOperation* operation); |
411 | | |
412 | | // Used to update the tablets on the index table that the index has been backfilled. |
413 | | // This means that major compactions can now garbage collect delete markers. |
414 | | CHECKED_STATUS MarkBackfillDone(const TableId& table_id = ""); |
415 | | |
416 | | // Change wal_retention_secs in the metadata. |
417 | | CHECKED_STATUS AlterWalRetentionSecs(ChangeMetadataOperation* operation); |
418 | | |
419 | | // Apply replicated add table operation. |
420 | | CHECKED_STATUS AddTable(const TableInfoPB& table_info); |
421 | | |
422 | | // Apply replicated remove table operation. |
423 | | CHECKED_STATUS RemoveTable(const std::string& table_id); |
424 | | |
425 | | // Truncate this tablet by resetting the content of RocksDB. |
426 | | CHECKED_STATUS Truncate(TruncateOperation* operation); |
427 | | |
428 | | // Verbosely dump this entire tablet to the logs. This is only |
429 | | // really useful when debugging unit test failures where the tablet |
430 | | // has a very small number of rows. |
431 | | CHECKED_STATUS DebugDump(vector<std::string>* lines = nullptr); |
432 | | |
433 | | const yb::SchemaPtr schema() const; |
434 | | |
435 | | // Returns a reference to the key projection of the tablet schema. |
436 | | // The schema keys are immutable. |
437 | 0 | const Schema& key_schema() const { return *key_schema_; } |
438 | | |
439 | | // Return the MVCC manager for this tablet. |
440 | 147M | MvccManager* mvcc_manager() { return &mvcc_; } |
441 | | |
442 | 3.24M | docdb::SharedLockManager* shared_lock_manager() { return &shared_lock_manager_; } |
443 | | |
444 | 8.16M | std::atomic<int64_t>* monotonic_counter() { return &monotonic_counter_; } |
445 | | |
446 | | // Set the counter to at least 'value'. |
447 | | void UpdateMonotonicCounter(int64_t value); |
448 | | |
449 | 5.06k | const RaftGroupMetadata *metadata() const { return metadata_.get(); } |
450 | 40.9M | RaftGroupMetadata *metadata() { return metadata_.get(); } |
451 | | |
452 | | rocksdb::Env& rocksdb_env() const; |
453 | | |
454 | | const std::string& tablet_id() const override; |
455 | | |
456 | 9.28M | bool system() const override { |
457 | 9.28M | return false; |
458 | 9.28M | } |
459 | | |
460 | | // Return the metrics for this tablet. |
461 | | // May be nullptr in unit tests, etc. |
462 | 7.67M | TabletMetrics* metrics() { return metrics_.get(); } |
463 | | |
464 | | // Return handle to the metric entity of this tablet/table. |
465 | 601k | const scoped_refptr<MetricEntity>& GetTableMetricsEntity() const { |
466 | 601k | return table_metrics_entity_; |
467 | 601k | } |
468 | 593k | const scoped_refptr<MetricEntity>& GetTabletMetricsEntity() const { |
469 | 593k | return tablet_metrics_entity_; |
470 | 593k | } |
471 | | |
472 | | // Returns a reference to this tablet's memory tracker. |
473 | 3.18M | const std::shared_ptr<MemTracker>& mem_tracker() const { return mem_tracker_; } |
474 | | |
475 | 94.9M | TableType table_type() const override { return table_type_; } |
476 | | |
477 | | // Returns true if a RocksDB-backed tablet has any SSTables. |
478 | | Result<bool> HasSSTables() const; |
479 | | |
480 | | // Returns the maximum persistent op id from all SSTables in RocksDB. |
481 | | // First for regular records and second for intents. |
482 | | // When invalid_if_no_new_data is true, the function returns an invalid op id when no new |
483 | | // data is present in the corresponding db. |
484 | | Result<DocDbOpIds> MaxPersistentOpId(bool invalid_if_no_new_data = false) const; |
485 | | |
486 | | // Returns the maximum persistent hybrid_time across all SSTables in RocksDB. |
487 | | Result<HybridTime> MaxPersistentHybridTime() const; |
488 | | |
489 | | // Returns oldest mutable memtable write hybrid time in RocksDB or HybridTime::kMax if memtable |
490 | | // is empty. |
491 | | Result<HybridTime> OldestMutableMemtableWriteHybridTime() const; |
492 | | |
493 | | // For non-kudu table type fills key-value batch in transaction state request and updates |
494 | | // request in state. Due to acquiring locks it can block the thread. |
495 | | void AcquireLocksAndPerformDocOperations(std::unique_ptr<WriteQuery> query); |
496 | | |
497 | | // Given a proposed "history cutoff" timestamp, returns either that value, if possible, or a |
498 | | // smaller value corresponding to the oldest active reader, whichever is smaller. This ensures |
499 | | // that data needed by active read operations is not compacted away. |
500 | | // |
501 | | // Also updates the "earliest allowed read time" of the tablet to be equal to the returned value, |
502 | | // (if it is still lower than the value about to be returned), so that new readers with timestamps |
503 | | // earlier than that will be rejected. |
504 | | HybridTime UpdateHistoryCutoff(HybridTime proposed_cutoff); |
505 | | |
506 | 4.57M | const scoped_refptr<server::Clock> &clock() const { |
507 | 4.57M | return clock_; |
508 | 4.57M | } |
509 | | |
510 | | SchemaPtr GetSchema(const std::string& table_id = "") const override; |
511 | | |
512 | | Schema GetKeySchema(const std::string& table_id = "") const; |
513 | | |
514 | 10.7M | const docdb::YQLStorageIf& QLStorage() const override { |
515 | 10.7M | return *ql_storage_; |
516 | 10.7M | } |
517 | | |
518 | | // Provide a way for write operations to wait when tablet schema is |
519 | | // being changed. |
520 | | ScopedRWOperationPause PauseWritePermits(CoarseTimePoint deadline); |
521 | | ScopedRWOperation GetPermitToWrite(CoarseTimePoint deadline); |
522 | | |
523 | | // Used from tests |
524 | 0 | const std::shared_ptr<rocksdb::Statistics>& regulardb_statistics() const { |
525 | 0 | return regulardb_statistics_; |
526 | 0 | } |
527 | | |
528 | 0 | const std::shared_ptr<rocksdb::Statistics>& intentsdb_statistics() const { |
529 | 0 | return intentsdb_statistics_; |
530 | 0 | } |
531 | | |
532 | 55.1M | TransactionCoordinator* transaction_coordinator() { |
533 | 55.1M | return transaction_coordinator_.get(); |
534 | 55.1M | } |
535 | | |
536 | 23.0M | TransactionParticipant* transaction_participant() const { |
537 | 23.0M | return transaction_participant_.get(); |
538 | 23.0M | } |
539 | | |
540 | | // Returns true if the tablet was created after a split but it has not yet had data from its |
541 | | // parent which are now outside of its key range removed. |
542 | | Result<bool> StillHasOrphanedPostSplitData(); |
543 | | |
544 | | // Wrapper for StillHasOrphanedPostSplitData. Conservatively returns true if |
545 | | // StillHasOrphanedPostSplitData failed, otherwise returns the result value. |
546 | | bool MayHaveOrphanedPostSplitData(); |
547 | | |
548 | | // If true, we should report, in our heartbeat to the master, that loadbalancer moves should be |
549 | | // disabled. We do so, for example, when StillHasOrphanedPostSplitData() returns true. |
550 | | bool ShouldDisableLbMove(); |
551 | | |
552 | | void ForceRocksDBCompactInTest(); |
553 | | |
554 | | CHECKED_STATUS ForceFullRocksDBCompact(); |
555 | | |
556 | 8.25M | docdb::DocDB doc_db() const { return { regular_db_.get(), intents_db_.get(), &key_bounds_ }; } |
557 | | |
558 | | // Returns approximate middle key for tablet split: |
559 | | // - for hash-based partitions: encoded hash code in order to split by hash code. |
560 | | // - for range-based partitions: encoded doc key in order to split by row. |
561 | | Result<std::string> GetEncodedMiddleSplitKey() const; |
562 | | |
563 | | std::string TEST_DocDBDumpStr(IncludeIntents include_intents = IncludeIntents::kFalse); |
564 | | |
565 | | void TEST_DocDBDumpToContainer( |
566 | | IncludeIntents include_intents, std::unordered_set<std::string>* out); |
567 | | |
568 | | // Dumps DocDB contents to log, every record as a separate log message, with the given prefix. |
569 | | void TEST_DocDBDumpToLog(IncludeIntents include_intents); |
570 | | |
571 | | size_t TEST_CountRegularDBRecords(); |
572 | | |
573 | | CHECKED_STATUS CreateReadIntents( |
574 | | const TransactionMetadataPB& transaction_metadata, |
575 | | const SubTransactionMetadataPB& subtransaction_metadata, |
576 | | const google::protobuf::RepeatedPtrField<QLReadRequestPB>& ql_batch, |
577 | | const google::protobuf::RepeatedPtrField<PgsqlReadRequestPB>& pgsql_batch, |
578 | | docdb::KeyValueWriteBatchPB* out); |
579 | | |
580 | | uint64_t GetCurrentVersionSstFilesSize() const; |
581 | | uint64_t GetCurrentVersionSstFilesUncompressedSize() const; |
582 | | std::pair<uint64_t, uint64_t> GetCurrentVersionSstFilesAllSizes() const; |
583 | | uint64_t GetCurrentVersionNumSSTFiles() const; |
584 | | |
585 | | void ListenNumSSTFilesChanged(std::function<void()> listener); |
586 | | |
587 | | // Returns the number of memtables in intents and regular db-s. |
588 | | std::pair<int, int> GetNumMemtables() const; |
589 | | |
590 | 150k | void SetHybridTimeLeaseProvider(HybridTimeLeaseProvider provider) { |
591 | 150k | ht_lease_provider_ = std::move(provider); |
592 | 150k | } |
593 | | |
594 | 150k | void SetMemTableFlushFilterFactory(std::function<rocksdb::MemTableFilter()> factory) { |
595 | 150k | mem_table_flush_filter_factory_ = std::move(factory); |
596 | 150k | } |
597 | | |
598 | | // When a compaction starts with a particular "history cutoff" timestamp, it calls this function |
599 | | // to disallow reads at a time lower than that history cutoff timestamp, to avoid reading |
600 | | // invalid/incomplete data. |
601 | | // |
602 | | // NOTE(review): this says it returns true if the new history cutoff timestamp was successfully |
603 | | // registered, or false if pending reads at lower timestamps prevent that, yet it returns HybridTime. |
604 | | HybridTime Get(HybridTime lower_bound); |
605 | | |
606 | | bool ShouldApplyWrite(); |
607 | | |
608 | 40 | rocksdb::DB* TEST_db() const { |
609 | 40 | return regular_db_.get(); |
610 | 40 | } |
611 | | |
612 | 0 | rocksdb::DB* TEST_intents_db() const { |
613 | 0 | return intents_db_.get(); |
614 | 0 | } |
615 | | |
616 | | CHECKED_STATUS TEST_SwitchMemtable(); |
617 | | |
618 | | // Initialize RocksDB's max persistent op id and hybrid time to that of the operation state. |
619 | | // Necessary for cases like truncate or restore snapshot when RocksDB is reset. |
620 | | CHECKED_STATUS ModifyFlushedFrontier( |
621 | | const docdb::ConsensusFrontier& value, |
622 | | rocksdb::FrontierModificationMode mode, |
623 | | FlushFlags flags = FlushFlags::kAllDbs); |
624 | | |
625 | | // Get the isolation level of the given transaction from the metadata stored in the provisional |
626 | | // records RocksDB. |
627 | | Result<IsolationLevel> GetIsolationLevel(const TransactionMetadataPB& transaction) override; |
628 | | |
629 | | // Creates an on-disk sub tablet of this tablet with specified ID, partition and key bounds. |
630 | | // Flushes this tablet data onto disk before creating sub tablet. |
631 | | // Also updates flushed frontier for regular and intents DBs to match split_op_id and |
632 | | // split_op_hybrid_time. |
633 | | // In case of error sub-tablet could be partially persisted on disk. |
634 | | Result<RaftGroupMetadataPtr> CreateSubtablet( |
635 | | const TabletId& tablet_id, const Partition& partition, const docdb::KeyBounds& key_bounds, |
636 | | const yb::OpId& split_op_id, const HybridTime& split_op_hybrid_time); |
637 | | |
638 | | // Scans the intent db. Potentially takes a long time. Used for testing/debugging. |
639 | | Result<int64_t> CountIntents(); |
640 | | |
641 | | // Flushes intents db if necessary. |
642 | | void FlushIntentsDbIfNecessary(const yb::OpId& lastest_log_entry_op_id); |
643 | | |
644 | 1.00M | bool is_sys_catalog() const { return is_sys_catalog_; } |
645 | | bool IsTransactionalRequest(bool is_ysql_request) const override; |
646 | | |
647 | | void SetCleanupPool(ThreadPool* thread_pool); |
648 | | |
649 | 5.98k | TabletSnapshots& snapshots() { |
650 | 5.98k | return *snapshots_; |
651 | 5.98k | } |
652 | | |
653 | 2.94k | SnapshotCoordinator* snapshot_coordinator() { |
654 | 2.94k | return snapshot_coordinator_; |
655 | 2.94k | } |
656 | | |
657 | 0 | docdb::YQLRowwiseIteratorIf* cdc_iterator() { |
658 | 0 | return cdc_iterator_; |
659 | 0 | } |
660 | | |
661 | | // Allows us to add tablet-specific information that will get deref'd when the tablet does. |
662 | 444 | void AddAdditionalMetadata(const std::string& key, std::shared_ptr<void> additional_metadata) { |
663 | 444 | std::lock_guard<std::mutex> lock(control_path_mutex_); |
664 | 444 | additional_metadata_.emplace(key, std::move(additional_metadata)); |
665 | 444 | } |
666 | | |
667 | 881 | std::shared_ptr<void> GetAdditionalMetadata(const std::string& key) { |
668 | 881 | std::lock_guard<std::mutex> lock(control_path_mutex_); |
669 | 881 | auto val = additional_metadata_.find(key); |
670 | 881 | return (val != additional_metadata_.end()) ? val->second437 : nullptr444 ; |
671 | 881 | } |
672 | | |
673 | | void InitRocksDBOptions( |
674 | | rocksdb::Options* options, const std::string& log_prefix, |
675 | | rocksdb::BlockBasedTableOptions table_options = rocksdb::BlockBasedTableOptions()); |
676 | | |
677 | 45.0M | TabletRetentionPolicy* RetentionPolicy() override { |
678 | 45.0M | return retention_policy_.get(); |
679 | 45.0M | } |
680 | | |
681 | | // Triggers a compaction on this tablet if it is the result of a tablet split but has not yet been |
682 | | // compacted. Assumes ownership of the provided thread pool token, and uses it to submit the |
683 | | // compaction task. It is an error to call this method if a post-split compaction has been |
684 | | // triggered previously by this tablet. |
685 | | CHECKED_STATUS TriggerPostSplitCompactionIfNeeded( |
686 | | std::function<std::unique_ptr<ThreadPoolToken>()> get_token_for_compaction); |
687 | | |
688 | | // Verifies the data on this tablet for consistency. Returns status OK if checks pass. |
689 | | CHECKED_STATUS VerifyDataIntegrity(); |
690 | | |
691 | | CHECKED_STATUS CheckOperationAllowed(const OpId& op_id, consensus::OperationType op_type) |
692 | | EXCLUDES(operation_filters_mutex_); |
693 | | |
694 | | void RegisterOperationFilter(OperationFilter* filter) EXCLUDES(operation_filters_mutex_); |
695 | | void UnregisterOperationFilter(OperationFilter* filter) EXCLUDES(operation_filters_mutex_); |
696 | | |
697 | | void SplitDone(); |
698 | | CHECKED_STATUS RestoreStarted(const TxnSnapshotRestorationId& restoration_id); |
699 | | CHECKED_STATUS RestoreFinished( |
700 | | const TxnSnapshotRestorationId& restoration_id, HybridTime restoration_hybrid_time); |
701 | | CHECKED_STATUS CheckRestorations(const RestorationCompleteTimeMap& restoration_complete_time); |
702 | | |
703 | 3.24M | bool txns_enabled() const { |
704 | 3.24M | return txns_enabled_; |
705 | 3.24M | } |
706 | | |
707 | 15.1k | client::YBClient& client() { |
708 | 15.1k | return *client_future_.get(); |
709 | 15.1k | } |
710 | | |
711 | 27.7k | client::TransactionManager* transaction_manager() { |
712 | 27.7k | return transaction_manager_.get(); |
713 | 27.7k | } |
714 | | |
715 | | // Creates a new shared pointer of the object managed by metadata_cache_. This is done |
716 | | // atomically to avoid race conditions. |
717 | | std::shared_ptr<client::YBMetaDataCache> YBMetaDataCache(); |
718 | | |
719 | | ScopedRWOperation CreateNonAbortableScopedRWOperation( |
720 | | const CoarseTimePoint deadline = CoarseTimePoint()) const; |
721 | | |
722 | | Result<TransactionOperationContext> CreateTransactionOperationContext( |
723 | | const TransactionMetadataPB& transaction_metadata, |
724 | | bool is_ysql_catalog_table, |
725 | | const SubTransactionMetadataPB* subtransaction_metadata = nullptr) const; |
726 | | |
727 | 4.50M | const Schema* unique_index_key_schema() const { |
728 | 4.50M | return unique_index_key_schema_.get(); |
729 | 4.50M | } |
730 | | |
731 | | private: |
732 | | friend class Iterator; |
733 | | friend class TabletPeerTest; |
734 | | friend class ScopedReadOperation; |
735 | | friend class TabletComponent; |
736 | | |
737 | | class RegularRocksDbListener; |
738 | | |
739 | | FRIEND_TEST(TestTablet, TestGetLogRetentionSizeForIndex); |
740 | | |
741 | | CHECKED_STATUS OpenKeyValueTablet(); |
742 | | virtual CHECKED_STATUS CreateTabletDirectories(const string& db_dir, FsManager* fs); |
743 | | |
744 | | std::vector<yb::ColumnSchema> GetColumnSchemasForIndex(const std::vector<IndexInfo>& indexes); |
745 | | |
746 | | void DocDBDebugDump(std::vector<std::string> *lines); |
747 | | |
748 | | CHECKED_STATUS WriteTransactionalBatch( |
749 | | int64_t batch_idx, // index of this batch in its transaction |
750 | | const docdb::KeyValueWriteBatchPB& put_batch, |
751 | | HybridTime hybrid_time, |
752 | | const rocksdb::UserFrontiers* frontiers); |
753 | | |
754 | | Result<TransactionOperationContext> CreateTransactionOperationContext( |
755 | | const boost::optional<TransactionId>& transaction_id, |
756 | | bool is_ysql_catalog_table, |
757 | | const SubTransactionMetadataPB* subtransaction_metadata = nullptr) const; |
758 | | |
759 | | // Pause abortable/non-abortable new read/write operations and wait for all |
760 | | // abortable/non-abortable pending read/write operations to finish. |
761 | | // If stop is false, ScopedRWOperation constructor will wait while ScopedRWOperationPause is |
762 | | // alive. |
763 | | // If stop is true, ScopedRWOperation constructor will create an instance with an error (see |
764 | | // ScopedRWOperation::ok()) while ScopedRWOperationPause is alive. |
765 | | ScopedRWOperationPause PauseReadWriteOperations( |
766 | | Abortable abortable, Stop stop = Stop::kFalse); |
767 | | |
768 | | // Pauses new non-abortable read/write operations and waits for all of those that are pending to |
769 | | // complete. |
770 | | // Starts RocksDB shutdown (that will abort abortable read/write operations). |
771 | | // Pauses new abortable read/write operations and waits for all of those that are pending to |
772 | | // complete. |
773 | | // Returns TabletScopedRWOperationPauses that are preventing new read/write operations from being |
774 | | // started. |
775 | | Result<TabletScopedRWOperationPauses> StartShutdownRocksDBs( |
776 | | DisableFlushOnShutdown disable_flush_on_shutdown, Stop stop = Stop::kFalse); |
777 | | |
778 | | CHECKED_STATUS CompleteShutdownRocksDBs( |
779 | | Destroy destroy, TabletScopedRWOperationPauses* ops_pauses); |
780 | | |
781 | | ScopedRWOperation CreateAbortableScopedRWOperation( |
782 | | const CoarseTimePoint deadline = CoarseTimePoint()) const; |
783 | | |
784 | | CHECKED_STATUS DoEnableCompactions(); |
785 | | |
786 | | std::string LogPrefix() const; |
787 | | |
788 | | std::string LogPrefix(docdb::StorageDbType db_type) const; |
789 | | |
790 | | Result<bool> IsQueryOnlyForTablet(const PgsqlReadRequestPB& pgsql_read_request, |
791 | | size_t row_count) const; |
792 | | |
793 | | Result<bool> HasScanReachedMaxPartitionKey( |
794 | | const PgsqlReadRequestPB& pgsql_read_request, |
795 | | const string& partition_key, |
796 | | size_t row_count) const; |
797 | | |
798 | | // Sets metadata_cache_ to nullptr. This is done atomically to avoid race conditions. |
799 | | void ResetYBMetaDataCache(); |
800 | | |
801 | | // Creates a new client::YBMetaDataCache object and atomically assigns it to metadata_cache_. |
802 | | void CreateNewYBMetaDataCache(); |
803 | | |
804 | | void TriggerPostSplitCompactionSync(); |
805 | | |
806 | | // Opens read-only rocksdb at the specified directory and checks for any file corruption. |
807 | | CHECKED_STATUS OpenDbAndCheckIntegrity(const std::string& db_dir); |
808 | | |
809 | | // Add or remove restoring operation filter if necessary. |
810 | | // If reset_split is true, also reset split state. |
811 | | void SyncRestoringOperationFilter(ResetSplit reset_split) EXCLUDES(operation_filters_mutex_); |
812 | | void UnregisterOperationFilterUnlocked(OperationFilter* filter) |
813 | | REQUIRES(operation_filters_mutex_); |
814 | | |
815 | | std::unique_ptr<const Schema> key_schema_; |
816 | | |
817 | | RaftGroupMetadataPtr metadata_; |
818 | | TableType table_type_; |
819 | | |
820 | | // Lock protecting access to the 'components_' member (i.e. the rowsets in the tablet) |
821 | | // |
822 | | // Shared mode: |
823 | | // - Writers take this in shared mode at the same time as they obtain an MVCC hybrid_time |
824 | | // and capture a reference to components_. This ensures that we can use the MVCC hybrid_time |
825 | | // to determine which writers are writing to which components during compaction. |
826 | | // - Readers take this in shared mode while capturing their iterators. This ensures that |
827 | | // they see a consistent view when racing against flush/compact. |
828 | | // |
829 | | // Exclusive mode: |
830 | | // - Flushes/compactions take this lock in order to lock out concurrent updates. |
831 | | // |
832 | | // NOTE: callers should avoid taking this lock for a long time, even in shared mode. |
833 | | // This is because the lock has some concept of fairness -- if, while a long reader |
834 | | // is active, a writer comes along, then all future short readers will be blocked. |
835 | | // TODO: now that this is single-threaded again, we should change it to rw_spinlock |
836 | | mutable rw_spinlock component_lock_; |
837 | | |
838 | | scoped_refptr<log::LogAnchorRegistry> log_anchor_registry_; |
839 | | std::shared_ptr<MemTracker> mem_tracker_; |
840 | | std::shared_ptr<MemTracker> block_based_table_mem_tracker_; |
841 | | |
842 | | MetricEntityPtr tablet_metrics_entity_; |
843 | | MetricEntityPtr table_metrics_entity_; |
844 | | std::unique_ptr<TabletMetrics> metrics_; |
845 | | std::shared_ptr<void> metric_detacher_; |
846 | | |
847 | | // A pointer to the server's clock. |
848 | | scoped_refptr<server::Clock> clock_; |
849 | | |
850 | | MvccManager mvcc_; |
851 | | |
852 | | // Lock used to serialize the creation of RocksDB checkpoints. |
853 | | mutable std::mutex create_checkpoint_lock_; |
854 | | |
855 | | enum State { |
856 | | kInitialized, |
857 | | kBootstrapping, |
858 | | kOpen, |
859 | | kShutdown |
860 | | }; |
861 | | State state_ = kInitialized; |
862 | | |
863 | | // Fault hooks. In production code, these will always be nullptr. |
864 | | std::shared_ptr<CompactionFaultHooks> compaction_hooks_; |
865 | | std::shared_ptr<FlushFaultHooks> flush_hooks_; |
866 | | std::shared_ptr<FlushCompactCommonHooks> common_hooks_; |
867 | | |
868 | | // Statistics for the RocksDB database. |
869 | | std::shared_ptr<rocksdb::Statistics> regulardb_statistics_; |
870 | | std::shared_ptr<rocksdb::Statistics> intentsdb_statistics_; |
871 | | |
872 | | // RocksDB database instances for key-value tables. |
873 | | std::unique_ptr<rocksdb::DB> regular_db_; |
874 | | std::unique_ptr<rocksdb::DB> intents_db_; |
875 | | std::atomic<bool> rocksdb_shutdown_requested_{false}; |
876 | | |
877 | | // Optional key bounds (see docdb::KeyBounds) served by this tablet. |
878 | | docdb::KeyBounds key_bounds_; |
879 | | |
880 | | std::unique_ptr<docdb::YQLStorageIf> ql_storage_; |
881 | | |
882 | | // This is for docdb fine-grained locking. |
883 | | docdb::SharedLockManager shared_lock_manager_; |
884 | | |
885 | | // For the block cache and memory manager shared across tablets |
886 | | const TabletOptions tablet_options_; |
887 | | |
888 | | // A lightweight way to reject new operations when the tablet is shutting down. This is used to |
889 | | // prevent race conditions between destroying the RocksDB instance and read/write operations. |
890 | | std::atomic_bool shutdown_requested_{false}; |
891 | | |
892 | | // This is a special atomic counter per tablet that increases monotonically. |
893 | | // It is like a timestamp, but doesn't need locks to read or update. |
894 | | // This is raft replicated as well. Each replicate message contains the current number. |
895 | | // It is guaranteed to keep increasing for committed entries even across tablet server |
896 | | // restarts and leader changes. |
897 | | std::atomic<int64_t> monotonic_counter_{0}; |
898 | | |
899 | | // Number of pending non-abortable operations. We use this to make sure we don't shut down RocksDB |
900 | | // before all non-abortable pending operations are finished. We don't have a strict definition of |
901 | | // an "operation" for the purpose of this counter. We simply wait for this counter to go to zero |
902 | | // before starting RocksDB shutdown. |
903 | | // Note: as of 2021-06-28 applying of Raft operations could not handle errors that happened due to |
904 | | // RocksDB shutdown. |
905 | | // |
906 | | // This is marked mutable because read path member functions (which are const) are using this. |
907 | | mutable RWOperationCounter pending_non_abortable_op_counter_; |
908 | | |
909 | | // Similar to pending_non_abortable_op_counter_ but for operations that could be aborted, i.e. |
910 | | // operations that could handle RocksDB shutdown during their execution, for example manual |
911 | | // compactions. |
912 | | // We wait for this counter to go to zero after starting RocksDB shutdown and before destroying |
913 | | // RocksDB in-memory instance. |
914 | | mutable RWOperationCounter pending_abortable_op_counter_; |
915 | | |
916 | | // Used by Alter/Schema-change ops to pause new write ops from being submitted. |
917 | | RWOperationCounter write_ops_being_submitted_counter_; |
918 | | |
919 | | std::unique_ptr<TransactionCoordinator> transaction_coordinator_; |
920 | | |
921 | | std::unique_ptr<TransactionParticipant> transaction_participant_; |
922 | | |
923 | | std::shared_future<client::YBClient*> client_future_; |
924 | | |
925 | | // Created only when secondary indexes are present. |
926 | | std::unique_ptr<client::TransactionManager> transaction_manager_; |
927 | | |
928 | | // This object should not be accessed directly to avoid race conditions. |
929 | | // Use methods YBMetaDataCache, CreateNewYBMetaDataCache, and ResetYBMetaDataCache to read it |
930 | | // and modify it. |
931 | | std::shared_ptr<client::YBMetaDataCache> metadata_cache_; |
932 | | |
933 | | // Created only if it is a unique index tablet. |
934 | | std::unique_ptr<Schema> unique_index_key_schema_; |
935 | | |
936 | | std::atomic<int64_t> last_committed_write_index_{0}; |
937 | | |
938 | | HybridTimeLeaseProvider ht_lease_provider_; |
939 | | |
940 | | Result<HybridTime> DoGetSafeTime( |
941 | | RequireLease require_lease, HybridTime min_allowed, CoarseTimePoint deadline) const override; |
942 | | |
943 | | Result<bool> IntentsDbFlushFilter(const rocksdb::MemTable& memtable); |
944 | | |
945 | | template <class Ids> |
946 | | CHECKED_STATUS RemoveIntentsImpl(const RemoveIntentsData& data, const Ids& ids); |
947 | | |
948 | | // Tries to find intent .SST files that could be deleted and remove them. |
949 | | void CleanupIntentFiles(); |
950 | | void DoCleanupIntentFiles(); |
951 | | |
952 | | void RegularDbFilesChanged(); |
953 | | |
954 | | Result<HybridTime> ApplierSafeTime(HybridTime min_allowed, CoarseTimePoint deadline) override; |
955 | | |
956 | 74 | void MinRunningHybridTimeSatisfied() override { |
957 | 74 | CleanupIntentFiles(); |
958 | 74 | } |
959 | | |
960 | | template <class F> |
961 | | auto GetRegularDbStat(const F& func, const decltype(func())& default_value) const; |
962 | | |
963 | | std::function<rocksdb::MemTableFilter()> mem_table_flush_filter_factory_; |
964 | | |
965 | | client::LocalTabletFilter local_tablet_filter_; |
966 | | |
967 | | // This is typically "P <peer_id>", so we can get a log prefix "T <tablet_id> P <peer_id>: ". |
968 | | std::string log_prefix_suffix_; |
969 | | |
970 | | IsSysCatalogTablet is_sys_catalog_; |
971 | | TransactionsEnabled txns_enabled_; |
972 | | |
973 | | std::unique_ptr<ThreadPoolToken> cleanup_intent_files_token_; |
974 | | |
975 | | std::unique_ptr<TabletSnapshots> snapshots_; |
976 | | |
977 | | SnapshotCoordinator* snapshot_coordinator_ = nullptr; |
978 | | |
979 | | docdb::YQLRowwiseIteratorIf* cdc_iterator_ = nullptr; |
980 | | |
981 | | mutable std::mutex control_path_mutex_; |
982 | | std::unordered_map<std::string, std::shared_ptr<void>> additional_metadata_ |
983 | | GUARDED_BY(control_path_mutex_); |
984 | | |
985 | | std::mutex num_sst_files_changed_listener_mutex_; |
986 | | std::function<void()> num_sst_files_changed_listener_ |
987 | | GUARDED_BY(num_sst_files_changed_listener_mutex_); |
988 | | |
989 | | std::shared_ptr<TabletRetentionPolicy> retention_policy_; |
990 | | |
991 | | // Thread pool token for manually triggering compactions for tablets created from a split. This |
992 | | // member is set when a post-split compaction is triggered on this tablet as the result of a call |
993 | | // to TriggerPostSplitCompactionIfNeeded. It is an error to attempt to trigger another post-split |
994 | | // compaction if this member is already set, as the existence of this member implies that such a |
995 | | // compaction has already been triggered for this instance. |
996 | | std::unique_ptr<ThreadPoolToken> post_split_compaction_task_pool_token_ = nullptr; |
997 | | |
998 | | simple_spinlock operation_filters_mutex_; |
999 | | |
1000 | | boost::intrusive::list<OperationFilter> operation_filters_ GUARDED_BY(operation_filters_mutex_); |
1001 | | |
1002 | | std::unique_ptr<OperationFilter> completed_split_operation_filter_ |
1003 | | GUARDED_BY(operation_filters_mutex_); |
1004 | | std::unique_ptr<log::LogAnchor> completed_split_log_anchor_ GUARDED_BY(operation_filters_mutex_); |
1005 | | |
1006 | | std::unique_ptr<OperationFilter> restoring_operation_filter_ GUARDED_BY(operation_filters_mutex_); |
1007 | | |
1008 | | DISALLOW_COPY_AND_ASSIGN(Tablet); |
1009 | | }; |
1010 | | |
1011 | | // A helper class to manage read transactions. Grabs and registers a read point with the tablet |
1012 | | // when created, and deregisters the read point when this object is destructed. |
1013 | | class ScopedReadOperation { |
1014 | | public: |
1015 | 11.9M | ScopedReadOperation() : tablet_(nullptr) {} |
1016 | | ScopedReadOperation(ScopedReadOperation&& rhs) |
1017 | 20.0M | : tablet_(rhs.tablet_), read_time_(rhs.read_time_) { |
1018 | 20.0M | rhs.tablet_ = nullptr; |
1019 | 20.0M | } |
1020 | | |
1021 | | void operator=(ScopedReadOperation&& rhs); |
1022 | | |
1023 | | static Result<ScopedReadOperation> Create( |
1024 | | AbstractTablet* tablet, |
1025 | | RequireLease require_lease, |
1026 | | ReadHybridTime read_time); |
1027 | | |
1028 | | ScopedReadOperation(const ScopedReadOperation&) = delete; |
1029 | | void operator=(const ScopedReadOperation&) = delete; |
1030 | | |
1031 | | ~ScopedReadOperation(); |
1032 | | |
1033 | 10.0M | const ReadHybridTime& read_time() const { return read_time_; } |
1034 | | |
1035 | 0 | Status status() const { return status_; } |
1036 | | |
1037 | | void Reset(); |
1038 | | |
1039 | | private: |
1040 | | explicit ScopedReadOperation( |
1041 | | AbstractTablet* tablet, const ReadHybridTime& read_time); |
1042 | | |
1043 | | AbstractTablet* tablet_; |
1044 | | ReadHybridTime read_time_; |
1045 | | Status status_; |
1046 | | }; |
1047 | | |
1048 | | bool IsSchemaVersionCompatible( |
1049 | | uint32_t current_version, uint32_t request_version, bool compatible_with_previous_version); |
1050 | | |
1051 | | } // namespace tablet |
1052 | | } // namespace yb |
1053 | | |
1054 | | #endif // YB_TABLET_TABLET_H_ |