// /Users/deen/code/yugabyte-db/src/yb/tablet/tablet.h
//
// Source (jump to first uncovered line)
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//
#ifndef YB_TABLET_TABLET_H_
#define YB_TABLET_TABLET_H_

#include <boost/intrusive/list.hpp>

#include "yb/common/common_fwd.h"
#include "yb/common/read_hybrid_time.h"
#include "yb/common/snapshot.h"
#include "yb/common/transaction.h"

#include "yb/consensus/consensus_fwd.h"
#include "yb/consensus/consensus_types.pb.h"

#include "yb/docdb/docdb_fwd.h"
#include "yb/docdb/docdb_types.h"
#include "yb/docdb/key_bounds.h"
#include "yb/docdb/shared_lock_manager.h"

#include "yb/gutil/ref_counted.h"

#include "yb/rocksdb/rocksdb_fwd.h"
#include "yb/rocksdb/options.h"
#include "yb/rocksdb/table.h"
#include "yb/rocksdb/types.h"

#include "yb/tablet/tablet_fwd.h"
#include "yb/tablet/abstract_tablet.h"
#include "yb/tablet/mvcc.h"
#include "yb/tablet/operation_filter.h"
#include "yb/tablet/tablet_options.h"
#include "yb/tablet/transaction_intent_applier.h"

#include "yb/util/status_fwd.h"
#include "yb/util/enums.h"
#include "yb/util/locks.h"
#include "yb/util/net/net_fwd.h"
#include "yb/util/operation_counter.h"
#include "yb/util/strongly_typed_bool.h"
#include "yb/util/threadpool.h"

namespace yb {

class FsManager;
class MemTracker;
class MetricEntity;
class RowChangeList;

namespace server {
class Clock;
}

namespace tablet {

YB_STRONGLY_TYPED_BOOL(IncludeIntents);
YB_STRONGLY_TYPED_BOOL(Abortable);

// Bitwise OR of two FlushFlags values, computed on their underlying integer representation.
inline FlushFlags operator|(FlushFlags lhs, FlushFlags rhs) {
  const auto combined_bits = to_underlying(lhs) | to_underlying(rhs);
  return static_cast<FlushFlags>(combined_bits);
}

// Bitwise AND of two FlushFlags values, computed on their underlying integer representation.
inline FlushFlags operator&(FlushFlags lhs, FlushFlags rhs) {
  const auto common_bits = to_underlying(lhs) & to_underlying(rhs);
  return static_cast<FlushFlags>(common_bits);
}

// Returns true when lhs and rhs have at least one flag in common, i.e. when their bitwise AND
// differs from FlushFlags::kNone.
inline bool HasFlags(FlushFlags lhs, FlushFlags rhs) {
  const FlushFlags shared = lhs & rhs;
  return shared != FlushFlags::kNone;
}

class WriteOperation;

using AddTableListener = std::function<Status(const TableInfo&)>;

// Reference-counted interface that exposes a string key.
// The destructor is protected; RefCountedThreadSafe is declared a friend so that it can
// destroy instances.
class TabletScopedIf : public RefCountedThreadSafe<TabletScopedIf> {
 public:
  // Pure virtual: every concrete subclass has to supply its key.
  virtual std::string Key() const = 0;

 protected:
  friend class RefCountedThreadSafe<TabletScopedIf>;

  virtual ~TabletScopedIf() = default;
};

YB_STRONGLY_TYPED_BOOL(AllowBootstrappingState);
YB_STRONGLY_TYPED_BOOL(ResetSplit);

struct TabletScopedRWOperationPauses {
  ScopedRWOperationPause abortable;
  ScopedRWOperationPause non_abortable;

  std::array<ScopedRWOperationPause*, 2> AsArray() {
    return {&abortable, &non_abortable};
  }
};

class Tablet : public AbstractTablet, public TransactionIntentApplier {
 public:
  class CompactionFaultHooks;
  class FlushCompactCommonHooks;
  class FlushFaultHooks;

  // A function that returns the current majority-replicated hybrid time leader lease, or waits
  // until a hybrid time leader lease with at least the given hybrid time is acquired
  // (first argument), or a timeout occurs (second argument). HybridTime::kInvalid is returned
  // in case of a timeout.
  using HybridTimeLeaseProvider = std::function<Result<FixedHybridTimeLease>(
      HybridTime, CoarseTimePoint)>;
  using TransactionIdSet = std::unordered_set<TransactionId, TransactionIdHash>;

  // Create a new tablet.
  //
  // If 'metric_registry' is non-nullptr, then this tablet will create a 'tablet' entity
  // within the provided registry. Otherwise, no metrics are collected.
  explicit Tablet(const TabletInitData& data);

  ~Tablet();

  // Open the tablet.
  // Upon completion, the tablet enters the kBootstrapping state.
  CHECKED_STATUS Open();

  CHECKED_STATUS EnableCompactions(ScopedRWOperationPause* non_abortable_ops_pause);

  // Performs backfill for the key range beginning from the row immediately after
  // <backfill_from>, until either it reaches the end of the tablet
  //    or the current time is past deadline.
  // *<number_of_rows_processed> will be set to the number of rows backfilled.
  // <backfilled_until> will be set to the first row that was not backfilled, so that the
  //    next API call can resume from where the backfill was left off.
  //    Note that <backfilled_until> only applies to the non-failing indexes.
  CHECKED_STATUS BackfillIndexesForYsql(
      const std::vector<IndexInfo>& indexes,
      const std::string& backfill_from,
      const CoarseTimePoint deadline,
      const HybridTime read_time,
      const HostPort& pgsql_proxy_bind_address,
      const std::string& database_name,
      const uint64_t postgres_auth_key,
      size_t* number_of_rows_processed,
      std::string* backfilled_until);

  CHECKED_STATUS VerifyIndexTableConsistencyForCQL(
      const std::vector<IndexInfo>& indexes,
      const std::string& start_key,
      const int num_rows,
      const CoarseTimePoint deadline,
      const HybridTime read_time,
      std::unordered_map<TableId, uint64>* consistency_stats,
      std::string* verified_until);

  CHECKED_STATUS VerifyMainTableConsistencyForCQL(
      const TableId& main_table_id,
      const std::string& start_key,
      const int num_rows,
      const CoarseTimePoint deadline,
      const HybridTime read_time,
      std::unordered_map<TableId, uint64>* consistency_stats,
      std::string* verified_until);

  CHECKED_STATUS VerifyTableConsistencyForCQL(
      const std::vector<TableId>& table_ids,
      const std::vector<yb::ColumnSchema>& columns,
      const std::string& start_key,
      const int num_rows,
      const CoarseTimePoint deadline,
      const HybridTime read_time,
      const bool is_main_table,
      std::unordered_map<TableId, uint64>* consistency_stats,
      std::string* verified_until);

  CHECKED_STATUS VerifyTableInBatches(
      const QLTableRow& row,
      const std::vector<TableId>& table_ids,
      const HybridTime read_time,
      const CoarseTimePoint deadline,
      const bool is_main_table,
      std::vector<std::pair<const TableId, QLReadRequestPB>>* requests,
      CoarseTimePoint* last_flushed_at,
      std::unordered_set<TableId>* failed_indexes,
      std::unordered_map<TableId, uint64>* consistency_stats);

  CHECKED_STATUS FlushVerifyBatchIfRequired(
      const HybridTime read_time,
      const CoarseTimePoint deadline,
      std::vector<std::pair<const TableId, QLReadRequestPB>>* requests,
      CoarseTimePoint* last_flushed_at,
      std::unordered_set<TableId>* failed_indexes,
      std::unordered_map<TableId, uint64>* index_consistency_states);
  CHECKED_STATUS FlushVerifyBatch(
      const HybridTime read_time,
      const CoarseTimePoint deadline,
      std::vector<std::pair<const TableId, QLReadRequestPB>>* requests,
      CoarseTimePoint* last_flushed_at,
      std::unordered_set<TableId>* failed_indexes,
      std::unordered_map<TableId, uint64>* index_consistency_states);

  // Performs backfill for the key range beginning from the row <backfill_from>,
  // until either it reaches the end of the tablet
  //    or the current time is past deadline.
  // *<number_of_rows_processed> will be set to the number of rows backfilled.
  // <backfilled_until> will be set to the first row that was not backfilled, so that the
  //    next API call can resume from where the backfill was left off.
  //    Note that <backfilled_until> only applies to the non-failing indexes.
  // <failed_indexes> will be updated with the collection of index-ids for which any errors
  //    were encountered.
  CHECKED_STATUS BackfillIndexes(
      const std::vector<IndexInfo>& indexes,
      const std::string& backfill_from,
      const CoarseTimePoint deadline,
      const HybridTime read_time,
      size_t* number_of_rows_processed,
      std::string* backfilled_until,
      std::unordered_set<TableId>* failed_indexes);

  CHECKED_STATUS UpdateIndexInBatches(
      const QLTableRow& row,
      const std::vector<IndexInfo>& indexes,
      const HybridTime write_time,
      const CoarseTimePoint deadline,
      std::vector<std::pair<const IndexInfo*, QLWriteRequestPB>>* index_requests,
      std::unordered_set<TableId>* failed_indexes);

  Result<std::shared_ptr<client::YBSession>> GetSessionForVerifyOrBackfill(
      const CoarseTimePoint deadline);

  CHECKED_STATUS FlushWriteIndexBatchIfRequired(
      const HybridTime write_time,
      const CoarseTimePoint deadline,
      std::vector<std::pair<const IndexInfo*, QLWriteRequestPB>>* index_requests,
      std::unordered_set<TableId>* failed_indexes);
  CHECKED_STATUS FlushWriteIndexBatch(
      const HybridTime write_time,
      const CoarseTimePoint deadline,
      std::vector<std::pair<const IndexInfo*, QLWriteRequestPB>>* index_requests,
      std::unordered_set<TableId>* failed_indexes);

  template <typename SomeYBqlOp>
  CHECKED_STATUS FlushWithRetries(
      std::shared_ptr<client::YBSession> session,
      const std::vector<std::shared_ptr<SomeYBqlOp>>& index_ops,
      int num_retries,
      std::unordered_set<TableId>* failed_indexes);

  // Mark that the tablet has finished bootstrapping.
  // This transitions from kBootstrapping to kOpen state.
  void MarkFinishedBootstrapping();

  // This can be called to proactively prevent new operations from being handled, even before
  // Shutdown() is called.
  // Returns true if it was the first call to StartShutdown.
  bool StartShutdown(IsDropTable is_drop_table = IsDropTable::kFalse);
  // Returns the current value of shutdown_requested_, loaded with acquire ordering.
  // The namespace-scope constant std::memory_order_acquire is used because it is valid both
  // before and after C++20; the previous spelling std::memory_order::memory_order_acquire is
  // rejected by C++20, where std::memory_order became a scoped enum with renamed enumerators.
  bool IsShutdownRequested() const {
    return shutdown_requested_.load(std::memory_order_acquire);
  }

  void CompleteShutdown(IsDropTable is_drop_table = IsDropTable::kFalse);

  CHECKED_STATUS ImportData(const std::string& source_dir);

  Result<docdb::ApplyTransactionState> ApplyIntents(const TransactionApplyData& data) override;

  CHECKED_STATUS RemoveIntents(const RemoveIntentsData& data, const TransactionId& id) override;

  CHECKED_STATUS RemoveIntents(
      const RemoveIntentsData& data, const TransactionIdSet& transactions) override;

  CHECKED_STATUS GetIntents(
      const TransactionId& id, std::vector<docdb::IntentKeyValueForCDC>* keyValueIntents,
      docdb::ApplyTransactionState* stream_state);

  // Apply all of the row operations associated with this transaction.
  CHECKED_STATUS ApplyRowOperations(
      WriteOperation* operation,
      AlreadyAppliedToRegularDB already_applied_to_regular_db = AlreadyAppliedToRegularDB::kFalse);

  CHECKED_STATUS ApplyOperation(
      const Operation& operation, int64_t batch_idx,
      const docdb::KeyValueWriteBatchPB& write_batch,
      AlreadyAppliedToRegularDB already_applied_to_regular_db = AlreadyAppliedToRegularDB::kFalse);

  // Apply a set of RocksDB row operations.
  // If rocksdb_write_batch is specified it could contain preencoded RocksDB operations.
  CHECKED_STATUS ApplyKeyValueRowOperations(
      int64_t batch_idx, // index of this batch in its transaction
      const docdb::KeyValueWriteBatchPB& put_batch,
      const rocksdb::UserFrontiers* frontiers,
      HybridTime hybrid_time,
      AlreadyAppliedToRegularDB already_applied_to_regular_db = AlreadyAppliedToRegularDB::kFalse);

  void WriteToRocksDB(
      const rocksdb::UserFrontiers* frontiers,
      rocksdb::WriteBatch* write_batch,
      docdb::StorageDbType storage_db_type);

  //------------------------------------------------------------------------------------------------
  // Redis Request Processing.
  // Takes a Redis WriteRequestPB as input with its redis_write_batch.
  // Constructs a WriteRequestPB containing a serialized WriteBatch that will be
  // replicated by Raft. (Makes a copy, it is caller's responsibility to deallocate
  // write_request afterwards if it is no longer needed).
  // The operation acquires the necessary locks required to correctly serialize concurrent write
  // operations to same/conflicting part of the key/sub-key space. The locks acquired are returned
  // via the 'keys_locked' vector, so that they may be unlocked later when the operation has been
  // committed.
  void KeyValueBatchFromRedisWriteBatch(std::unique_ptr<WriteQuery> query);

  CHECKED_STATUS HandleRedisReadRequest(
      CoarseTimePoint deadline,
      const ReadHybridTime& read_time,
      const RedisReadRequestPB& redis_read_request,
      RedisResponsePB* response) override;

  //------------------------------------------------------------------------------------------------
  // CQL Request Processing.
  CHECKED_STATUS HandleQLReadRequest(
      CoarseTimePoint deadline,
      const ReadHybridTime& read_time,
      const QLReadRequestPB& ql_read_request,
      const TransactionMetadataPB& transaction_metadata,
      QLReadRequestResult* result) override;

  CHECKED_STATUS CreatePagingStateForRead(
      const QLReadRequestPB& ql_read_request, const size_t row_count,
      QLResponsePB* response) const override;

  // The QL equivalent of KeyValueBatchFromRedisWriteBatch, works similarly.
  void KeyValueBatchFromQLWriteBatch(std::unique_ptr<WriteQuery> query);

  //------------------------------------------------------------------------------------------------
  // Postgres Request Processing.
  CHECKED_STATUS HandlePgsqlReadRequest(
      CoarseTimePoint deadline,
      const ReadHybridTime& read_time,
      bool is_explicit_request_read_time,
      const PgsqlReadRequestPB& pgsql_read_request,
      const TransactionMetadataPB& transaction_metadata,
      const SubTransactionMetadataPB& subtransaction_metadata,
      PgsqlReadRequestResult* result,
      size_t* num_rows_read) override;

  CHECKED_STATUS CreatePagingStateForRead(
      const PgsqlReadRequestPB& pgsql_read_request, const size_t row_count,
      PgsqlResponsePB* response) const override;

  CHECKED_STATUS PreparePgsqlWriteOperations(WriteQuery* query);
  void KeyValueBatchFromPgsqlWriteBatch(std::unique_ptr<WriteQuery> query);

  // Create a new row iterator which yields the rows as of the current MVCC
  // state of this tablet.
  // The returned iterator is not initialized.
  Result<std::unique_ptr<docdb::YQLRowwiseIteratorIf>> NewRowIterator(
      const Schema& projection,
      const ReadHybridTime read_hybrid_time = {},
      const TableId& table_id = "",
      CoarseTimePoint deadline = CoarseTimePoint::max(),
      AllowBootstrappingState allow_bootstrapping_state = AllowBootstrappingState::kFalse,
      const Slice& sub_doc_key = Slice()) const;

  Result<std::unique_ptr<docdb::YQLRowwiseIteratorIf>> NewRowIterator(
      const TableId& table_id) const;

  // Creates a row iterator for a CDC snapshot over the given projection at the given read time.
  // NOTE(review): 'next_key' is presumably the key the scan should resume from -- confirm
  // against the implementation.
  // The string parameter is spelled std::string, like the rest of this header, so that the
  // declaration does not depend on a using-declaration leaking in from another header.
  Result<std::unique_ptr<docdb::YQLRowwiseIteratorIf>> CreateCDCSnapshotIterator(
      const Schema& projection,
      const ReadHybridTime& time,
      const std::string& next_key);
  //------------------------------------------------------------------------------------------------
  // Makes RocksDB Flush.
  CHECKED_STATUS Flush(FlushMode mode,
                       FlushFlags flags = FlushFlags::kAllDbs,
                       int64_t ignore_if_flushed_after_tick = rocksdb::FlushOptions::kNeverIgnore);

  CHECKED_STATUS WaitForFlush();

  // Prepares the transaction context for the alter schema operation.
  // An error will be returned if the specified schema is invalid (e.g.
  // key mismatch, or missing IDs)
  CHECKED_STATUS CreatePreparedChangeMetadata(
      ChangeMetadataOperation* operation,
      const Schema* schema);

  // Apply the Schema of the specified operation.
  CHECKED_STATUS AlterSchema(ChangeMetadataOperation* operation);

  // Used to update the tablets on the index table that the index has been backfilled.
  // This means that major compactions can now garbage collect delete markers.
  CHECKED_STATUS MarkBackfillDone(const TableId& table_id = "");

  // Change wal_retention_secs in the metadata.
  CHECKED_STATUS AlterWalRetentionSecs(ChangeMetadataOperation* operation);

  // Apply replicated add table operation.
  CHECKED_STATUS AddTable(const TableInfoPB& table_info);

  // Apply replicated remove table operation.
  CHECKED_STATUS RemoveTable(const std::string& table_id);

  // Truncate this tablet by resetting the content of RocksDB.
  CHECKED_STATUS Truncate(TruncateOperation* operation);

  // Verbosely dump this entire tablet to the logs. This is only
  // really useful when debugging unit tests failures where the tablet
  // has a very small number of rows.
  // 'lines' is optional and defaults to nullptr.
  // NOTE(review): presumably the dump is appended to 'lines' when it is non-null -- confirm
  // against the implementation.
  // The container is spelled std::vector so that this header does not depend on a
  // using-declaration from another header.
  CHECKED_STATUS DebugDump(std::vector<std::string>* lines = nullptr);

  const yb::SchemaPtr schema() const;

  // Accessor for the key projection of the tablet schema (the object owned by key_schema_).
  // The schema keys are immutable.
  const Schema& key_schema() const {
    return *key_schema_;
  }

  // Accessor for this tablet's MVCC manager; hands out the address of the mvcc_ member.
  MvccManager* mvcc_manager() {
    return &mvcc_;
  }

  // Hands out the address of the shared_lock_manager_ member.
  docdb::SharedLockManager* shared_lock_manager() {
    return &shared_lock_manager_;
  }

  // Hands out the address of the atomic monotonic_counter_ member.
  std::atomic<int64_t>* monotonic_counter() {
    return &monotonic_counter_;
  }

  // Set the counter to at least 'value'.
  void UpdateMonotonicCounter(int64_t value);

  // Const and mutable access to the raw pointer held by metadata_.
  const RaftGroupMetadata* metadata() const {
    return metadata_.get();
  }

  RaftGroupMetadata* metadata() {
    return metadata_.get();
  }

  rocksdb::Env& rocksdb_env() const;

  const std::string& tablet_id() const override;

  // Override that always reports false.
  bool system() const override { return false; }

  // Raw pointer to the metrics object of this tablet.
  // The result may be nullptr, for example in unit tests.
  TabletMetrics* metrics() {
    return metrics_.get();
  }

  // Handle to the table-level metric entity (table_metrics_entity_), returned by reference.
  const scoped_refptr<MetricEntity>& GetTableMetricsEntity() const { return table_metrics_entity_; }
  // Handle to the tablet-level metric entity (tablet_metrics_entity_), returned by reference.
  const scoped_refptr<MetricEntity>& GetTabletMetricsEntity() const {
    const scoped_refptr<MetricEntity>& entity = tablet_metrics_entity_;
    return entity;
  }

  // Accessor for this tablet's memory tracker; the shared pointer is returned by reference.
  const std::shared_ptr<MemTracker>& mem_tracker() const {
    return mem_tracker_;
  }

  // Override returning the value stored in table_type_.
  TableType table_type() const override {
    return table_type_;
  }

  // Returns true if a RocksDB-backed tablet has any SSTables.
  Result<bool> HasSSTables() const;

  // Returns the maximum persistent op id from all SSTables in RocksDB.
  // First for regular records and second for intents.
  // When invalid_if_no_new_data is true then function would return invalid op id when no new
  // data is present in corresponding db.
  Result<DocDbOpIds> MaxPersistentOpId(bool invalid_if_no_new_data = false) const;

  // Returns the maximum persistent hybrid_time across all SSTables in RocksDB.
  Result<HybridTime> MaxPersistentHybridTime() const;

  // Returns oldest mutable memtable write hybrid time in RocksDB or HybridTime::kMax if memtable
  // is empty.
  Result<HybridTime> OldestMutableMemtableWriteHybridTime() const;

  // For non-kudu table type fills key-value batch in transaction state request and updates
  // request in state. Due to acquiring locks it can block the thread.
  void AcquireLocksAndPerformDocOperations(std::unique_ptr<WriteQuery> query);

  // Given a proposed "history cutoff" timestamp, returns either that value, if possible, or a
  // smaller value corresponding to the oldest active reader, whichever is smaller. This ensures
  // that data needed by active read operations is not compacted away.
  //
  // Also updates the "earliest allowed read time" of the tablet to be equal to the returned value,
  // (if it is still lower than the value about to be returned), so that new readers with timestamps
  // earlier than that will be rejected.
  HybridTime UpdateHistoryCutoff(HybridTime proposed_cutoff);

  // Accessor for the clock_ member, returned by reference.
  const scoped_refptr<server::Clock>& clock() const { return clock_; }

  SchemaPtr GetSchema(const std::string& table_id = "") const override;

  Schema GetKeySchema(const std::string& table_id = "") const;

  // Override returning a reference to the storage object that ql_storage_ points to.
  const docdb::YQLStorageIf& QLStorage() const override { return *ql_storage_; }

  // Provide a way for write operations to wait when tablet schema is
  // being changed.
  ScopedRWOperationPause PauseWritePermits(CoarseTimePoint deadline);
  ScopedRWOperation GetPermitToWrite(CoarseTimePoint deadline);

  // Used from tests.
  // Accessor for regulardb_statistics_; the shared pointer is returned by reference.
  const std::shared_ptr<rocksdb::Statistics>& regulardb_statistics() const {
    const std::shared_ptr<rocksdb::Statistics>& stats = regulardb_statistics_;
    return stats;
  }

  // Accessor for intentsdb_statistics_; the shared pointer is returned by reference.
  const std::shared_ptr<rocksdb::Statistics>& intentsdb_statistics() const {
    const std::shared_ptr<rocksdb::Statistics>& stats = intentsdb_statistics_;
    return stats;
  }

  // Raw pointer obtained from transaction_coordinator_.get().
  TransactionCoordinator* transaction_coordinator() { return transaction_coordinator_.get(); }

  // Raw pointer obtained from transaction_participant_.get().
  TransactionParticipant* transaction_participant() const { return transaction_participant_.get(); }

  // Returns true if the tablet was created after a split but it has not yet had data from its
  // parent which are now outside of its key range removed.
  Result<bool> StillHasOrphanedPostSplitData();

  // Wrapper for StillHasOrphanedPostSplitData. Conservatively returns true if
  // StillHasOrphanedPostSplitData failed, otherwise returns the result value.
  bool MayHaveOrphanedPostSplitData();

  // If true, we should report, in our heartbeat to the master, that loadbalancer moves should be
  // disabled. We do so, for example, when StillHasOrphanedPostSplitData() returns true.
  bool ShouldDisableLbMove();

  void ForceRocksDBCompactInTest();

  CHECKED_STATUS ForceFullRocksDBCompact();

  // Builds a docdb::DocDB value from the regular DB pointer, the intents DB pointer and the
  // address of key_bounds_, in that order.
  docdb::DocDB doc_db() const {
    return {
        regular_db_.get(),
        intents_db_.get(),
        &key_bounds_,
    };
  }

  // Returns approximate middle key for tablet split:
  // - for hash-based partitions: encoded hash code in order to split by hash code.
  // - for range-based partitions: encoded doc key in order to split by row.
  Result<std::string> GetEncodedMiddleSplitKey() const;

  std::string TEST_DocDBDumpStr(IncludeIntents include_intents = IncludeIntents::kFalse);

  void TEST_DocDBDumpToContainer(
      IncludeIntents include_intents, std::unordered_set<std::string>* out);

  // Dumps DocDB contents to log, every record as a separate log message, with the given prefix.
  void TEST_DocDBDumpToLog(IncludeIntents include_intents);

  size_t TEST_CountRegularDBRecords();

  CHECKED_STATUS CreateReadIntents(
      const TransactionMetadataPB& transaction_metadata,
      const SubTransactionMetadataPB& subtransaction_metadata,
      const google::protobuf::RepeatedPtrField<QLReadRequestPB>& ql_batch,
      const google::protobuf::RepeatedPtrField<PgsqlReadRequestPB>& pgsql_batch,
      docdb::KeyValueWriteBatchPB* out);

  uint64_t GetCurrentVersionSstFilesSize() const;
  uint64_t GetCurrentVersionSstFilesUncompressedSize() const;
  std::pair<uint64_t, uint64_t> GetCurrentVersionSstFilesAllSizes() const;
  uint64_t GetCurrentVersionNumSSTFiles() const;

  void ListenNumSSTFilesChanged(std::function<void()> listener);

  // Returns the number of memtables in intents and regular db-s.
  std::pair<int, int> GetNumMemtables() const;

  // Installs 'provider' as the hybrid time lease provider. The argument is taken by value and
  // moved into ht_lease_provider_, replacing whatever was stored before.
  void SetHybridTimeLeaseProvider(HybridTimeLeaseProvider provider) {
    auto& target = ht_lease_provider_;
    target = std::move(provider);
  }

  // Installs 'factory' as the memtable flush filter factory. The argument is taken by value and
  // moved into mem_table_flush_filter_factory_, replacing whatever was stored before.
  void SetMemTableFlushFilterFactory(std::function<rocksdb::MemTableFilter()> factory) {
    auto& target = mem_table_flush_filter_factory_;
    target = std::move(factory);
  }

  // When a compaction starts with a particular "history cutoff" timestamp, it calls this function
  // to disallow reads at a time lower than that history cutoff timestamp, to avoid reading
  // invalid/incomplete data.
  //
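  // NOTE(review): the sentence below does not match the declared signature -- the function
  // returns a HybridTime, not a bool. Confirm the actual return contract against the
  // implementation.
  // Returns true if the new history cutoff timestamp was successfully registered, or false if
  // it can't be used because there are pending reads at lower timestamps.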
  HybridTime Get(HybridTime lower_bound);

  bool ShouldApplyWrite();

  // Test accessor: raw pointer to the regular RocksDB instance (regular_db_).
  rocksdb::DB* TEST_db() const { return regular_db_.get(); }

  // Test accessor: raw pointer to the intents RocksDB instance (intents_db_).
  rocksdb::DB* TEST_intents_db() const { return intents_db_.get(); }

  CHECKED_STATUS TEST_SwitchMemtable();

  // Initialize RocksDB's max persistent op id and hybrid time to that of the operation state.
  // Necessary for cases like truncate or restore snapshot when RocksDB is reset.
  CHECKED_STATUS ModifyFlushedFrontier(
      const docdb::ConsensusFrontier& value,
      rocksdb::FrontierModificationMode mode,
      FlushFlags flags = FlushFlags::kAllDbs);

  // Get the isolation level of the given transaction from the metadata stored in the provisional
  // records RocksDB.
  Result<IsolationLevel> GetIsolationLevel(const TransactionMetadataPB& transaction) override;

  // Creates an on-disk sub tablet of this tablet with specified ID, partition and key bounds.
  // Flushes this tablet data onto disk before creating sub tablet.
  // Also updates flushed frontier for regular and intents DBs to match split_op_id and
  // split_op_hybrid_time.
  // In case of error sub-tablet could be partially persisted on disk.
  Result<RaftGroupMetadataPtr> CreateSubtablet(
      const TabletId& tablet_id, const Partition& partition, const docdb::KeyBounds& key_bounds,
      const yb::OpId& split_op_id, const HybridTime& split_op_hybrid_time);

  // Scans the intent db. Potentially takes a long time. Used for testing/debugging.
  Result<int64_t> CountIntents();

  // Flushes intents db if necessary.
  void FlushIntentsDbIfNecessary(const yb::OpId& lastest_log_entry_op_id);

  // Returns the value of the is_sys_catalog_ flag.
  bool is_sys_catalog() const {
    return is_sys_catalog_;
  }
  bool IsTransactionalRequest(bool is_ysql_request) const override;

  void SetCleanupPool(ThreadPool* thread_pool);

  // Reference to the TabletSnapshots object that snapshots_ points to.
  TabletSnapshots& snapshots() { return *snapshots_; }

  // Returns the snapshot_coordinator_ pointer as stored.
  SnapshotCoordinator* snapshot_coordinator() { return snapshot_coordinator_; }

  // Returns the cdc_iterator_ pointer as stored.
  docdb::YQLRowwiseIteratorIf* cdc_iterator() { return cdc_iterator_; }

  // Attaches tablet-specific information to this tablet under 'key'; the stored references
  // get released when the tablet does.
  // The insertion into additional_metadata_ happens while holding control_path_mutex_.
  // NOTE(review): emplace() is used, so an already existing entry for 'key' is presumably
  // left untouched -- confirm against the container type of additional_metadata_.
  void AddAdditionalMetadata(const std::string& key, std::shared_ptr<void> additional_metadata) {
    std::lock_guard<std::mutex> guard(control_path_mutex_);
    additional_metadata_.emplace(key, std::move(additional_metadata));
  }

  // Looks up 'key' in additional_metadata_ while holding control_path_mutex_.
  // Returns the stored pointer, or nullptr when there is no entry for 'key'.
  std::shared_ptr<void> GetAdditionalMetadata(const std::string& key) {
    std::lock_guard<std::mutex> lock(control_path_mutex_);
    const auto it = additional_metadata_.find(key);
    if (it == additional_metadata_.end()) {
      return nullptr;
    }
    return it->second;
  }

  void InitRocksDBOptions(
      rocksdb::Options* options, const std::string& log_prefix,
      rocksdb::BlockBasedTableOptions table_options = rocksdb::BlockBasedTableOptions());

  // Override returning the raw pointer obtained from retention_policy_.get().
  TabletRetentionPolicy* RetentionPolicy() override { return retention_policy_.get(); }

  // Triggers a compaction on this tablet if it is the result of a tablet split but has not yet been
  // compacted. Assumes ownership of the provided thread pool token, and uses it to submit the
  // compaction task. It is an error to call this method if a post-split compaction has been
  // triggered previously by this tablet.
  CHECKED_STATUS TriggerPostSplitCompactionIfNeeded(
    std::function<std::unique_ptr<ThreadPoolToken>()> get_token_for_compaction);

  // Verifies the data on this tablet for consistency. Returns status OK if checks pass.
  CHECKED_STATUS VerifyDataIntegrity();

  CHECKED_STATUS CheckOperationAllowed(const OpId& op_id, consensus::OperationType op_type)
      EXCLUDES(operation_filters_mutex_);

  void RegisterOperationFilter(OperationFilter* filter) EXCLUDES(operation_filters_mutex_);
  void UnregisterOperationFilter(OperationFilter* filter) EXCLUDES(operation_filters_mutex_);

  void SplitDone();
  CHECKED_STATUS RestoreStarted(const TxnSnapshotRestorationId& restoration_id);
  CHECKED_STATUS RestoreFinished(
      const TxnSnapshotRestorationId& restoration_id, HybridTime restoration_hybrid_time);
  CHECKED_STATUS CheckRestorations(const RestorationCompleteTimeMap& restoration_complete_time);

  // Returns the value of txns_enabled_.
  bool txns_enabled() const { return txns_enabled_; }

  // Returns a reference to the client obtained by dereferencing client_future_.get().
  // NOTE(review): presumably get() blocks until the future is ready -- confirm against the
  // declared type of client_future_.
  client::YBClient& client() { return *client_future_.get(); }

  // Raw pointer obtained from transaction_manager_.get().
  client::TransactionManager* transaction_manager() { return transaction_manager_.get(); }

  // Creates a new shared pointer of the object managed by metadata_cache_. This is done
  // atomically to avoid race conditions.
  std::shared_ptr<client::YBMetaDataCache> YBMetaDataCache();

  ScopedRWOperation CreateNonAbortableScopedRWOperation(
      const CoarseTimePoint deadline = CoarseTimePoint()) const;

  Result<TransactionOperationContext> CreateTransactionOperationContext(
      const TransactionMetadataPB& transaction_metadata,
      bool is_ysql_catalog_table,
      const SubTransactionMetadataPB* subtransaction_metadata = nullptr) const;

  // Raw pointer obtained from unique_index_key_schema_.get().
  const Schema* unique_index_key_schema() const { return unique_index_key_schema_.get(); }

 private:
  friend class Iterator;
  friend class TabletPeerTest;
  friend class ScopedReadOperation;
  friend class TabletComponent;

  class RegularRocksDbListener;

  FRIEND_TEST(TestTablet, TestGetLogRetentionSizeForIndex);

  CHECKED_STATUS OpenKeyValueTablet();
  // Virtual hook taking the DB directory path and the filesystem manager.
  // NOTE(review): presumably creates the on-disk directories for the tablet under 'db_dir'
  // using 'fs' -- confirm against the implementation.
  // The path parameter is spelled std::string so that this header does not depend on a
  // using-declaration from another header.
  virtual CHECKED_STATUS CreateTabletDirectories(const std::string& db_dir, FsManager* fs);

  std::vector<yb::ColumnSchema> GetColumnSchemasForIndex(const std::vector<IndexInfo>& indexes);

  void DocDBDebugDump(std::vector<std::string> *lines);

  CHECKED_STATUS WriteTransactionalBatch(
      int64_t batch_idx, // index of this batch in its transaction
      const docdb::KeyValueWriteBatchPB& put_batch,
      HybridTime hybrid_time,
      const rocksdb::UserFrontiers* frontiers);

  Result<TransactionOperationContext> CreateTransactionOperationContext(
      const boost::optional<TransactionId>& transaction_id,
      bool is_ysql_catalog_table,
      const SubTransactionMetadataPB* subtransaction_metadata = nullptr) const;

  // Pause abortable/non-abortable new read/write operations and wait for all
  // abortable/non-abortable pending read/write operations to finish.
  // If stop is false, ScopedRWOperation constructor will wait while ScopedRWOperationPause is
  // alive.
  // If stop is true, ScopedRWOperation constructor will create an instance with an error (see
  // ScopedRWOperation::ok()) while ScopedRWOperationPause is alive.
  ScopedRWOperationPause PauseReadWriteOperations(
      Abortable abortable, Stop stop = Stop::kFalse);

  // Pauses new non-abortable read/write operations and wait for all of those that are pending to
  // complete.
  // Starts RocksDB shutdown (that will abort abortable read/write operations).
  // Pauses new abortable read/write operations and wait for all of those that are pending to
  // complete.
  // Returns TabletScopedRWOperationPauses that are preventing new read/write operations from being
  // started.
  Result<TabletScopedRWOperationPauses> StartShutdownRocksDBs(
      DisableFlushOnShutdown disable_flush_on_shutdown, Stop stop = Stop::kFalse);

  CHECKED_STATUS CompleteShutdownRocksDBs(
      Destroy destroy, TabletScopedRWOperationPauses* ops_pauses);

  ScopedRWOperation CreateAbortableScopedRWOperation(
      const CoarseTimePoint deadline = CoarseTimePoint()) const;

  CHECKED_STATUS DoEnableCompactions();

  std::string LogPrefix() const;

  std::string LogPrefix(docdb::StorageDbType db_type) const;

  // Inspects a PGSQL read request together with the number of rows produced so far and returns a
  // boolean, or an error, through Result.
  // NOTE(review): judging by the name, true means the request is fully served by this tablet and
  // needs no continuation on other tablets — confirm in the implementation.
  Result<bool> IsQueryOnlyForTablet(const PgsqlReadRequestPB& pgsql_read_request,
      size_t row_count) const;

  // Inspects a PGSQL read request, a partition key and the number of rows produced so far and
  // returns a boolean, or an error, through Result.
  // partition_key is spelled as std::string, matching the other declarations in this header, so
  // the declaration does not depend on an unqualified `string` being visible.
  // NOTE(review): judging by the name, true means the scan has reached the request's maximum
  // partition key — confirm in the implementation.
  Result<bool> HasScanReachedMaxPartitionKey(
      const PgsqlReadRequestPB& pgsql_read_request,
      const std::string& partition_key,
      size_t row_count) const;

  // Sets metadata_cache_ to nullptr. This is done atomically to avoid race conditions.
  // Counterpart of CreateNewYBMetaDataCache.
  void ResetYBMetaDataCache();

  // Creates a new client::YBMetaDataCache object and atomically assigns it to metadata_cache_.
  // Counterpart of ResetYBMetaDataCache.
  void CreateNewYBMetaDataCache();

  // Runs the post-split compaction work.
  // NOTE(review): the "Sync" suffix suggests this performs the compaction on the calling thread
  // (e.g. as the task submitted through post_split_compaction_task_pool_token_) — confirm in the
  // implementation.
  void TriggerPostSplitCompactionSync();

  // Opens read-only rocksdb at the specified directory and checks for any file corruption.
  //   db_dir - directory of the RocksDB instance to check.
  // Returns a status describing the outcome of the check.
  CHECKED_STATUS OpenDbAndCheckIntegrity(const std::string& db_dir);

  // Adds or removes the restoring operation filter if necessary.
  // If reset_split is true, also resets split state.
  // Must be called without operation_filters_mutex_ held (see the EXCLUDES annotation).
  void SyncRestoringOperationFilter(ResetSplit reset_split) EXCLUDES(operation_filters_mutex_);
  // Unregisters the given operation filter.
  // The caller must already hold operation_filters_mutex_ (see the REQUIRES annotation).
  // NOTE(review): presumably removes the filter from operation_filters_ — confirm in the
  // implementation.
  void UnregisterOperationFilterUnlocked(OperationFilter* filter)
    REQUIRES(operation_filters_mutex_);

  // Key schema, uniquely owned by the tablet and not modifiable through this pointer.
  std::unique_ptr<const Schema> key_schema_;

  // Raft group metadata handle for this tablet.
  RaftGroupMetadataPtr metadata_;
  // Table type of this tablet.
  TableType table_type_;

  // Lock protecting access to the 'components_' member (i.e. the rowsets in the tablet).
  // NOTE(review): no 'components_' member is declared in this part of the class — confirm that
  // this description still matches what the lock protects.
  //
  // Shared mode:
  // - Writers take this in shared mode at the same time as they obtain an MVCC hybrid_time
  //   and capture a reference to components_. This ensures that we can use the MVCC hybrid_time
  //   to determine which writers are writing to which components during compaction.
  // - Readers take this in shared mode while capturing their iterators. This ensures that
  //   they see a consistent view when racing against flush/compact.
  //
  // Exclusive mode:
  // - Flushes/compactions take this lock in order to lock out concurrent updates.
  //
  // NOTE: callers should avoid taking this lock for a long time, even in shared mode.
  // This is because the lock has some concept of fairness -- if, while a long reader
  // is active, a writer comes along, then all future short readers will be blocked.
  // TODO: now that this is single-threaded again, we should change it to rw_spinlock
  // NOTE(review): the member below is already declared as rw_spinlock, so the TODO above looks
  // stale — confirm and remove it.
  // Declared mutable so that const member functions can take the lock.
  mutable rw_spinlock component_lock_;

  // Reference-counted handle to the log anchor registry.
  scoped_refptr<log::LogAnchorRegistry> log_anchor_registry_;
  // Memory trackers (shared ownership).
  // NOTE(review): presumably mem_tracker_ accounts for the tablet as a whole and
  // block_based_table_mem_tracker_ for RocksDB block-based tables — confirm where they are set.
  std::shared_ptr<MemTracker> mem_tracker_;
  std::shared_ptr<MemTracker> block_based_table_mem_tracker_;

  // Metric entities at tablet and table level, and the tablet metrics object owned by the tablet.
  MetricEntityPtr tablet_metrics_entity_;
  MetricEntityPtr table_metrics_entity_;
  std::unique_ptr<TabletMetrics> metrics_;
  // Type-erased shared handle.
  // NOTE(review): presumably its destruction detaches the metrics registered by this tablet —
  // confirm where it is assigned.
  std::shared_ptr<void> metric_detacher_;

  // A pointer to the server's clock.
  scoped_refptr<server::Clock> clock_;

  // MVCC manager, held by value.
  MvccManager mvcc_;

  // Lock used to serialize the creation of RocksDB checkpoints.
  // Declared mutable so that const member functions can take the lock.
  mutable std::mutex create_checkpoint_lock_;

  // States of the tablet object. The enumerators are unscoped, so they are referred to by their
  // bare names inside the class.
  // NOTE(review): the enumerator order suggests the progression
  // kInitialized -> kBootstrapping -> kOpen -> kShutdown; the actual transitions are made by code
  // outside this declaration — confirm there.
  enum State {
    kInitialized,
    kBootstrapping,
    kOpen,
    kShutdown
  };
  // Current state; a newly constructed tablet starts in kInitialized.
  State state_ = kInitialized;

  // Fault hooks. In production code, these will always be nullptr.
  std::shared_ptr<CompactionFaultHooks> compaction_hooks_;
  std::shared_ptr<FlushFaultHooks> flush_hooks_;
  std::shared_ptr<FlushCompactCommonHooks> common_hooks_;

  // Statistics for the RocksDB databases: one object for the regular DB and one for the intents
  // DB.
  std::shared_ptr<rocksdb::Statistics> regulardb_statistics_;
  std::shared_ptr<rocksdb::Statistics> intentsdb_statistics_;

  // RocksDB database instances for key-value tables: the regular DB and the intents DB, both
  // uniquely owned by the tablet.
  std::unique_ptr<rocksdb::DB> regular_db_;
  std::unique_ptr<rocksdb::DB> intents_db_;
  // Atomic flag, initially false.
  // NOTE(review): presumably set once RocksDB shutdown has been started (see
  // StartShutdownRocksDBs) — confirm in the implementation.
  std::atomic<bool> rocksdb_shutdown_requested_{false};

  // Optional key bounds (see docdb::KeyBounds) served by this tablet.
  docdb::KeyBounds key_bounds_;

  // Storage interface implementation, uniquely owned by the tablet.
  // NOTE(review): presumably backed by the RocksDB instances above — confirm where it is created.
  std::unique_ptr<docdb::YQLStorageIf> ql_storage_;

  // This is for docdb fine-grained locking.
  docdb::SharedLockManager shared_lock_manager_;

  // For the block cache and memory manager shared across tablets.
  // Const: fixed for the lifetime of the tablet object.
  const TabletOptions tablet_options_;

  // A lightweight way to reject new operations when the tablet is shutting down. This is used to
  // prevent race conditions between destroying the RocksDB instance and read/write operations.
  // Initially false.
  std::atomic_bool shutdown_requested_{false};

  // This is a special atomic counter per tablet that increases monotonically.
  // It is like timestamp, but doesn't need locks to read or update.
  // This is raft replicated as well. Each replicate message contains the current number.
  // It is guaranteed to keep increasing for committed entries even across tablet server
  // restarts and leader changes.
  // Starts at 0.
  std::atomic<int64_t> monotonic_counter_{0};

  // Number of pending non-abortable operations. We use this to make sure we don't shut down RocksDB
  // before all non-abortable pending operations are finished. We don't have a strict definition of
  // an "operation" for the purpose of this counter. We simply wait for this counter to go to zero
  // before starting RocksDB shutdown.
  // Note: as of 2021-06-28 applying of Raft operations could not handle errors that happened due to
  // RocksDB shutdown.
  //
  // This is marked mutable because read path member functions (which are const) are using this.
  mutable RWOperationCounter pending_non_abortable_op_counter_;

  // Similar to pending_non_abortable_op_counter_ but for operations that could be aborted, i.e.
  // operations that could handle RocksDB shutdown during their execution, for example manual
  // compactions.
  // We wait for this counter to go to zero after starting RocksDB shutdown and before destroying
  // RocksDB in-memory instance.
  // Also mutable, like pending_non_abortable_op_counter_.
  mutable RWOperationCounter pending_abortable_op_counter_;

  // Used by Alter/Schema-change ops to pause new write ops from being submitted.
  // Unlike the two counters above, this one is not declared mutable.
  RWOperationCounter write_ops_being_submitted_counter_;

  // Transaction coordinator, uniquely owned by the tablet.
  // NOTE(review): may be null when this tablet does not act as a coordinator — confirm where it
  // is created.
  std::unique_ptr<TransactionCoordinator> transaction_coordinator_;

  // Transaction participant, uniquely owned by the tablet.
  // NOTE(review): may be null when transactions are not enabled for this tablet — confirm where
  // it is created.
  std::unique_ptr<TransactionParticipant> transaction_participant_;

  // Shared future that yields a pointer to the client::YBClient.
  std::shared_future<client::YBClient*> client_future_;

  // Created only when secondary indexes are present.
  std::unique_ptr<client::TransactionManager> transaction_manager_;

  // This object should not be accessed directly to avoid race conditions.
  // Use methods YBMetaDataCache, CreateNewYBMetaDataCache, and ResetYBMetaDataCache to read it
  // and modify it.
  std::shared_ptr<client::YBMetaDataCache> metadata_cache_;

  // Created only if it is a unique index tablet.
  std::unique_ptr<Schema> unique_index_key_schema_;

  // Atomic 64-bit index, starts at 0.
  // NOTE(review): presumably the index of the most recently committed write operation — confirm
  // where it is updated.
  std::atomic<int64_t> last_committed_write_index_{0};

  // Provider of hybrid time leases.
  // NOTE(review): presumably consulted when computing safe time — confirm in the implementation.
  HybridTimeLeaseProvider ht_lease_provider_;

  // Override of a base-class virtual. Returns a safe hybrid time, or an error, through Result.
  // NOTE(review): presumably require_lease says whether a valid lease is needed, min_allowed is a
  // lower bound for the returned time and deadline limits how long the call may wait — confirm in
  // the implementation.
  Result<HybridTime> DoGetSafeTime(
      RequireLease require_lease, HybridTime min_allowed, CoarseTimePoint deadline) const override;

  // Evaluates the given memtable and returns a boolean, or an error, through Result.
  // NOTE(review): presumably used as the memtable flush filter of the intents DB, with the boolean
  // saying whether the memtable may be flushed — confirm in the implementation.
  Result<bool> IntentsDbFlushFilter(const rocksdb::MemTable& memtable);

  // Shared implementation for removing intents, generic over the type of `ids`.
  // Returns a status describing success or failure.
  // NOTE(review): presumably `ids` is either a single transaction id or a collection of them —
  // confirm against the instantiations in the implementation file.
  template <class Ids>
  CHECKED_STATUS RemoveIntentsImpl(const RemoveIntentsData& data, const Ids& ids);

  // Tries to find intent .SST files that could be deleted and remove them.
  // NOTE(review): presumably CleanupIntentFiles schedules the work (see
  // cleanup_intent_files_token_) and DoCleanupIntentFiles performs it — confirm in the
  // implementation.
  void CleanupIntentFiles();
  void DoCleanupIntentFiles();

  // Notification hook without parameters or return value.
  // NOTE(review): judging by the name, called when the set of files of the regular DB has
  // changed; what it triggers is decided in the implementation — confirm there.
  void RegularDbFilesChanged();

  // Override of a base-class virtual. Returns a hybrid time, or an error, through Result.
  // NOTE(review): presumably min_allowed is a lower bound for the returned time and deadline
  // limits how long the call may wait — confirm in the implementation.
  Result<HybridTime> ApplierSafeTime(HybridTime min_allowed, CoarseTimePoint deadline) override;

  // Override of a base-class virtual; simply forwards to CleanupIntentFiles().
  void MinRunningHybridTimeSatisfied() override { CleanupIntentFiles(); }

  // Helper for reading a statistic related to the regular DB.
  //   func          - callable invoked without arguments; its result type is the type of the
  //                   statistic (see decltype(func())).
  //   default_value - fallback value of that same type.
  // The return type is deduced from the definition.
  // NOTE(review): presumably default_value is returned when the regular DB is not available —
  // confirm in the implementation.
  template <class F>
  auto GetRegularDbStat(const F& func, const decltype(func())& default_value) const;

  // Factory callable that produces a rocksdb::MemTableFilter.
  std::function<rocksdb::MemTableFilter()> mem_table_flush_filter_factory_;

  // Local tablet filter of the client library.
  client::LocalTabletFilter local_tablet_filter_;

  // This is typically "P <peer_id>", so we can get a log prefix "T <tablet_id> P <peer_id>: ".
  std::string log_prefix_suffix_;

  // Strongly typed flags: whether this is the sys catalog tablet, and whether transactions are
  // enabled.
  IsSysCatalogTablet is_sys_catalog_;
  TransactionsEnabled txns_enabled_;

  // Thread pool token, uniquely owned by the tablet.
  // NOTE(review): presumably used to run intent file cleanup tasks (see CleanupIntentFiles) —
  // confirm in the implementation.
  std::unique_ptr<ThreadPoolToken> cleanup_intent_files_token_;

  // Snapshot handling object, uniquely owned by the tablet.
  std::unique_ptr<TabletSnapshots> snapshots_;

  // Raw pointer, nullptr by default.
  // NOTE(review): ownership is not visible here; presumably not owned by the tablet — confirm.
  SnapshotCoordinator* snapshot_coordinator_ = nullptr;

  // Raw pointer, nullptr by default.
  // NOTE(review): ownership and lifetime are not visible here — confirm where it is assigned and
  // released.
  docdb::YQLRowwiseIteratorIf* cdc_iterator_ = nullptr;

  // Mutex guarding additional_metadata_. Declared mutable so that const member functions can take
  // it.
  mutable std::mutex control_path_mutex_;
  // Type-erased values keyed by string; access requires control_path_mutex_.
  std::unordered_map<std::string, std::shared_ptr<void>> additional_metadata_
    GUARDED_BY(control_path_mutex_);

  // Mutex guarding num_sst_files_changed_listener_.
  std::mutex num_sst_files_changed_listener_mutex_;
  // Callback without arguments; access requires num_sst_files_changed_listener_mutex_.
  // NOTE(review): presumably invoked when the number of SST files changes — confirm in the
  // implementation.
  std::function<void()> num_sst_files_changed_listener_
      GUARDED_BY(num_sst_files_changed_listener_mutex_);

  // Retention policy of the tablet (shared ownership).
  std::shared_ptr<TabletRetentionPolicy> retention_policy_;

  // Thread pool token for manually triggering compactions for tablets created from a split. This
  // member is set when a post-split compaction is triggered on this tablet as the result of a call
  // to TriggerPostSplitCompactionIfNeeded. It is an error to attempt to trigger another post-split
  // compaction if this member is already set, as the existence of this member implies that such a
  // compaction has already been triggered for this instance.
  // Null until such a compaction is triggered.
  std::unique_ptr<ThreadPoolToken> post_split_compaction_task_pool_token_ = nullptr;

  // Spinlock guarding the operation filter members below.
  simple_spinlock operation_filters_mutex_;

  // Intrusive list of registered operation filters. As an intrusive list it links the filter
  // objects themselves and does not own them.
  boost::intrusive::list<OperationFilter> operation_filters_ GUARDED_BY(operation_filters_mutex_);

  // Filter and log anchor related to a completed split, both uniquely owned by the tablet.
  // NOTE(review): presumably both are set once a split of this tablet has completed — confirm in
  // the implementation.
  std::unique_ptr<OperationFilter> completed_split_operation_filter_
      GUARDED_BY(operation_filters_mutex_);
  std::unique_ptr<log::LogAnchor> completed_split_log_anchor_ GUARDED_BY(operation_filters_mutex_);

  // Filter added or removed by SyncRestoringOperationFilter, uniquely owned by the tablet.
  std::unique_ptr<OperationFilter> restoring_operation_filter_ GUARDED_BY(operation_filters_mutex_);

  DISALLOW_COPY_AND_ASSIGN(Tablet);
};

// Move-only helper that manages a read operation against a tablet: a read point is grabbed and
// registered with the tablet when the operation is created, and deregistered when the object is
// destructed.
class ScopedReadOperation {
 public:
  // Factory for a read operation on `tablet`; yields the operation, or an error, through Result.
  // NOTE(review): how require_lease and read_time are used is decided in the out-of-line
  // definition — confirm there.
  static Result<ScopedReadOperation> Create(
      AbstractTablet* tablet,
      RequireLease require_lease,
      ReadHybridTime read_time);

  // Produces an operation that is not attached to any tablet.
  ScopedReadOperation() {}

  // Copying is disabled; instances can only be moved.
  ScopedReadOperation(const ScopedReadOperation&) = delete;
  void operator=(const ScopedReadOperation&) = delete;

  // Takes over the tablet pointer and the read time of `other` and detaches `other` from its
  // tablet. status_ is not taken over: in the new object it is default-constructed.
  ScopedReadOperation(ScopedReadOperation&& other)
      : tablet_(other.tablet_), read_time_(other.read_time_) {
    other.tablet_ = nullptr;
  }

  // Move assignment, defined out of line.
  void operator=(ScopedReadOperation&& rhs);

  ~ScopedReadOperation();

  // Defined out of line.
  // NOTE(review): presumably deregisters the read point and detaches from the tablet — confirm in
  // the implementation.
  void Reset();

  // Read time of this operation.
  const ReadHybridTime& read_time() const { return read_time_; }

  // Copy of the status stored in this operation.
  Status status() const { return status_; }

 private:
  // Only reachable through Create().
  explicit ScopedReadOperation(
      AbstractTablet* tablet, const ReadHybridTime& read_time);

  // Tablet this operation is attached to; nullptr when detached.
  AbstractTablet* tablet_ = nullptr;
  ReadHybridTime read_time_;
  Status status_;
};

// Compares the schema version carried by a request with the current schema version and reports
// whether they are considered compatible.
// NOTE(review): presumably equal versions are always compatible, and
// compatible_with_previous_version additionally accepts a request that is one version behind —
// confirm in the implementation.
bool IsSchemaVersionCompatible(
    uint32_t current_version, uint32_t request_version, bool compatible_with_previous_version);

}  // namespace tablet
}  // namespace yb

#endif  // YB_TABLET_TABLET_H_

// YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)
//
// Coverage Report
//
// Created: 2022-03-22 16:43