YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/postgres/src/backend/access/hash/hashovfl.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * hashovfl.c
4
 *    Overflow page management code for the Postgres hash access method
5
 *
6
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7
 * Portions Copyright (c) 1994, Regents of the University of California
8
 *
9
 *
10
 * IDENTIFICATION
11
 *    src/backend/access/hash/hashovfl.c
12
 *
13
 * NOTES
14
 *    Overflow pages look like ordinary relation pages.
15
 *
16
 *-------------------------------------------------------------------------
17
 */
18
#include "postgres.h"
19
20
#include "access/hash.h"
21
#include "access/hash_xlog.h"
22
#include "miscadmin.h"
23
#include "utils/rel.h"
24
25
26
static uint32 _hash_firstfreebit(uint32 map);
27
28
29
/*
30
 * Convert overflow page bit number (its index in the free-page bitmaps)
31
 * to block number within the index.
32
 */
33
static BlockNumber
34
bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
35
0
{
36
0
  uint32    splitnum = metap->hashm_ovflpoint;
37
0
  uint32    i;
38
39
  /* Convert zero-based bitnumber to 1-based page number */
40
0
  ovflbitnum += 1;
41
42
  /* Determine the split number for this page (must be >= 1) */
43
0
  for (i = 1;
44
0
     i < splitnum && ovflbitnum > metap->hashm_spares[i];
45
0
     i++)
46
0
     /* loop */ ;
47
48
  /*
49
   * Convert to absolute page number by adding the number of bucket pages
50
   * that exist before this split point.
51
   */
52
0
  return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
53
0
}
54
55
/*
56
 * _hash_ovflblkno_to_bitno
57
 *
58
 * Convert overflow page block number to bit number for free-page bitmap.
59
 */
60
uint32
61
_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
62
0
{
63
0
  uint32    splitnum = metap->hashm_ovflpoint;
64
0
  uint32    i;
65
0
  uint32    bitnum;
66
67
  /* Determine the split number containing this page */
68
0
  for (i = 1; i <= splitnum; i++)
69
0
  {
70
0
    if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
71
0
      break;       /* oops */
72
0
    bitnum = ovflblkno - _hash_get_totalbuckets(i);
73
74
    /*
75
     * bitnum has to be greater than number of overflow page added in
76
     * previous split point. The overflow page at this splitnum (i) if any
77
     * should start from (_hash_get_totalbuckets(i) +
78
     * metap->hashm_spares[i - 1] + 1).
79
     */
80
0
    if (bitnum > metap->hashm_spares[i - 1] &&
81
0
      bitnum <= metap->hashm_spares[i])
82
0
      return bitnum - 1; /* -1 to convert 1-based to 0-based */
83
0
  }
84
85
0
  ereport(ERROR,
86
0
      (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
87
0
       errmsg("invalid overflow block number %u", ovflblkno)));
88
0
  return 0;         /* keep compiler quiet */
89
0
}
90
91
/*
 *	_hash_addovflpage
 *
 *	Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *	On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 *	dropped before exiting (we assume the caller is not interested in 'buf'
 *	anymore) if not asked to retain.  The pin will be retained only for the
 *	primary bucket.  The returned overflow page will be pinned and
 *	write-locked; it is guaranteed to be empty.
 *
 *	The caller must hold a pin, but no lock, on the metapage buffer.
 *	That buffer is returned in the same state.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.  Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
{
	Buffer		ovflbuf;
	Page		page;
	Page		ovflpage;
	HashPageOpaque pageopaque;
	HashPageOpaque ovflopaque;
	HashMetaPage metap;
	Buffer		mapbuf = InvalidBuffer;		/* bitmap page we found a free bit on */
	Buffer		newmapbuf = InvalidBuffer;	/* new bitmap page, if we must add one */
	BlockNumber blkno;
	uint32		orig_firstfree;
	uint32		splitnum;
	uint32	   *freep = NULL;
	uint32		max_ovflpg;
	uint32		bit;
	uint32		bitmap_page_bit;
	uint32		first_page;
	uint32		last_bit;
	uint32		last_page;
	uint32		i,
				j;
	bool		page_found = false;	/* true if we recycled a free page */

	/*
	 * Write-lock the tail page.  Here, we need to maintain locking order such
	 * that, first acquire the lock on tail page of bucket, then on meta page
	 * to find and lock the bitmap page and if it is found, then lock on meta
	 * page is released, then finally acquire the lock on new overflow buffer.
	 * We need this locking order to avoid deadlock with backends that are
	 * doing inserts.
	 *
	 * Note: We could have avoided locking many buffers here if we made two
	 * WAL records for acquiring an overflow page (one to allocate an overflow
	 * page and another to add it to overflow bucket chain).  However, doing
	 * so can leak an overflow page, if the system crashes after allocation.
	 * Needless to say, it is better to have a single record from a
	 * performance point of view as well.
	 */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/* loop to find current tail page, in case someone else inserted too */
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		if (retain_pin)
		{
			/* pin will be retained only for the primary bucket page */
			Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}
		else
			_hash_relbuf(rel, buf);

		/* only the first (primary bucket) page may keep its pin */
		retain_pin = false;

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
	}

	/* Get exclusive lock on the meta page */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	_hash_checkpage(rel, metabuf, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/*
	 * start search at hashm_firstfree.  Decompose the bit number into:
	 * i = bitmap page index, j = word index within that page, and
	 * bit = bit number rounded down to a word (uint32) boundary.
	 */
	orig_firstfree = metap->hashm_firstfree;
	first_page = orig_firstfree >> BMPG_SHIFT(metap);
	bit = orig_firstfree & BMPG_MASK(metap);
	i = first_page;
	j = bit / BITS_PER_MAP;
	bit &= ~(BITS_PER_MAP - 1);

	/* outer loop iterates once per bitmap page */
	for (;;)
	{
		BlockNumber mapblkno;
		Page		mappage;
		uint32		last_inpage;

		/*
		 * want to end search with the last existing overflow page; recompute
		 * each iteration since the metapage lock is dropped below and these
		 * values can move.
		 */
		splitnum = metap->hashm_ovflpoint;
		max_ovflpg = metap->hashm_spares[splitnum] - 1;
		last_page = max_ovflpg >> BMPG_SHIFT(metap);
		last_bit = max_ovflpg & BMPG_MASK(metap);

		if (i > last_page)
			break;

		Assert(i < metap->hashm_nmaps);
		mapblkno = metap->hashm_mapp[i];

		if (i == last_page)
			last_inpage = last_bit;
		else
			last_inpage = BMPGSZ_BIT(metap) - 1;

		/* Release exclusive lock on metapage while reading bitmap page */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

		mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
		mappage = BufferGetPage(mapbuf);
		freep = HashPageGetBitmap(mappage);

		/* scan one word (BITS_PER_MAP bits) at a time for a clear bit */
		for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
		{
			if (freep[j] != ALL_SET)
			{
				page_found = true;

				/* Reacquire exclusive lock on the meta page */
				LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

				/* convert bit to bit number within page */
				bit += _hash_firstfreebit(freep[j]);
				bitmap_page_bit = bit;

				/* convert bit to absolute bit number */
				bit += (i << BMPG_SHIFT(metap));
				/* Calculate address of the recycled overflow page */
				blkno = bitno_to_blkno(metap, bit);

				/* Fetch and init the recycled page */
				ovflbuf = _hash_getinitbuf(rel, blkno);

				goto found;
			}
		}

		/* No free space here, try to advance to next map page */
		_hash_relbuf(rel, mapbuf);
		mapbuf = InvalidBuffer;
		i++;
		j = 0;					/* scan from start of next map page */
		bit = 0;

		/* Reacquire exclusive lock on the meta page */
		LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * No free pages --- have to extend the relation to add an overflow page.
	 * First, check to see if we have to add a new bitmap page too.
	 */
	if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
	{
		/*
		 * We create the new bitmap page with all pages marked "in use".
		 * Actually two pages in the new bitmap's range will exist
		 * immediately: the bitmap page itself, and the following page which
		 * is the one we return to the caller.  Both of these are correctly
		 * marked "in use".  Subsequent pages do not exist yet, but it is
		 * convenient to pre-mark them as "in use" too.
		 */
		bit = metap->hashm_spares[splitnum];

		/* metapage already has a write lock */
		if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("out of overflow pages in hash index \"%s\"",
							RelationGetRelationName(rel))));

		newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
	}
	else
	{
		/*
		 * Nothing to do here; since the page will be past the last used page,
		 * we know its bitmap bit was preinitialized to "in use".
		 */
	}

	/*
	 * Calculate address of the new overflow page.  If a new bitmap page was
	 * created, it consumes the next bit number, so the overflow page gets
	 * the one after it.
	 */
	bit = BufferIsValid(newmapbuf) ?
		metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
	 * relation length stays in sync with ours.  XXX It's annoying to do this
	 * with metapage write lock held; would be better to use a lock that
	 * doesn't block incoming searches.
	 *
	 * It is okay to hold two buffer locks here (one on tail page of bucket
	 * and other on new overflow page) since there cannot be anyone else
	 * contending for access to ovflbuf.
	 */
	ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

found:

	/*
	 * Do the update.  No ereport(ERROR) until changes are logged. We want to
	 * log the changes for bitmap page and overflow page together to avoid
	 * loss of pages in case the new page is added.
	 */
	START_CRIT_SECTION();

	if (page_found)
	{
		Assert(BufferIsValid(mapbuf));

		/* mark page "in use" in the bitmap */
		SETBIT(freep, bitmap_page_bit);
		MarkBufferDirty(mapbuf);
	}
	else
	{
		/* update the count to indicate new overflow page is added */
		metap->hashm_spares[splitnum]++;

		if (BufferIsValid(newmapbuf))
		{
			_hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
			MarkBufferDirty(newmapbuf);

			/* add the new bitmap page to the metapage's list of bitmaps */
			metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
			metap->hashm_nmaps++;
			metap->hashm_spares[splitnum]++;
		}

		MarkBufferDirty(metabuf);

		/*
		 * for new overflow page, we don't need to explicitly set the bit in
		 * bitmap page, as by default that will be set to "in use".
		 */
	}

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
	{
		metap->hashm_firstfree = bit + 1;
		MarkBufferDirty(metabuf);
	}

	/* initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);

	MarkBufferDirty(buf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		XLogRecPtr	recptr;
		xl_hash_add_ovfl_page xlrec;

		xlrec.bmpage_found = page_found;
		xlrec.bmsize = metap->hashm_bmsize;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);

		XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
		XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));

		XLogRegisterBuffer(1, buf, REGBUF_STANDARD);

		/* bitmap page is registered only when we recycled an existing bit */
		if (BufferIsValid(mapbuf))
		{
			XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
			XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
		}

		if (BufferIsValid(newmapbuf))
			XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);

		XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
		XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);

		PageSetLSN(BufferGetPage(ovflbuf), recptr);
		PageSetLSN(BufferGetPage(buf), recptr);

		if (BufferIsValid(mapbuf))
			PageSetLSN(BufferGetPage(mapbuf), recptr);

		if (BufferIsValid(newmapbuf))
			PageSetLSN(BufferGetPage(newmapbuf), recptr);

		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	if (retain_pin)
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, buf);

	if (BufferIsValid(mapbuf))
		_hash_relbuf(rel, mapbuf);

	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	if (BufferIsValid(newmapbuf))
		_hash_relbuf(rel, newmapbuf);

	return ovflbuf;
}
440
441
/*
442
 *  _hash_firstfreebit()
443
 *
444
 *  Return the number of the first bit that is not set in the word 'map'.
445
 */
446
static uint32
447
_hash_firstfreebit(uint32 map)
448
0
{
449
0
  uint32    i,
450
0
        mask;
451
452
0
  mask = 0x1;
453
0
  for (i = 0; i < BITS_PER_MAP; i++)
454
0
  {
455
0
    if (!(mask & map))
456
0
      return i;
457
0
    mask <<= 1;
458
0
  }
459
460
0
  elog(ERROR, "firstfreebit found no free bit");
461
462
0
  return 0;         /* keep compiler quiet */
463
0
}
464
465
/*
 *	_hash_freeovflpage() -
 *
 *	Remove this overflow page from its bucket's chain, and mark the page as
 *	free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *	Add the tuples (itups) to wbuf in this function.  We could do that in the
 *	caller as well, but the advantage of doing it here is we can easily write
 *	the WAL for XLOG_HASH_SQUEEZE_PAGE operation.  Addition of tuples and
 *	removal of overflow page has to done as an atomic operation, otherwise
 *	during replay on standby users might find duplicate records.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 *
 *	Returns the block number of the page that followed the given page
 *	in the bucket, or InvalidBlockNumber if no following page.
 *
 *	NB: caller must not hold lock on metapage, nor on page, that's next to
 *	ovflbuf in the bucket chain.  We don't acquire the lock on page that's
 *	prior to ovflbuf in chain if it is same as wbuf because the caller already
 *	has a lock on same.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
				   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
				   Size *tups_size, uint16 nitups,
				   BufferAccessStrategy bstrategy)
{
	HashMetaPage metap;
	Buffer		metabuf;
	Buffer		mapbuf;
	BlockNumber ovflblkno;
	BlockNumber prevblkno;
	BlockNumber blkno;
	BlockNumber nextblkno;
	BlockNumber writeblkno;
	HashPageOpaque ovflopaque;
	Page		ovflpage;
	Page		mappage;
	uint32	   *freep;
	uint32		ovflbitno;
	int32		bitmappage,
				bitmapbit;
	Bucket		bucket PG_USED_FOR_ASSERTS_ONLY;
	Buffer		prevbuf = InvalidBuffer;
	Buffer		nextbuf = InvalidBuffer;
	bool		update_metap = false;

	/* Get information from the doomed page */
	_hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
	ovflblkno = BufferGetBlockNumber(ovflbuf);
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	nextblkno = ovflopaque->hasho_nextblkno;
	prevblkno = ovflopaque->hasho_prevblkno;
	writeblkno = BufferGetBlockNumber(wbuf);
	bucket = ovflopaque->hasho_bucket;

	/*
	 * Fix up the bucket chain.  this is a doubly-linked list, so we must fix
	 * up the bucket chain members behind and ahead of the overflow page being
	 * deleted.  Concurrency issues are avoided by using lock chaining as
	 * described atop hashbucketcleanup.
	 */
	if (BlockNumberIsValid(prevblkno))
	{
		/* reuse the caller's lock on wbuf if it is the predecessor page */
		if (prevblkno == writeblkno)
			prevbuf = wbuf;
		else
			prevbuf = _hash_getbuf_with_strategy(rel,
												 prevblkno,
												 HASH_WRITE,
												 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
												 bstrategy);
	}
	if (BlockNumberIsValid(nextblkno))
		nextbuf = _hash_getbuf_with_strategy(rel,
											 nextblkno,
											 HASH_WRITE,
											 LH_OVERFLOW_PAGE,
											 bstrategy);

	/* Note: bstrategy is intentionally not used for metapage and bitmap */

	/* Read the metapage so we can determine which bitmap page to use */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* Identify which bitmap bit corresponds to this overflow page */
	ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);

	bitmappage = ovflbitno >> BMPG_SHIFT(metap);
	bitmapbit = ovflbitno & BMPG_MASK(metap);

	if (bitmappage >= metap->hashm_nmaps)
		elog(ERROR, "invalid overflow bit number %u", ovflbitno);
	blkno = metap->hashm_mapp[bitmappage];

	/* Release metapage lock while we access the bitmap page */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	/* read the bitmap page to clear the bitmap bit */
	mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
	mappage = BufferGetPage(mapbuf);
	freep = HashPageGetBitmap(mappage);
	Assert(ISSET(freep, bitmapbit));

	/* Get write-lock on metapage to update firstfree */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	/* This operation needs to log multiple tuples, prepare WAL for that */
	if (RelationNeedsWAL(rel))
		XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);

	START_CRIT_SECTION();

	/*
	 * we have to insert tuples on the "write" page, being careful to preserve
	 * hashkey ordering.  (If we insert many tuples into the same "write" page
	 * it would be worth qsort'ing them).
	 */
	if (nitups > 0)
	{
		_hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
		MarkBufferDirty(wbuf);
	}

	/*
	 * Reinitialize the freed overflow page.  Just zeroing the page won't
	 * work, because WAL replay routines expect pages to be initialized. See
	 * explanation of RBM_NORMAL mode atop XLogReadBufferExtended.  We are
	 * careful to make the special space valid here so that tools like
	 * pageinspect won't get confused.
	 */
	_hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

	/* _hash_pageinit wiped the page, so re-derive the special pointer */
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

	ovflopaque->hasho_prevblkno = InvalidBlockNumber;
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = -1;
	ovflopaque->hasho_flag = LH_UNUSED_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* unlink the page: point predecessor past us ... */
	if (BufferIsValid(prevbuf))
	{
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		Assert(prevopaque->hasho_bucket == bucket);
		prevopaque->hasho_nextblkno = nextblkno;
		MarkBufferDirty(prevbuf);
	}
	/* ... and successor back past us */
	if (BufferIsValid(nextbuf))
	{
		Page		nextpage = BufferGetPage(nextbuf);
		HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

		Assert(nextopaque->hasho_bucket == bucket);
		nextopaque->hasho_prevblkno = prevblkno;
		MarkBufferDirty(nextbuf);
	}

	/* Clear the bitmap bit to indicate that this overflow page is free */
	CLRBIT(freep, bitmapbit);
	MarkBufferDirty(mapbuf);

	/* if this is now the first free page, update hashm_firstfree */
	if (ovflbitno < metap->hashm_firstfree)
	{
		metap->hashm_firstfree = ovflbitno;
		update_metap = true;
		MarkBufferDirty(metabuf);
	}

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_squeeze_page xlrec;
		XLogRecPtr	recptr;
		int			i;

		xlrec.prevblkno = prevblkno;
		xlrec.nextblkno = nextblkno;
		xlrec.ntups = nitups;
		xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
		xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);

		/*
		 * bucket buffer needs to be registered to ensure that we can acquire
		 * a cleanup lock on it during replay.
		 */
		if (!xlrec.is_prim_bucket_same_wrt)
			XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

		XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
		if (xlrec.ntups > 0)
		{
			XLogRegisterBufData(1, (char *) itup_offsets,
								nitups * sizeof(OffsetNumber));
			for (i = 0; i < nitups; i++)
				XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
		}

		XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);

		/*
		 * If prevpage and the writepage (block in which we are moving tuples
		 * from overflow) are same, then no need to separately register
		 * prevpage.  During replay, we can directly update the nextblock in
		 * writepage.
		 */
		if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
			XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);

		if (BufferIsValid(nextbuf))
			XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);

		XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
		XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));

		if (update_metap)
		{
			XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
			XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
		}

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);

		PageSetLSN(BufferGetPage(wbuf), recptr);
		PageSetLSN(BufferGetPage(ovflbuf), recptr);

		if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
			PageSetLSN(BufferGetPage(prevbuf), recptr);
		if (BufferIsValid(nextbuf))
			PageSetLSN(BufferGetPage(nextbuf), recptr);

		PageSetLSN(BufferGetPage(mapbuf), recptr);

		if (update_metap)
			PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* release previous bucket if it is not same as write bucket */
	if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
		_hash_relbuf(rel, prevbuf);

	if (BufferIsValid(ovflbuf))
		_hash_relbuf(rel, ovflbuf);

	if (BufferIsValid(nextbuf))
		_hash_relbuf(rel, nextbuf);

	_hash_relbuf(rel, mapbuf);
	_hash_relbuf(rel, metabuf);

	return nextblkno;
}
731
732
733
/*
734
 *  _hash_initbitmapbuffer()
735
 *
736
 *   Initialize a new bitmap page.  All bits in the new bitmap page are set to
737
 *   "1", indicating "in use".
738
 */
739
void
740
_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
741
1
{
742
1
  Page    pg;
743
1
  HashPageOpaque op;
744
1
  uint32     *freep;
745
746
1
  pg = BufferGetPage(buf);
747
748
  /* initialize the page */
749
1
  if (initpage)
750
0
    _hash_pageinit(pg, BufferGetPageSize(buf));
751
752
  /* initialize the page's special space */
753
1
  op = (HashPageOpaque) PageGetSpecialPointer(pg);
754
1
  op->hasho_prevblkno = InvalidBlockNumber;
755
1
  op->hasho_nextblkno = InvalidBlockNumber;
756
1
  op->hasho_bucket = -1;
757
1
  op->hasho_flag = LH_BITMAP_PAGE;
758
1
  op->hasho_page_id = HASHO_PAGE_ID;
759
760
  /* set all of the bits to 1 */
761
1
  freep = HashPageGetBitmap(pg);
762
1
  MemSet(freep, 0xFF, bmsize);
763
764
  /*
765
   * Set pd_lower just past the end of the bitmap page data.  We could even
766
   * set pd_lower equal to pd_upper, but this is more precise and makes the
767
   * page look compressible to xlog.c.
768
   */
769
1
  ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
770
1
}
771
772
773
/*
774
 *  _hash_squeezebucket(rel, bucket)
775
 *
776
 *  Try to squeeze the tuples onto pages occurring earlier in the
777
 *  bucket chain in an attempt to free overflow pages. When we start
778
 *  the "squeezing", the page from which we start taking tuples (the
779
 *  "read" page) is the last bucket in the bucket chain and the page
780
 *  onto which we start squeezing tuples (the "write" page) is the
781
 *  first page in the bucket chain.  The read page works backward and
782
 *  the write page works forward; the procedure terminates when the
783
 *  read page and write page are the same page.
784
 *
785
 *  At completion of this procedure, it is guaranteed that all pages in
786
 *  the bucket are nonempty, unless the bucket is totally empty (in
787
 *  which case all overflow pages will be freed).  The original implementation
788
 *  required that to be true on entry as well, but it's a lot easier for
789
 *  callers to leave empty overflow pages and let this guy clean it up.
790
 *
791
 *  Caller must acquire cleanup lock on the primary page of the target
792
 *  bucket to exclude any scans that are in progress, which could easily
793
 *  be confused into returning the same tuple more than once or some tuples
794
 *  not at all by the rearrangement we are performing here.  To prevent
795
 *  any concurrent scan to cross the squeeze scan we use lock chaining
796
 *  similar to hashbucketcleanup.  Refer comments atop hashbucketcleanup.
797
 *
798
 *  We need to retain a pin on the primary bucket to ensure that no concurrent
799
 *  split can start.
800
 *
801
 *  Since this function is invoked in VACUUM, we provide an access strategy
802
 *  parameter that controls fetches of the bucket pages.
803
 */
804
void
805
_hash_squeezebucket(Relation rel,
806
          Bucket bucket,
807
          BlockNumber bucket_blkno,
808
          Buffer bucket_buf,
809
          BufferAccessStrategy bstrategy)
810
0
{
811
0
  BlockNumber wblkno;
812
0
  BlockNumber rblkno;
813
0
  Buffer    wbuf;
814
0
  Buffer    rbuf;
815
0
  Page    wpage;
816
0
  Page    rpage;
817
0
  HashPageOpaque wopaque;
818
0
  HashPageOpaque ropaque;
819
820
  /*
821
   * start squeezing into the primary bucket page.
822
   */
823
0
  wblkno = bucket_blkno;
824
0
  wbuf = bucket_buf;
825
0
  wpage = BufferGetPage(wbuf);
826
0
  wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
827
828
  /*
829
   * if there aren't any overflow pages, there's nothing to squeeze. caller
830
   * is responsible for releasing the pin on primary bucket page.
831
   */
832
0
  if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
833
0
  {
834
0
    LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
835
0
    return;
836
0
  }
837
838
  /*
839
   * Find the last page in the bucket chain by starting at the base bucket
840
   * page and working forward.  Note: we assume that a hash bucket chain is
841
   * usually smaller than the buffer ring being used by VACUUM, else using
842
   * the access strategy here would be counterproductive.
843
   */
844
0
  rbuf = InvalidBuffer;
845
0
  ropaque = wopaque;
846
0
  do
847
0
  {
848
0
    rblkno = ropaque->hasho_nextblkno;
849
0
    if (rbuf != InvalidBuffer)
850
0
      _hash_relbuf(rel, rbuf);
851
0
    rbuf = _hash_getbuf_with_strategy(rel,
852
0
                      rblkno,
853
0
                      HASH_WRITE,
854
0
                      LH_OVERFLOW_PAGE,
855
0
                      bstrategy);
856
0
    rpage = BufferGetPage(rbuf);
857
0
    ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
858
0
    Assert(ropaque->hasho_bucket == bucket);
859
0
  } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
860
861
  /*
862
   * squeeze the tuples.
863
   */
864
0
  for (;;)
865
0
  {
866
0
    OffsetNumber roffnum;
867
0
    OffsetNumber maxroffnum;
868
0
    OffsetNumber deletable[MaxOffsetNumber];
869
0
    IndexTuple  itups[MaxIndexTuplesPerPage];
870
0
    Size    tups_size[MaxIndexTuplesPerPage];
871
0
    OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
872
0
    uint16    ndeletable = 0;
873
0
    uint16    nitups = 0;
874
0
    Size    all_tups_size = 0;
875
0
    int     i;
876
0
    bool    retain_pin = false;
877
878
0
readpage:
879
    /* Scan each tuple in "read" page */
880
0
    maxroffnum = PageGetMaxOffsetNumber(rpage);
881
0
    for (roffnum = FirstOffsetNumber;
882
0
       roffnum <= maxroffnum;
883
0
       roffnum = OffsetNumberNext(roffnum))
884
0
    {
885
0
      IndexTuple  itup;
886
0
      Size    itemsz;
887
888
      /* skip dead tuples */
889
0
      if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
890
0
        continue;
891
892
0
      itup = (IndexTuple) PageGetItem(rpage,
893
0
                      PageGetItemId(rpage, roffnum));
894
0
      itemsz = IndexTupleSize(itup);
895
0
      itemsz = MAXALIGN(itemsz);
896
897
      /*
898
       * Walk up the bucket chain, looking for a page big enough for
899
       * this item and all other accumulated items.  Exit if we reach
900
       * the read page.
901
       */
902
0
      while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
903
0
      {
904
0
        Buffer    next_wbuf = InvalidBuffer;
905
0
        bool    tups_moved = false;
906
907
0
        Assert(!PageIsEmpty(wpage));
908
909
0
        if (wblkno == bucket_blkno)
910
0
          retain_pin = true;
911
912
0
        wblkno = wopaque->hasho_nextblkno;
913
0
        Assert(BlockNumberIsValid(wblkno));
914
915
        /* don't need to move to next page if we reached the read page */
916
0
        if (wblkno != rblkno)
917
0
          next_wbuf = _hash_getbuf_with_strategy(rel,
918
0
                               wblkno,
919
0
                               HASH_WRITE,
920
0
                               LH_OVERFLOW_PAGE,
921
0
                               bstrategy);
922
923
0
        if (nitups > 0)
924
0
        {
925
0
          Assert(nitups == ndeletable);
926
927
          /*
928
           * This operation needs to log multiple tuples, prepare
929
           * WAL for that.
930
           */
931
0
          if (RelationNeedsWAL(rel))
932
0
            XLogEnsureRecordSpace(0, 3 + nitups);
933
934
0
          START_CRIT_SECTION();
935
936
          /*
937
           * we have to insert tuples on the "write" page, being
938
           * careful to preserve hashkey ordering.  (If we insert
939
           * many tuples into the same "write" page it would be
940
           * worth qsort'ing them).
941
           */
942
0
          _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
943
0
          MarkBufferDirty(wbuf);
944
945
          /* Delete tuples we already moved off read page */
946
0
          PageIndexMultiDelete(rpage, deletable, ndeletable);
947
0
          MarkBufferDirty(rbuf);
948
949
          /* XLOG stuff */
950
0
          if (RelationNeedsWAL(rel))
951
0
          {
952
0
            XLogRecPtr  recptr;
953
0
            xl_hash_move_page_contents xlrec;
954
955
0
            xlrec.ntups = nitups;
956
0
            xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false;
957
958
0
            XLogBeginInsert();
959
0
            XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);
960
961
            /*
962
             * bucket buffer needs to be registered to ensure that
963
             * we can acquire a cleanup lock on it during replay.
964
             */
965
0
            if (!xlrec.is_prim_bucket_same_wrt)
966
0
              XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
967
968
0
            XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
969
0
            XLogRegisterBufData(1, (char *) itup_offsets,
970
0
                      nitups * sizeof(OffsetNumber));
971
0
            for (i = 0; i < nitups; i++)
972
0
              XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
973
974
0
            XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
975
0
            XLogRegisterBufData(2, (char *) deletable,
976
0
                      ndeletable * sizeof(OffsetNumber));
977
978
0
            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);
979
980
0
            PageSetLSN(BufferGetPage(wbuf), recptr);
981
0
            PageSetLSN(BufferGetPage(rbuf), recptr);
982
0
          }
983
984
0
          END_CRIT_SECTION();
985
986
0
          tups_moved = true;
987
0
        }
988
989
        /*
990
         * release the lock on previous page after acquiring the lock
991
         * on next page
992
         */
993
0
        if (retain_pin)
994
0
          LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
995
0
        else
996
0
          _hash_relbuf(rel, wbuf);
997
998
        /* nothing more to do if we reached the read page */
999
0
        if (rblkno == wblkno)
1000
0
        {
1001
0
          _hash_relbuf(rel, rbuf);
1002
0
          return;
1003
0
        }
1004
1005
0
        wbuf = next_wbuf;
1006
0
        wpage = BufferGetPage(wbuf);
1007
0
        wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
1008
0
        Assert(wopaque->hasho_bucket == bucket);
1009
0
        retain_pin = false;
1010
1011
        /* be tidy */
1012
0
        for (i = 0; i < nitups; i++)
1013
0
          pfree(itups[i]);
1014
0
        nitups = 0;
1015
0
        all_tups_size = 0;
1016
0
        ndeletable = 0;
1017
1018
        /*
1019
         * after moving the tuples, rpage would have been compacted,
1020
         * so we need to rescan it.
1021
         */
1022
0
        if (tups_moved)
1023
0
          goto readpage;
1024
0
      }
1025
1026
      /* remember tuple for deletion from "read" page */
1027
0
      deletable[ndeletable++] = roffnum;
1028
1029
      /*
1030
       * we need a copy of index tuples as they can be freed as part of
1031
       * overflow page, however we need them to write a WAL record in
1032
       * _hash_freeovflpage.
1033
       */
1034
0
      itups[nitups] = CopyIndexTuple(itup);
1035
0
      tups_size[nitups++] = itemsz;
1036
0
      all_tups_size += itemsz;
1037
0
    }
1038
1039
    /*
1040
     * If we reach here, there are no live tuples on the "read" page ---
1041
     * it was empty when we got to it, or we moved them all.  So we can
1042
     * just free the page without bothering with deleting tuples
1043
     * individually.  Then advance to the previous "read" page.
1044
     *
1045
     * Tricky point here: if our read and write pages are adjacent in the
1046
     * bucket chain, our write lock on wbuf will conflict with
1047
     * _hash_freeovflpage's attempt to update the sibling links of the
1048
     * removed page.  In that case, we don't need to lock it again.
1049
     */
1050
0
    rblkno = ropaque->hasho_prevblkno;
1051
0
    Assert(BlockNumberIsValid(rblkno));
1052
1053
    /* free this overflow page (releases rbuf) */
1054
0
    _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
1055
0
               tups_size, nitups, bstrategy);
1056
1057
    /* be tidy */
1058
0
    for (i = 0; i < nitups; i++)
1059
0
      pfree(itups[i]);
1060
1061
    /* are we freeing the page adjacent to wbuf? */
1062
0
    if (rblkno == wblkno)
1063
0
    {
1064
      /* retain the pin on primary bucket page till end of bucket scan */
1065
0
      if (wblkno == bucket_blkno)
1066
0
        LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
1067
0
      else
1068
0
        _hash_relbuf(rel, wbuf);
1069
0
      return;
1070
0
    }
1071
1072
0
    rbuf = _hash_getbuf_with_strategy(rel,
1073
0
                      rblkno,
1074
0
                      HASH_WRITE,
1075
0
                      LH_OVERFLOW_PAGE,
1076
0
                      bstrategy);
1077
0
    rpage = BufferGetPage(rbuf);
1078
0
    ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
1079
0
    Assert(ropaque->hasho_bucket == bucket);
1080
0
  }
1081
1082
  /* NOTREACHED */
1083
0
}