/Users/deen/code/yugabyte-db/src/postgres/src/backend/access/hash/hashovfl.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * hashovfl.c |
4 | | * Overflow page management code for the Postgres hash access method |
5 | | * |
6 | | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/hash/hashovfl.c |
12 | | * |
13 | | * NOTES |
14 | | * Overflow pages look like ordinary relation pages. |
15 | | * |
16 | | *------------------------------------------------------------------------- |
17 | | */ |
18 | | #include "postgres.h" |
19 | | |
20 | | #include "access/hash.h" |
21 | | #include "access/hash_xlog.h" |
22 | | #include "miscadmin.h" |
23 | | #include "utils/rel.h" |
24 | | |
25 | | |
26 | | static uint32 _hash_firstfreebit(uint32 map); |
27 | | |
28 | | |
29 | | /* |
30 | | * Convert overflow page bit number (its index in the free-page bitmaps) |
31 | | * to block number within the index. |
32 | | */ |
33 | | static BlockNumber |
34 | | bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum) |
35 | 0 | { |
36 | 0 | uint32 splitnum = metap->hashm_ovflpoint; |
37 | 0 | uint32 i; |
38 | | |
39 | | /* Convert zero-based bitnumber to 1-based page number */ |
40 | 0 | ovflbitnum += 1; |
41 | | |
42 | | /* Determine the split number for this page (must be >= 1) */ |
43 | 0 | for (i = 1; |
44 | 0 | i < splitnum && ovflbitnum > metap->hashm_spares[i]; |
45 | 0 | i++) |
46 | 0 | /* loop */ ; |
47 | | |
48 | | /* |
49 | | * Convert to absolute page number by adding the number of bucket pages |
50 | | * that exist before this split point. |
51 | | */ |
52 | 0 | return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum); |
53 | 0 | } |
54 | | |
55 | | /* |
56 | | * _hash_ovflblkno_to_bitno |
57 | | * |
58 | | * Convert overflow page block number to bit number for free-page bitmap. |
59 | | */ |
uint32
_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;
	uint32		bitnum;

	/* Determine the split number containing this page */
	for (i = 1; i <= splitnum; i++)
	{
		/*
		 * Block numbers at or below the total bucket count for split i are
		 * primary bucket pages, not overflow pages; no larger split can
		 * contain this block either, so fall through to the error report.
		 */
		if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
			break;				/* oops */
		/* 1-based overflow page number relative to this split point */
		bitnum = ovflblkno - _hash_get_totalbuckets(i);

		/*
		 * bitnum has to be greater than number of overflow page added in
		 * previous split point. The overflow page at this splitnum (i) if any
		 * should start from (_hash_get_totalbuckets(i) +
		 * metap->hashm_spares[i - 1] + 1).
		 */
		if (bitnum > metap->hashm_spares[i - 1] &&
			bitnum <= metap->hashm_spares[i])
			return bitnum - 1;	/* -1 to convert 1-based to 0-based */
	}

	/* Not within any split's overflow-page range: caller passed a bad block */
	ereport(ERROR,
			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
			 errmsg("invalid overflow block number %u", ovflblkno)));
	return 0;					/* keep compiler quiet */
}
90 | | |
91 | | /* |
92 | | * _hash_addovflpage |
93 | | * |
94 | | * Add an overflow page to the bucket whose last page is pointed to by 'buf'. |
95 | | * |
96 | | * On entry, the caller must hold a pin but no lock on 'buf'. The pin is |
97 | | * dropped before exiting (we assume the caller is not interested in 'buf' |
98 | | * anymore) if not asked to retain. The pin will be retained only for the |
99 | | * primary bucket. The returned overflow page will be pinned and |
100 | | * write-locked; it is guaranteed to be empty. |
101 | | * |
102 | | * The caller must hold a pin, but no lock, on the metapage buffer. |
103 | | * That buffer is returned in the same state. |
104 | | * |
105 | | * NB: since this could be executed concurrently by multiple processes, |
106 | | * one should not assume that the returned overflow page will be the |
107 | | * immediate successor of the originally passed 'buf'. Additional overflow |
108 | | * pages might have been added to the bucket chain in between. |
109 | | */ |
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
{
	Buffer		ovflbuf;
	Page		page;
	Page		ovflpage;
	HashPageOpaque pageopaque;
	HashPageOpaque ovflopaque;
	HashMetaPage metap;
	Buffer		mapbuf = InvalidBuffer;	/* bitmap page a free bit was found on */
	Buffer		newmapbuf = InvalidBuffer;	/* freshly allocated bitmap page */
	BlockNumber blkno;
	uint32		orig_firstfree;
	uint32		splitnum;
	uint32	   *freep = NULL;
	uint32		max_ovflpg;
	uint32		bit;
	uint32		bitmap_page_bit;	/* bit position within mapbuf's bitmap */
	uint32		first_page;
	uint32		last_bit;
	uint32		last_page;
	uint32		i,
				j;
	bool		page_found = false;	/* true if we recycled an existing free page */

	/*
	 * Write-lock the tail page.  Here, we need to maintain locking order such
	 * that, first acquire the lock on tail page of bucket, then on meta page
	 * to find and lock the bitmap page and if it is found, then lock on meta
	 * page is released, then finally acquire the lock on new overflow buffer.
	 * We need this locking order to avoid deadlock with backends that are
	 * doing inserts.
	 *
	 * Note: We could have avoided locking many buffers here if we made two
	 * WAL records for acquiring an overflow page (one to allocate an overflow
	 * page and another to add it to overflow bucket chain).  However, doing
	 * so can leak an overflow page, if the system crashes after allocation.
	 * Needless to say, it is better to have a single record from a
	 * performance point of view as well.
	 */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/* loop to find current tail page, in case someone else inserted too */
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		if (retain_pin)
		{
			/* pin will be retained only for the primary bucket page */
			Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}
		else
			_hash_relbuf(rel, buf);

		/* once we've moved past the primary page, nothing further to retain */
		retain_pin = false;

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
	}

	/* Get exclusive lock on the meta page */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	_hash_checkpage(rel, metabuf, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* start search at hashm_firstfree */
	orig_firstfree = metap->hashm_firstfree;
	first_page = orig_firstfree >> BMPG_SHIFT(metap);
	bit = orig_firstfree & BMPG_MASK(metap);
	i = first_page;
	j = bit / BITS_PER_MAP;
	/* round bit down to the start of its uint32 word in the bitmap */
	bit &= ~(BITS_PER_MAP - 1);

	/* outer loop iterates once per bitmap page */
	for (;;)
	{
		BlockNumber mapblkno;
		Page		mappage;
		uint32		last_inpage;

		/*
		 * want to end search with the last existing overflow page; recompute
		 * each iteration because we drop the metapage lock below and another
		 * backend may add pages concurrently.
		 */
		splitnum = metap->hashm_ovflpoint;
		max_ovflpg = metap->hashm_spares[splitnum] - 1;
		last_page = max_ovflpg >> BMPG_SHIFT(metap);
		last_bit = max_ovflpg & BMPG_MASK(metap);

		if (i > last_page)
			break;

		Assert(i < metap->hashm_nmaps);
		mapblkno = metap->hashm_mapp[i];

		if (i == last_page)
			last_inpage = last_bit;
		else
			last_inpage = BMPGSZ_BIT(metap) - 1;

		/* Release exclusive lock on metapage while reading bitmap page */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

		mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
		mappage = BufferGetPage(mapbuf);
		freep = HashPageGetBitmap(mappage);

		/* scan word-at-a-time: any word != ALL_SET contains a free bit */
		for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
		{
			if (freep[j] != ALL_SET)
			{
				page_found = true;

				/* Reacquire exclusive lock on the meta page */
				LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

				/* convert bit to bit number within page */
				bit += _hash_firstfreebit(freep[j]);
				bitmap_page_bit = bit;

				/* convert bit to absolute bit number */
				bit += (i << BMPG_SHIFT(metap));
				/* Calculate address of the recycled overflow page */
				blkno = bitno_to_blkno(metap, bit);

				/* Fetch and init the recycled page */
				ovflbuf = _hash_getinitbuf(rel, blkno);

				goto found;
			}
		}

		/* No free space here, try to advance to next map page */
		_hash_relbuf(rel, mapbuf);
		mapbuf = InvalidBuffer;
		i++;
		j = 0;					/* scan from start of next map page */
		bit = 0;

		/* Reacquire exclusive lock on the meta page */
		LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * No free pages --- have to extend the relation to add an overflow page.
	 * First, check to see if we have to add a new bitmap page too.
	 */
	if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
	{
		/*
		 * We create the new bitmap page with all pages marked "in use".
		 * Actually two pages in the new bitmap's range will exist
		 * immediately: the bitmap page itself, and the following page which
		 * is the one we return to the caller.  Both of these are correctly
		 * marked "in use".  Subsequent pages do not exist yet, but it is
		 * convenient to pre-mark them as "in use" too.
		 */
		bit = metap->hashm_spares[splitnum];

		/* metapage already has a write lock */
		if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("out of overflow pages in hash index \"%s\"",
							RelationGetRelationName(rel))));

		newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
	}
	else
	{
		/*
		 * Nothing to do here; since the page will be past the last used page,
		 * we know its bitmap bit was preinitialized to "in use".
		 */
	}

	/*
	 * Calculate address of the new overflow page.  If we allocated a bitmap
	 * page above, it occupies the next spares slot, so the overflow page goes
	 * one past it.
	 */
	bit = BufferIsValid(newmapbuf) ?
		metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
	 * relation length stays in sync with ours.  XXX It's annoying to do this
	 * with metapage write lock held; would be better to use a lock that
	 * doesn't block incoming searches.
	 *
	 * It is okay to hold two buffer locks here (one on tail page of bucket
	 * and other on new overflow page) since there cannot be anyone else
	 * contending for access to ovflbuf.
	 */
	ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

found:

	/*
	 * Do the update.  No ereport(ERROR) until changes are logged. We want to
	 * log the changes for bitmap page and overflow page together to avoid
	 * loss of pages in case the new page is added.
	 */
	START_CRIT_SECTION();

	if (page_found)
	{
		Assert(BufferIsValid(mapbuf));

		/* mark page "in use" in the bitmap */
		SETBIT(freep, bitmap_page_bit);
		MarkBufferDirty(mapbuf);
	}
	else
	{
		/* update the count to indicate new overflow page is added */
		metap->hashm_spares[splitnum]++;

		if (BufferIsValid(newmapbuf))
		{
			_hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
			MarkBufferDirty(newmapbuf);

			/* add the new bitmap page to the metapage's list of bitmaps */
			metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
			metap->hashm_nmaps++;
			metap->hashm_spares[splitnum]++;
		}

		MarkBufferDirty(metabuf);

		/*
		 * for new overflow page, we don't need to explicitly set the bit in
		 * bitmap page, as by default that will be set to "in use".
		 */
	}

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
	{
		metap->hashm_firstfree = bit + 1;
		MarkBufferDirty(metabuf);
	}

	/* initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);

	MarkBufferDirty(buf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		XLogRecPtr	recptr;
		xl_hash_add_ovfl_page xlrec;

		xlrec.bmpage_found = page_found;
		xlrec.bmsize = metap->hashm_bmsize;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);

		XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
		XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));

		XLogRegisterBuffer(1, buf, REGBUF_STANDARD);

		/* mapbuf is valid only when we recycled an existing free page */
		if (BufferIsValid(mapbuf))
		{
			XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
			XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
		}

		if (BufferIsValid(newmapbuf))
			XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);

		XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
		XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);

		PageSetLSN(BufferGetPage(ovflbuf), recptr);
		PageSetLSN(BufferGetPage(buf), recptr);

		if (BufferIsValid(mapbuf))
			PageSetLSN(BufferGetPage(mapbuf), recptr);

		if (BufferIsValid(newmapbuf))
			PageSetLSN(BufferGetPage(newmapbuf), recptr);

		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* release the tail page, keeping the pin only if the caller asked to */
	if (retain_pin)
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, buf);

	if (BufferIsValid(mapbuf))
		_hash_relbuf(rel, mapbuf);

	/* metapage pin is the caller's; just drop our lock */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	if (BufferIsValid(newmapbuf))
		_hash_relbuf(rel, newmapbuf);

	/* returned pinned and write-locked, as promised in the header comment */
	return ovflbuf;
}
440 | | |
441 | | /* |
442 | | * _hash_firstfreebit() |
443 | | * |
444 | | * Return the number of the first bit that is not set in the word 'map'. |
445 | | */ |
446 | | static uint32 |
447 | | _hash_firstfreebit(uint32 map) |
448 | 0 | { |
449 | 0 | uint32 i, |
450 | 0 | mask; |
451 | |
|
452 | 0 | mask = 0x1; |
453 | 0 | for (i = 0; i < BITS_PER_MAP; i++) |
454 | 0 | { |
455 | 0 | if (!(mask & map)) |
456 | 0 | return i; |
457 | 0 | mask <<= 1; |
458 | 0 | } |
459 | | |
460 | 0 | elog(ERROR, "firstfreebit found no free bit"); |
461 | | |
462 | 0 | return 0; /* keep compiler quiet */ |
463 | 0 | } |
464 | | |
465 | | /* |
466 | | * _hash_freeovflpage() - |
467 | | * |
468 | | * Remove this overflow page from its bucket's chain, and mark the page as |
469 | | * free. On entry, ovflbuf is write-locked; it is released before exiting. |
470 | | * |
471 | | * Add the tuples (itups) to wbuf in this function. We could do that in the |
472 | | * caller as well, but the advantage of doing it here is we can easily write |
473 | | * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and |
474 | | * removal of overflow page has to done as an atomic operation, otherwise |
475 | | * during replay on standby users might find duplicate records. |
476 | | * |
477 | | * Since this function is invoked in VACUUM, we provide an access strategy |
478 | | * parameter that controls fetches of the bucket pages. |
479 | | * |
480 | | * Returns the block number of the page that followed the given page |
481 | | * in the bucket, or InvalidBlockNumber if no following page. |
482 | | * |
483 | | * NB: caller must not hold lock on metapage, nor on page, that's next to |
484 | | * ovflbuf in the bucket chain. We don't acquire the lock on page that's |
485 | | * prior to ovflbuf in chain if it is same as wbuf because the caller already |
486 | | * has a lock on same. |
487 | | */ |
BlockNumber
_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
				   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
				   Size *tups_size, uint16 nitups,
				   BufferAccessStrategy bstrategy)
{
	HashMetaPage metap;
	Buffer		metabuf;
	Buffer		mapbuf;
	BlockNumber ovflblkno;
	BlockNumber prevblkno;
	BlockNumber blkno;
	BlockNumber nextblkno;
	BlockNumber writeblkno;
	HashPageOpaque ovflopaque;
	Page		ovflpage;
	Page		mappage;
	uint32	   *freep;
	uint32		ovflbitno;
	int32		bitmappage,
				bitmapbit;
	Bucket		bucket PG_USED_FOR_ASSERTS_ONLY;
	Buffer		prevbuf = InvalidBuffer;
	Buffer		nextbuf = InvalidBuffer;
	bool		update_metap = false;

	/* Get information from the doomed page */
	_hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
	ovflblkno = BufferGetBlockNumber(ovflbuf);
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	nextblkno = ovflopaque->hasho_nextblkno;
	prevblkno = ovflopaque->hasho_prevblkno;
	writeblkno = BufferGetBlockNumber(wbuf);
	bucket = ovflopaque->hasho_bucket;

	/*
	 * Fix up the bucket chain.  this is a doubly-linked list, so we must fix
	 * up the bucket chain members behind and ahead of the overflow page being
	 * deleted.  Concurrency issues are avoided by using lock chaining as
	 * described atop hashbucketcleanup.
	 */
	if (BlockNumberIsValid(prevblkno))
	{
		/* avoid double-locking when the caller already holds the prev page */
		if (prevblkno == writeblkno)
			prevbuf = wbuf;
		else
			prevbuf = _hash_getbuf_with_strategy(rel,
												 prevblkno,
												 HASH_WRITE,
												 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
												 bstrategy);
	}
	if (BlockNumberIsValid(nextblkno))
		nextbuf = _hash_getbuf_with_strategy(rel,
											 nextblkno,
											 HASH_WRITE,
											 LH_OVERFLOW_PAGE,
											 bstrategy);

	/* Note: bstrategy is intentionally not used for metapage and bitmap */

	/* Read the metapage so we can determine which bitmap page to use */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* Identify which bit to set */
	ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);

	bitmappage = ovflbitno >> BMPG_SHIFT(metap);
	bitmapbit = ovflbitno & BMPG_MASK(metap);

	if (bitmappage >= metap->hashm_nmaps)
		elog(ERROR, "invalid overflow bit number %u", ovflbitno);
	blkno = metap->hashm_mapp[bitmappage];

	/* Release metapage lock while we access the bitmap page */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	/* read the bitmap page to clear the bitmap bit */
	mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
	mappage = BufferGetPage(mapbuf);
	freep = HashPageGetBitmap(mappage);
	/* the page being freed must currently be marked "in use" */
	Assert(ISSET(freep, bitmapbit));

	/* Get write-lock on metapage to update firstfree */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	/* This operation needs to log multiple tuples, prepare WAL for that */
	if (RelationNeedsWAL(rel))
		XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);

	START_CRIT_SECTION();

	/*
	 * we have to insert tuples on the "write" page, being careful to preserve
	 * hashkey ordering.  (If we insert many tuples into the same "write" page
	 * it would be worth qsort'ing them).
	 */
	if (nitups > 0)
	{
		_hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
		MarkBufferDirty(wbuf);
	}

	/*
	 * Reinitialize the freed overflow page.  Just zeroing the page won't
	 * work, because WAL replay routines expect pages to be initialized. See
	 * explanation of RBM_NORMAL mode atop XLogReadBufferExtended.  We are
	 * careful to make the special space valid here so that tools like
	 * pageinspect won't get confused.
	 */
	_hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

	/* _hash_pageinit wiped the page, so refetch the special pointer */
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

	ovflopaque->hasho_prevblkno = InvalidBlockNumber;
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = -1;
	ovflopaque->hasho_flag = LH_UNUSED_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* unlink the doomed page from its neighbors in the chain */
	if (BufferIsValid(prevbuf))
	{
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		Assert(prevopaque->hasho_bucket == bucket);
		prevopaque->hasho_nextblkno = nextblkno;
		MarkBufferDirty(prevbuf);
	}
	if (BufferIsValid(nextbuf))
	{
		Page		nextpage = BufferGetPage(nextbuf);
		HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

		Assert(nextopaque->hasho_bucket == bucket);
		nextopaque->hasho_prevblkno = prevblkno;
		MarkBufferDirty(nextbuf);
	}

	/* Clear the bitmap bit to indicate that this overflow page is free */
	CLRBIT(freep, bitmapbit);
	MarkBufferDirty(mapbuf);

	/* if this is now the first free page, update hashm_firstfree */
	if (ovflbitno < metap->hashm_firstfree)
	{
		metap->hashm_firstfree = ovflbitno;
		update_metap = true;
		MarkBufferDirty(metabuf);
	}

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_squeeze_page xlrec;
		XLogRecPtr	recptr;
		int			i;

		xlrec.prevblkno = prevblkno;
		xlrec.nextblkno = nextblkno;
		xlrec.ntups = nitups;
		xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
		xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);

		/*
		 * bucket buffer needs to be registered to ensure that we can acquire
		 * a cleanup lock on it during replay.
		 */
		if (!xlrec.is_prim_bucket_same_wrt)
			XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

		XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
		if (xlrec.ntups > 0)
		{
			XLogRegisterBufData(1, (char *) itup_offsets,
								nitups * sizeof(OffsetNumber));
			for (i = 0; i < nitups; i++)
				XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
		}

		XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);

		/*
		 * If prevpage and the writepage (block in which we are moving tuples
		 * from overflow) are same, then no need to separately register
		 * prevpage.  During replay, we can directly update the nextblock in
		 * writepage.
		 */
		if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
			XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);

		if (BufferIsValid(nextbuf))
			XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);

		XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
		XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));

		if (update_metap)
		{
			XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
			XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
		}

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);

		PageSetLSN(BufferGetPage(wbuf), recptr);
		PageSetLSN(BufferGetPage(ovflbuf), recptr);

		if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
			PageSetLSN(BufferGetPage(prevbuf), recptr);
		if (BufferIsValid(nextbuf))
			PageSetLSN(BufferGetPage(nextbuf), recptr);

		PageSetLSN(BufferGetPage(mapbuf), recptr);

		if (update_metap)
			PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* release previous bucket if it is not same as write bucket */
	if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
		_hash_relbuf(rel, prevbuf);

	if (BufferIsValid(ovflbuf))
		_hash_relbuf(rel, ovflbuf);

	if (BufferIsValid(nextbuf))
		_hash_relbuf(rel, nextbuf);

	_hash_relbuf(rel, mapbuf);
	_hash_relbuf(rel, metabuf);

	return nextblkno;
}
731 | | |
732 | | |
733 | | /* |
734 | | * _hash_initbitmapbuffer() |
735 | | * |
736 | | * Initialize a new bitmap page. All bits in the new bitmap page are set to |
737 | | * "1", indicating "in use". |
738 | | */ |
739 | | void |
740 | | _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage) |
741 | 1 | { |
742 | 1 | Page pg; |
743 | 1 | HashPageOpaque op; |
744 | 1 | uint32 *freep; |
745 | | |
746 | 1 | pg = BufferGetPage(buf); |
747 | | |
748 | | /* initialize the page */ |
749 | 1 | if (initpage) |
750 | 0 | _hash_pageinit(pg, BufferGetPageSize(buf)); |
751 | | |
752 | | /* initialize the page's special space */ |
753 | 1 | op = (HashPageOpaque) PageGetSpecialPointer(pg); |
754 | 1 | op->hasho_prevblkno = InvalidBlockNumber; |
755 | 1 | op->hasho_nextblkno = InvalidBlockNumber; |
756 | 1 | op->hasho_bucket = -1; |
757 | 1 | op->hasho_flag = LH_BITMAP_PAGE; |
758 | 1 | op->hasho_page_id = HASHO_PAGE_ID; |
759 | | |
760 | | /* set all of the bits to 1 */ |
761 | 1 | freep = HashPageGetBitmap(pg); |
762 | 1 | MemSet(freep, 0xFF, bmsize); |
763 | | |
764 | | /* |
765 | | * Set pd_lower just past the end of the bitmap page data. We could even |
766 | | * set pd_lower equal to pd_upper, but this is more precise and makes the |
767 | | * page look compressible to xlog.c. |
768 | | */ |
769 | 1 | ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg; |
770 | 1 | } |
771 | | |
772 | | |
773 | | /* |
774 | | * _hash_squeezebucket(rel, bucket) |
775 | | * |
776 | | * Try to squeeze the tuples onto pages occurring earlier in the |
777 | | * bucket chain in an attempt to free overflow pages. When we start |
778 | | * the "squeezing", the page from which we start taking tuples (the |
779 | | * "read" page) is the last bucket in the bucket chain and the page |
780 | | * onto which we start squeezing tuples (the "write" page) is the |
781 | | * first page in the bucket chain. The read page works backward and |
782 | | * the write page works forward; the procedure terminates when the |
783 | | * read page and write page are the same page. |
784 | | * |
785 | | * At completion of this procedure, it is guaranteed that all pages in |
786 | | * the bucket are nonempty, unless the bucket is totally empty (in |
787 | | * which case all overflow pages will be freed). The original implementation |
788 | | * required that to be true on entry as well, but it's a lot easier for |
789 | | * callers to leave empty overflow pages and let this guy clean it up. |
790 | | * |
791 | | * Caller must acquire cleanup lock on the primary page of the target |
792 | | * bucket to exclude any scans that are in progress, which could easily |
793 | | * be confused into returning the same tuple more than once or some tuples |
794 | | * not at all by the rearrangement we are performing here. To prevent |
795 | | * any concurrent scan to cross the squeeze scan we use lock chaining |
796 | | * similar to hashbucketcleanup. Refer comments atop hashbucketcleanup. |
797 | | * |
798 | | * We need to retain a pin on the primary bucket to ensure that no concurrent |
799 | | * split can start. |
800 | | * |
801 | | * Since this function is invoked in VACUUM, we provide an access strategy |
802 | | * parameter that controls fetches of the bucket pages. |
803 | | */ |
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno,
					Buffer bucket_buf,
					BufferAccessStrategy bstrategy)
{
	/* "write" side: the page we are currently squeezing tuples onto */
	BlockNumber wblkno;
	/* "read" side: the page we are currently taking tuples from */
	BlockNumber rblkno;
	Buffer		wbuf;
	Buffer		rbuf;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;

	/*
	 * start squeezing into the primary bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = bucket_buf;
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze. caller
	 * is responsible for releasing the pin on primary bucket page.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
		return;
	}

	/*
	 * Find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.  Note: we assume that a hash bucket chain is
	 * usually smaller than the buffer ring being used by VACUUM, else using
	 * the access strategy here would be counterproductive.
	 */
	rbuf = InvalidBuffer;
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		/* release the previous read page before locking the next one */
		if (rbuf != InvalidBuffer)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 */
	for (;;)
	{
		OffsetNumber roffnum;
		OffsetNumber maxroffnum;
		/* offsets of tuples on the read page that have been copied out */
		OffsetNumber deletable[MaxOffsetNumber];
		/* copies of the moved tuples, their sizes, and their target offsets;
		 * accumulated so one WAL record can cover the whole batch */
		IndexTuple	itups[MaxIndexTuplesPerPage];
		Size		tups_size[MaxIndexTuplesPerPage];
		OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
		uint16		ndeletable = 0;
		uint16		nitups = 0;
		Size		all_tups_size = 0;
		int			i;
		/* true while wbuf is the primary bucket page, whose pin we must keep */
		bool		retain_pin = false;

readpage:
		/* Scan each tuple in "read" page */
		maxroffnum = PageGetMaxOffsetNumber(rpage);
		for (roffnum = FirstOffsetNumber;
			 roffnum <= maxroffnum;
			 roffnum = OffsetNumberNext(roffnum))
		{
			IndexTuple	itup;
			Size		itemsz;

			/* skip dead tuples */
			if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
				continue;

			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleSize(itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item and all other accumulated items.  Exit if we reach
			 * the read page.
			 */
			while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
			{
				Buffer		next_wbuf = InvalidBuffer;
				/* set if we flushed a batch of tuples to the write page */
				bool		tups_moved = false;

				Assert(!PageIsEmpty(wpage));

				if (wblkno == bucket_blkno)
					retain_pin = true;

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				/* don't need to move to next page if we reached the read page */
				if (wblkno != rblkno)
					next_wbuf = _hash_getbuf_with_strategy(rel,
														   wblkno,
														   HASH_WRITE,
														   LH_OVERFLOW_PAGE,
														   bstrategy);

				/* flush any tuples accumulated for the current write page */
				if (nitups > 0)
				{
					Assert(nitups == ndeletable);

					/*
					 * This operation needs to log multiple tuples, prepare
					 * WAL for that.
					 */
					if (RelationNeedsWAL(rel))
						XLogEnsureRecordSpace(0, 3 + nitups);

					START_CRIT_SECTION();

					/*
					 * we have to insert tuples on the "write" page, being
					 * careful to preserve hashkey ordering.  (If we insert
					 * many tuples into the same "write" page it would be
					 * worth qsort'ing them).
					 */
					_hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
					MarkBufferDirty(wbuf);

					/* Delete tuples we already moved off read page */
					PageIndexMultiDelete(rpage, deletable, ndeletable);
					MarkBufferDirty(rbuf);

					/* XLOG stuff */
					if (RelationNeedsWAL(rel))
					{
						XLogRecPtr	recptr;
						xl_hash_move_page_contents xlrec;

						xlrec.ntups = nitups;
						xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false;

						XLogBeginInsert();
						XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);

						/*
						 * bucket buffer needs to be registered to ensure that
						 * we can acquire a cleanup lock on it during replay.
						 */
						if (!xlrec.is_prim_bucket_same_wrt)
							XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

						/* write page: target offsets followed by tuple data */
						XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
						XLogRegisterBufData(1, (char *) itup_offsets,
											nitups * sizeof(OffsetNumber));
						for (i = 0; i < nitups; i++)
							XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);

						/* read page: offsets of the tuples removed from it */
						XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
						XLogRegisterBufData(2, (char *) deletable,
											ndeletable * sizeof(OffsetNumber));

						recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);

						PageSetLSN(BufferGetPage(wbuf), recptr);
						PageSetLSN(BufferGetPage(rbuf), recptr);
					}

					END_CRIT_SECTION();

					tups_moved = true;
				}

				/*
				 * release the lock on previous page after acquiring the lock
				 * on next page
				 */
				if (retain_pin)
					LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
				else
					_hash_relbuf(rel, wbuf);

				/* nothing more to do if we reached the read page */
				if (rblkno == wblkno)
				{
					_hash_relbuf(rel, rbuf);
					return;
				}

				/* advance the write side to the next page in the chain */
				wbuf = next_wbuf;
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
				retain_pin = false;

				/* be tidy */
				for (i = 0; i < nitups; i++)
					pfree(itups[i]);
				nitups = 0;
				all_tups_size = 0;
				ndeletable = 0;

				/*
				 * after moving the tuples, rpage would have been compacted,
				 * so we need to rescan it.
				 */
				if (tups_moved)
					goto readpage;
			}

			/* remember tuple for deletion from "read" page */
			deletable[ndeletable++] = roffnum;

			/*
			 * we need a copy of index tuples as they can be freed as part of
			 * overflow page, however we need them to write a WAL record in
			 * _hash_freeovflpage.
			 */
			itups[nitups] = CopyIndexTuple(itup);
			tups_size[nitups++] = itemsz;
			all_tups_size += itemsz;
		}

		/*
		 * If we reach here, there are no live tuples on the "read" page ---
		 * it was empty when we got to it, or we moved them all.  So we can
		 * just free the page without bothering with deleting tuples
		 * individually.  Then advance to the previous "read" page.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  In that case, we don't need to lock it again.
		 */
		rblkno = ropaque->hasho_prevblkno;
		Assert(BlockNumberIsValid(rblkno));

		/* free this overflow page (releases rbuf) */
		_hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
						   tups_size, nitups, bstrategy);

		/* be tidy */
		for (i = 0; i < nitups; i++)
			pfree(itups[i]);

		/* are we freeing the page adjacent to wbuf? */
		if (rblkno == wblkno)
		{
			/* retain the pin on primary bucket page till end of bucket scan */
			if (wblkno == bucket_blkno)
				LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
			else
				_hash_relbuf(rel, wbuf);
			return;
		}

		/* lock the previous page in the chain as the new read page */
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	}

	/* NOTREACHED */
}