[フレーム]

hashpage.c

1/*-------------------------------------------------------------------------

2 *

3 * hashpage.c

4 * Hash table page management code for the Postgres hash access method

5 *

8 *

9 *

10 * IDENTIFICATION

11 * src/backend/access/hash/hashpage.c

12 *

13 * NOTES

14 * Postgres hash pages look like ordinary relation pages. The opaque

15 * data at high addresses includes information about the page including

16 * whether a page is an overflow page or a true bucket, the bucket

17 * number, and the block numbers of the preceding and following pages

18 * in the same bucket.

19 *

20 * The first page in a hash relation, page zero, is special -- it stores

21 * information describing the hash table; it is referred to as the

22 * "meta page." Pages one and higher store the actual data.

23 *

24 * There are also bitmap pages, which are not manipulated here;

25 * see hashovfl.c.

26 *

27 *-------------------------------------------------------------------------

28 */

29#include "postgres.h"

30

31#include "access/hash.h"

32#include "access/hash_xlog.h"

33#include "access/xloginsert.h"

34#include "miscadmin.h"

35#include "port/pg_bitutils.h"

36#include "storage/predicate.h"

37#include "storage/smgr.h"

38#include "utils/rel.h"

39

40static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,

41 uint32 nblocks);

42static void _hash_splitbucket(Relation rel, Buffer metabuf,

43 Bucket obucket, Bucket nbucket,

44 Buffer obuf,

45 Buffer nbuf,

46 HTAB *htab,

47 uint32 maxbucket,

48 uint32 highmask, uint32 lowmask);

49static void log_split_page(Relation rel, Buffer buf);

50

51

52/*

53 * _hash_getbuf() -- Get a buffer by block number for read or write.

54 *

55 * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.

56 * 'flags' is a bitwise OR of the allowed page types.

57 *

58 * This must be used only to fetch pages that are expected to be valid

59 * already. _hash_checkpage() is applied using the given flags.

60 *

61 * When this routine returns, the appropriate lock is set on the

62 * requested buffer and its reference count has been incremented

63 * (ie, the buffer is "locked and pinned").

64 *

65 * P_NEW is disallowed because this routine can only be used

66 * to access pages that are known to be before the filesystem EOF.

67 * Extending the index should be done with _hash_getnewbuf.

68 */

69Buffer

70 _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)

71{

72 Buffer buf;

73

74 if (blkno == P_NEW)

75 elog(ERROR, "hash AM does not use P_NEW");

76

77 buf = ReadBuffer(rel, blkno);

78

79 if (access != HASH_NOLOCK)

80 LockBuffer(buf, access);

81

82 /* ref count and lock type are correct */

83

84 _hash_checkpage(rel, buf, flags);

85

86 return buf;

87}

88

89/*

90 * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.

91 *

92 * We read the page and try to acquire a cleanup lock. If we get it,

93 * we return the buffer; otherwise, we return InvalidBuffer.

94 */

95Buffer

96 _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)

97{

98 Buffer buf;

99

100 if (blkno == P_NEW)

101 elog(ERROR, "hash AM does not use P_NEW");

102

103 buf = ReadBuffer(rel, blkno);

104

105 if (!ConditionalLockBufferForCleanup(buf))

106 {

107 ReleaseBuffer(buf);

108 return InvalidBuffer;

109 }

110

111 /* ref count and lock type are correct */

112

113 _hash_checkpage(rel, buf, flags);

114

115 return buf;

116}

117

118/*

119 * _hash_getinitbuf() -- Get and initialize a buffer by block number.

120 *

121 * This must be used only to fetch pages that are known to be before

122 * the index's filesystem EOF, but are to be filled from scratch.

123 * _hash_pageinit() is applied automatically. Otherwise it has

124 * effects similar to _hash_getbuf() with access = HASH_WRITE.

125 *

126 * When this routine returns, a write lock is set on the

127 * requested buffer and its reference count has been incremented

128 * (ie, the buffer is "locked and pinned").

129 *

130 * P_NEW is disallowed because this routine can only be used

131 * to access pages that are known to be before the filesystem EOF.

132 * Extending the index should be done with _hash_getnewbuf.

133 */

134Buffer

135 _hash_getinitbuf(Relation rel, BlockNumber blkno)

136{

137 Buffer buf;

138

139 if (blkno == P_NEW)

140 elog(ERROR, "hash AM does not use P_NEW");

141

142 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,

143 NULL);

144

145 /* ref count and lock type are correct */

146

147 /* initialize the page */

148 _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));

149

150 return buf;

151}

152

153/*

154 * _hash_initbuf() -- Get and initialize a buffer by bucket number.

155 */

156void

157 _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,

158 bool initpage)

159{

160 HashPageOpaque pageopaque;

161 Page page;

162

163 page = BufferGetPage(buf);

164

165 /* initialize the page */

166 if (initpage)

167 _hash_pageinit(page, BufferGetPageSize(buf));

168

169 pageopaque = HashPageGetOpaque(page);

170

171 /*

172 * Set hasho_prevblkno with current hashm_maxbucket. This value will be

173 * used to validate cached HashMetaPageData. See

174 * _hash_getbucketbuf_from_hashkey().

175 */

176 pageopaque->hasho_prevblkno = max_bucket;

177 pageopaque->hasho_nextblkno = InvalidBlockNumber;

178 pageopaque->hasho_bucket = num_bucket;

179 pageopaque->hasho_flag = flag;

180 pageopaque->hasho_page_id = HASHO_PAGE_ID;

181}

182

183/*

184 * _hash_getnewbuf() -- Get a new page at the end of the index.

185 *

186 * This has the same API as _hash_getinitbuf, except that we are adding

187 * a page to the index, and hence expect the page to be past the

188 * logical EOF. (However, we have to support the case where it isn't,

189 * since a prior try might have crashed after extending the filesystem

190 * EOF but before updating the metapage to reflect the added page.)

191 *

192 * It is caller's responsibility to ensure that only one process can

193 * extend the index at a time. In practice, this function is called

194 * only while holding write lock on the metapage, because adding a page

195 * is always associated with an update of metapage data.

196 */

197Buffer

198 _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)

199{

200 BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);

201 Buffer buf;

202

203 if (blkno == P_NEW)

204 elog(ERROR, "hash AM does not use P_NEW");

205 if (blkno > nblocks)

206 elog(ERROR, "access to noncontiguous page in hash index \"%s\"",

207 RelationGetRelationName(rel));

208

209 /* smgr insists we explicitly extend the relation */

210 if (blkno == nblocks)

211 {

212 buf = ExtendBufferedRel(BMR_REL(rel), forkNum, NULL,

213 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);

214 if (BufferGetBlockNumber(buf) != blkno)

215 elog(ERROR, "unexpected hash relation size: %u, should be %u",

216 BufferGetBlockNumber(buf), blkno);

217 }

218 else

219 {

220 buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,

221 NULL);

222 }

223

224 /* ref count and lock type are correct */

225

226 /* initialize the page */

227 _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));

228

229 return buf;

230}

231

232/*

233 * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.

234 *

235 * This is identical to _hash_getbuf() but also allows a buffer access

236 * strategy to be specified. We use this for VACUUM operations.

237 */

238Buffer

239 _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,

240 int access, int flags,

241 BufferAccessStrategy bstrategy)

242{

243 Buffer buf;

244

245 if (blkno == P_NEW)

246 elog(ERROR, "hash AM does not use P_NEW");

247

248 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);

249

250 if (access != HASH_NOLOCK)

251 LockBuffer(buf, access);

252

253 /* ref count and lock type are correct */

254

255 _hash_checkpage(rel, buf, flags);

256

257 return buf;

258}

259

260/*

261 * _hash_relbuf() -- release a locked buffer.

262 *

263 * Lock and pin (refcount) are both dropped.

264 */

265void

266 _hash_relbuf(Relation rel, Buffer buf)

267{

268 UnlockReleaseBuffer(buf);

269}

270

271/*

272 * _hash_dropbuf() -- release an unlocked buffer.

273 *

274 * This is used to unpin a buffer on which we hold no lock.

275 */

276void

277 _hash_dropbuf(Relation rel, Buffer buf)

278{

279 ReleaseBuffer(buf);

280}

281

282/*

283 * _hash_dropscanbuf() -- release buffers used in scan.

284 *

285 * This routine unpins the buffers used during scan on which we

286 * hold no lock.

287 */

288void

289 _hash_dropscanbuf(Relation rel, HashScanOpaque so)

290{

291 /* release pin we hold on primary bucket page */

292 if (BufferIsValid(so->hashso_bucket_buf) &&

293 so->hashso_bucket_buf != so->currPos.buf)

294 _hash_dropbuf(rel, so->hashso_bucket_buf);

295 so->hashso_bucket_buf = InvalidBuffer;

296

297 /* release pin we hold on primary bucket page of bucket being split */

298 if (BufferIsValid(so->hashso_split_bucket_buf) &&

299 so->hashso_split_bucket_buf != so->currPos.buf)

300 _hash_dropbuf(rel, so->hashso_split_bucket_buf);

301 so->hashso_split_bucket_buf = InvalidBuffer;

302

303 /* release any pin we still hold */

304 if (BufferIsValid(so->currPos.buf))

305 _hash_dropbuf(rel, so->currPos.buf);

306 so->currPos.buf = InvalidBuffer;

307

308 /* reset split scan */

309 so->hashso_buc_populated = false;

310 so->hashso_buc_split = false;

311}

312

313

314/*

315 * _hash_init() -- Initialize the metadata page of a hash index,

316 * the initial buckets, and the initial bitmap page.

317 *

318 * The initial number of buckets is dependent on num_tuples, an estimate

319 * of the number of tuples to be loaded into the index initially. The

320 * chosen number of buckets is returned.

321 *

322 * We are fairly cavalier about locking here, since we know that no one else

323 * could be accessing this index. In particular the rule about not holding

324 * multiple buffer locks is ignored.

325 */

326uint32

327 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)

328{

329 Buffer metabuf;

330 Buffer buf;

331 Buffer bitmapbuf;

332 Page pg;

333 HashMetaPage metap;

334 RegProcedure procid;

335 int32 data_width;

336 int32 item_width;

337 int32 ffactor;

338 uint32 num_buckets;

339 uint32 i;

340 bool use_wal;

341

342 /* safety check */

343 if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)

344 elog(ERROR, "cannot initialize non-empty hash index \"%s\"",

345 RelationGetRelationName(rel));

346

347 /*

348 * WAL log creation of pages if the relation is persistent, or this is the

349 * init fork. Init forks for unlogged relations always need to be WAL

350 * logged.

351 */

352 use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;

353

354 /*

355 * Determine the target fill factor (in tuples per bucket) for this index.

356 * The idea is to make the fill factor correspond to pages about as full

357 * as the user-settable fillfactor parameter says. We can compute it

358 * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.

359 */

360 data_width = sizeof(uint32);

361 item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +

362 sizeof(ItemIdData); /* include the line pointer */

363 ffactor = HashGetTargetPageUsage(rel) / item_width;

364 /* keep to a sane range */

365 if (ffactor < 10)

366 ffactor = 10;

367

368 procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);

369

370 /*

371 * We initialize the metapage, the first N bucket pages, and the first

372 * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()

373 * calls to occur. This ensures that the smgr level has the right idea of

374 * the physical index length.

375 *

376 * Critical section not required, because on error the creation of the

377 * whole relation will be rolled back.

378 */

379 metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);

380 _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);

381 MarkBufferDirty(metabuf);

382

383 pg = BufferGetPage(metabuf);

384 metap = HashPageGetMeta(pg);

385

386 /* XLOG stuff */

387 if (use_wal)

388 {

389 xl_hash_init_meta_page xlrec;

390 XLogRecPtr recptr;

391

392 xlrec.num_tuples = num_tuples;

393 xlrec.procid = metap->hashm_procid;

394 xlrec.ffactor = metap->hashm_ffactor;

395

396 XLogBeginInsert();

397 XLogRegisterData(&xlrec, SizeOfHashInitMetaPage);

398 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

399

400 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);

401

402 PageSetLSN(BufferGetPage(metabuf), recptr);

403 }

404

405 num_buckets = metap->hashm_maxbucket + 1;

406

407 /*

408 * Release buffer lock on the metapage while we initialize buckets.

409 * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS

410 * won't accomplish anything. It's a bad idea to hold buffer locks for

411 * long intervals in any case, since that can block the bgwriter.

412 */

413 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

414

415 /*

416 * Initialize and WAL Log the first N buckets

417 */

418 for (i = 0; i < num_buckets; i++)

419 {

420 BlockNumber blkno;

421

422 /* Allow interrupts, in case N is huge */

423 CHECK_FOR_INTERRUPTS();

424

425 blkno = BUCKET_TO_BLKNO(metap, i);

426 buf = _hash_getnewbuf(rel, blkno, forkNum);

427 _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);

428 MarkBufferDirty(buf);

429

430 if (use_wal)

431 log_newpage(&rel->rd_locator,

432 forkNum,

433 blkno,

434 BufferGetPage(buf),

435 true);

436 _hash_relbuf(rel, buf);

437 }

438

439 /* Now reacquire buffer lock on metapage */

440 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

441

442 /*

443 * Initialize bitmap page

444 */

445 bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);

446 _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);

447 MarkBufferDirty(bitmapbuf);

448

449 /* add the new bitmap page to the metapage's list of bitmaps */

450 /* metapage already has a write lock */

451 if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)

452 ereport(ERROR,

453 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),

454 errmsg("out of overflow pages in hash index \"%s\"",

455 RelationGetRelationName(rel))));

456

457 metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;

458

459 metap->hashm_nmaps++;

460 MarkBufferDirty(metabuf);

461

462 /* XLOG stuff */

463 if (use_wal)

464 {

465 xl_hash_init_bitmap_page xlrec;

466 XLogRecPtr recptr;

467

468 xlrec.bmsize = metap->hashm_bmsize;

469

470 XLogBeginInsert();

471 XLogRegisterData(&xlrec, SizeOfHashInitBitmapPage);

472 XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);

473

474 /*

475 * This is safe only because nobody else can be modifying the index at

476 * this stage; it's only visible to the transaction that is creating

477 * it.

478 */

479 XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

480

481 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);

482

483 PageSetLSN(BufferGetPage(bitmapbuf), recptr);

484 PageSetLSN(BufferGetPage(metabuf), recptr);

485 }

486

487 /* all done */

488 _hash_relbuf(rel, bitmapbuf);

489 _hash_relbuf(rel, metabuf);

490

491 return num_buckets;

492}

493

494/*

495 * _hash_init_metabuffer() -- Initialize the metadata page of a hash index.

496 */

497void

498 _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,

499 uint16 ffactor, bool initpage)

500{

501 HashMetaPage metap;

502 HashPageOpaque pageopaque;

503 Page page;

504 double dnumbuckets;

505 uint32 num_buckets;

506 uint32 spare_index;

507 uint32 lshift;

508

509 /*

510 * Choose the number of initial bucket pages to match the fill factor

511 * given the estimated number of tuples. We round up the result to the

512 * total number of buckets which has to be allocated before using its

513 * hashm_spares element. However always force at least 2 bucket pages. The

514 * upper limit is determined by considerations explained in

515 * _hash_expandtable().

516 */

517 dnumbuckets = num_tuples / ffactor;

518 if (dnumbuckets <= 2.0)

519 num_buckets = 2;

520 else if (dnumbuckets >= (double) 0x40000000)

521 num_buckets = 0x40000000;

522 else

523 num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));

524

525 spare_index = _hash_spareindex(num_buckets);

526 Assert(spare_index < HASH_MAX_SPLITPOINTS);

527

528 page = BufferGetPage(buf);

529 if (initpage)

530 _hash_pageinit(page, BufferGetPageSize(buf));

531

532 pageopaque = HashPageGetOpaque(page);

533 pageopaque->hasho_prevblkno = InvalidBlockNumber;

534 pageopaque->hasho_nextblkno = InvalidBlockNumber;

535 pageopaque->hasho_bucket = InvalidBucket;

536 pageopaque->hasho_flag = LH_META_PAGE;

537 pageopaque->hasho_page_id = HASHO_PAGE_ID;

538

539 metap = HashPageGetMeta(page);

540

541 metap->hashm_magic = HASH_MAGIC;

542 metap->hashm_version = HASH_VERSION;

543 metap->hashm_ntuples = 0;

544 metap->hashm_nmaps = 0;

545 metap->hashm_ffactor = ffactor;

546 metap->hashm_bsize = HashGetMaxBitmapSize(page);

547

548 /* find largest bitmap array size that will fit in page size */

549 lshift = pg_leftmost_one_pos32(metap->hashm_bsize);

550 Assert(lshift > 0);

551 metap->hashm_bmsize = 1 << lshift;

552 metap->hashm_bmshift = lshift + BYTE_TO_BIT;

553 Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));

554

555 /*

556 * Label the index with its primary hash support function's OID. This is

557 * pretty useless for normal operation (in fact, hashm_procid is not used

558 * anywhere), but it might be handy for forensic purposes so we keep it.

559 */

560 metap->hashm_procid = procid;

561

562 /*

563 * We initialize the index with N buckets, 0 .. N-1, occupying physical

564 * blocks 1 to N. The first freespace bitmap page is in block N+1.

565 */

566 metap->hashm_maxbucket = num_buckets - 1;

567

568 /*

569 * Set highmask as next immediate ((2 ^ x) - 1), which should be

570 * sufficient to cover num_buckets.

571 */

572 metap->hashm_highmask = pg_nextpower2_32(num_buckets + 1) - 1;

573 metap->hashm_lowmask = (metap->hashm_highmask >> 1);

574

575 MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));

576 MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

577

578 /* Set up mapping for one spare page after the initial splitpoints */

579 metap->hashm_spares[spare_index] = 1;

580 metap->hashm_ovflpoint = spare_index;

581 metap->hashm_firstfree = 0;

582

583 /*

584 * Set pd_lower just past the end of the metadata. This is essential,

585 * because without doing so, metadata will be lost if xlog.c compresses

586 * the page.

587 */

588 ((PageHeader) page)->pd_lower =

589 ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;

590}

591

592/*

593 * _hash_pageinit() -- Initialize a new hash index page.

594 */

595void

596 _hash_pageinit(Page page, Size size)

597{

598 PageInit(page, size, sizeof(HashPageOpaqueData));

599}

600

601/*

602 * Attempt to expand the hash table by creating one new bucket.

603 *

604 * This will silently do nothing if we don't get cleanup lock on old or

605 * new bucket.

606 *

607 * Complete the pending splits and remove the tuples from old bucket,

608 * if there are any left over from the previous split.

609 *

610 * The caller must hold a pin, but no lock, on the metapage buffer.

611 * The buffer is returned in the same state.

612 */

613void

614 _hash_expandtable(Relation rel, Buffer metabuf)

615{

616 HashMetaPage metap;

617 Bucket old_bucket;

618 Bucket new_bucket;

619 uint32 spare_ndx;

620 BlockNumber start_oblkno;

621 BlockNumber start_nblkno;

622 Buffer buf_nblkno;

623 Buffer buf_oblkno;

624 Page opage;

625 Page npage;

626 HashPageOpaque oopaque;

627 HashPageOpaque nopaque;

628 uint32 maxbucket;

629 uint32 highmask;

630 uint32 lowmask;

631 bool metap_update_masks = false;

632 bool metap_update_splitpoint = false;

633

634restart_expand:

635

636 /*

637 * Write-lock the meta page. It used to be necessary to acquire a

638 * heavyweight lock to begin a split, but that is no longer required.

639 */

640 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

641

642 _hash_checkpage(rel, metabuf, LH_META_PAGE);

643 metap = HashPageGetMeta(BufferGetPage(metabuf));

644

645 /*

646 * Check to see if split is still needed; someone else might have already

647 * done one while we waited for the lock.

648 *

649 * Make sure this stays in sync with _hash_doinsert()

650 */

651 if (metap->hashm_ntuples <=

652 (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))

653 goto fail;

654

655 /*

656 * Can't split anymore if maxbucket has reached its maximum possible

657 * value.

658 *

659 * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because

660 * the calculation maxbucket+1 mustn't overflow). Currently we restrict

661 * to half that to prevent failure of pg_ceil_log2_32() and insufficient

662 * space in hashm_spares[]. It's moot anyway because an index with 2^32

663 * buckets would certainly overflow BlockNumber and hence

664 * _hash_alloc_buckets() would fail, but if we supported buckets smaller

665 * than a disk block then this would be an independent constraint.

666 *

667 * If you change this, see also the maximum initial number of buckets in

668 * _hash_init().

669 */

670 if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)

671 goto fail;

672

673 /*

674 * Determine which bucket is to be split, and attempt to take cleanup lock

675 * on the old bucket. If we can't get the lock, give up.

676 *

677 * The cleanup lock protects us not only against other backends, but

678 * against our own backend as well.

679 *

680 * The cleanup lock is mainly to protect the split from concurrent

681 * inserts. See src/backend/access/hash/README, Lock Definitions for

682 * further details. Due to this locking restriction, if there is any

683 * pending scan, the split will give up which is not good, but harmless.

684 */

685 new_bucket = metap->hashm_maxbucket + 1;

686

687 old_bucket = (new_bucket & metap->hashm_lowmask);

688

689 start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

690

691 buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);

692 if (!buf_oblkno)

693 goto fail;

694

695 opage = BufferGetPage(buf_oblkno);

696 oopaque = HashPageGetOpaque(opage);

697

698 /*

699 * We want to finish the split from a bucket as there is no apparent

700 * benefit by not doing so and it will make the code complicated to finish

701 * the split that involves multiple buckets considering the case where new

702 * split also fails. We don't need to consider the new bucket for

703 * completing the split here as it is not possible that a re-split of new

704 * bucket starts when there is still a pending split from old bucket.

705 */

706 if (H_BUCKET_BEING_SPLIT(oopaque))

707 {

708 /*

709 * Copy bucket mapping info now; refer the comment in code below where

710 * we copy this information before calling _hash_splitbucket to see

711 * why this is okay.

712 */

713 maxbucket = metap->hashm_maxbucket;

714 highmask = metap->hashm_highmask;

715 lowmask = metap->hashm_lowmask;

716

717 /*

718 * Release the lock on metapage and old_bucket, before completing the

719 * split.

720 */

721 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

722 LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);

723

724 _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,

725 highmask, lowmask);

726

727 /* release the pin on old buffer and retry for expand. */

728 _hash_dropbuf(rel, buf_oblkno);

729

730 goto restart_expand;

731 }

732

733 /*

734 * Clean the tuples remained from the previous split. This operation

735 * requires cleanup lock and we already have one on the old bucket, so

736 * let's do it. We also don't want to allow further splits from the bucket

737 * till the garbage of previous split is cleaned. This has two

738 * advantages; first, it helps in avoiding the bloat due to garbage and

739 * second is, during cleanup of bucket, we are always sure that the

740 * garbage tuples belong to most recently split bucket. On the contrary,

741 * if we allow cleanup of bucket after meta page is updated to indicate

742 * the new split and before the actual split, the cleanup operation won't

743 * be able to decide whether the tuple has been moved to the newly created

744 * bucket and ended up deleting such tuples.

745 */

746 if (H_NEEDS_SPLIT_CLEANUP(oopaque))

747 {

748 /*

749 * Copy bucket mapping info now; refer to the comment in code below

750 * where we copy this information before calling _hash_splitbucket to

751 * see why this is okay.

752 */

753 maxbucket = metap->hashm_maxbucket;

754 highmask = metap->hashm_highmask;

755 lowmask = metap->hashm_lowmask;

756

757 /* Release the metapage lock. */

758 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

759

760 hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,

761 maxbucket, highmask, lowmask, NULL, NULL, true,

762 NULL, NULL);

763

764 _hash_dropbuf(rel, buf_oblkno);

765

766 goto restart_expand;

767 }

768

769 /*

770 * There shouldn't be any active scan on new bucket.

771 *

772 * Note: it is safe to compute the new bucket's blkno here, even though we

773 * may still need to update the BUCKET_TO_BLKNO mapping. This is because

774 * the current value of hashm_spares[hashm_ovflpoint] correctly shows

775 * where we are going to put a new splitpoint's worth of buckets.

776 */

777 start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

778

779 /*

780 * If the split point is increasing we need to allocate a new batch of

781 * bucket pages.

782 */

783 spare_ndx = _hash_spareindex(new_bucket + 1);

784 if (spare_ndx > metap->hashm_ovflpoint)

785 {

786 uint32 buckets_to_add;

787

788 Assert(spare_ndx == metap->hashm_ovflpoint + 1);

789

790 /*

791 * We treat allocation of buckets as a separate WAL-logged action.

792 * Even if we fail after this operation, won't leak bucket pages;

793 * rather, the next split will consume this space. In any case, even

794 * without failure we don't use all the space in one split operation.

795 */

796 buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;

797 if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))

798 {

799 /* can't split due to BlockNumber overflow */

800 _hash_relbuf(rel, buf_oblkno);

801 goto fail;

802 }

803 }

804

805 /*

806 * Physically allocate the new bucket's primary page. We want to do this

807 * before changing the metapage's mapping info, in case we can't get the

808 * disk space.

809 *

810 * XXX It doesn't make sense to call _hash_getnewbuf first, zeroing the

811 * buffer, and then only afterwards check whether we have a cleanup lock.

812 * However, since no scan can be accessing the buffer yet, any concurrent

813 * accesses will just be from processes like the bgwriter or checkpointer

814 * which don't care about its contents, so it doesn't really matter.

815 */

816 buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);

817 if (!IsBufferCleanupOK(buf_nblkno))

818 {

819 _hash_relbuf(rel, buf_oblkno);

820 _hash_relbuf(rel, buf_nblkno);

821 goto fail;

822 }

823

824 /*

825 * Since we are scribbling on the pages in the shared buffers, establish a

826 * critical section. Any failure in this next code leaves us with a big

827 * problem: the metapage is effectively corrupt but could get written back

828 * to disk.

829 */

830 START_CRIT_SECTION();

831

832 /*

833 * Okay to proceed with split. Update the metapage bucket mapping info.

834 */

835 metap->hashm_maxbucket = new_bucket;

836

837 if (new_bucket > metap->hashm_highmask)

838 {

839 /* Starting a new doubling */

840 metap->hashm_lowmask = metap->hashm_highmask;

841 metap->hashm_highmask = new_bucket | metap->hashm_lowmask;

842 metap_update_masks = true;

843 }

844

845 /*

846 * If the split point is increasing we need to adjust the hashm_spares[]

847 * array and hashm_ovflpoint so that future overflow pages will be created

848 * beyond this new batch of bucket pages.

849 */

850 if (spare_ndx > metap->hashm_ovflpoint)

851 {

852 metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];

853 metap->hashm_ovflpoint = spare_ndx;

854 metap_update_splitpoint = true;

855 }

856

857 MarkBufferDirty(metabuf);

858

859 /*

860 * Copy bucket mapping info now; this saves re-accessing the meta page

861 * inside _hash_splitbucket's inner loop. Note that once we drop the

862 * split lock, other splits could begin, so these values might be out of

863 * date before _hash_splitbucket finishes. That's okay, since all it

864 * needs is to tell which of these two buckets to map hashkeys into.

865 */

866 maxbucket = metap->hashm_maxbucket;

867 highmask = metap->hashm_highmask;

868 lowmask = metap->hashm_lowmask;

869

870 opage = BufferGetPage(buf_oblkno);

871 oopaque = HashPageGetOpaque(opage);

872

873 /*

874 * Mark the old bucket to indicate that split is in progress. (At

875 * operation end, we will clear the split-in-progress flag.) Also, for a

876 * primary bucket page, hasho_prevblkno stores the number of buckets that

877 * existed as of the last split, so we must update that value here.

878 */

879 oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;

880 oopaque->hasho_prevblkno = maxbucket;

881

882 MarkBufferDirty(buf_oblkno);

883

884 npage = BufferGetPage(buf_nblkno);

885

886 /*

887 * initialize the new bucket's primary page and mark it to indicate that

888 * split is in progress.

889 */

890 nopaque = HashPageGetOpaque(npage);

891 nopaque->hasho_prevblkno = maxbucket;

892 nopaque->hasho_nextblkno = InvalidBlockNumber;

893 nopaque->hasho_bucket = new_bucket;

894 nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;

895 nopaque->hasho_page_id = HASHO_PAGE_ID;

896

897 MarkBufferDirty(buf_nblkno);

898

899 /* XLOG stuff */

900 if (RelationNeedsWAL(rel))

901 {

902 xl_hash_split_allocate_page xlrec;

903 XLogRecPtr recptr;

904

905 xlrec.new_bucket = maxbucket;

906 xlrec.old_bucket_flag = oopaque->hasho_flag;

907 xlrec.new_bucket_flag = nopaque->hasho_flag;

908 xlrec.flags = 0;

909

910 XLogBeginInsert();

911

912 XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);

913 XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);

914 XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);

915

916 if (metap_update_masks)

917 {

918 xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;

919 XLogRegisterBufData(2, &metap->hashm_lowmask, sizeof(uint32));

920 XLogRegisterBufData(2, &metap->hashm_highmask, sizeof(uint32));

921 }

922

923 if (metap_update_splitpoint)

924 {

925 xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;

926 XLogRegisterBufData(2, &metap->hashm_ovflpoint,

927 sizeof(uint32));

928 XLogRegisterBufData(2,

929 &metap->hashm_spares[metap->hashm_ovflpoint],

930 sizeof(uint32));

931 }

932

933 XLogRegisterData(&xlrec, SizeOfHashSplitAllocPage);

934

935 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);

936

937 PageSetLSN(BufferGetPage(buf_oblkno), recptr);

938 PageSetLSN(BufferGetPage(buf_nblkno), recptr);

939 PageSetLSN(BufferGetPage(metabuf), recptr);

940 }

941

942 END_CRIT_SECTION();

943

944 /* drop lock, but keep pin */

945 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

946

947 /* Relocate records to the new bucket */

948 _hash_splitbucket(rel, metabuf,

949 old_bucket, new_bucket,

950 buf_oblkno, buf_nblkno, NULL,

951 maxbucket, highmask, lowmask);

952

953 /* all done, now release the pins on primary buckets. */

954 _hash_dropbuf(rel, buf_oblkno);

955 _hash_dropbuf(rel, buf_nblkno);

956

957 return;

958

959 /* Here if decide not to split or fail to acquire old bucket lock */

960fail:

961

962 /* We didn't write the metapage, so just drop lock */

963 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

964}

965

966

967/*

968 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages

969 *

970 * This does not need to initialize the new bucket pages; we'll do that as

971 * each one is used by _hash_expandtable(). But we have to extend the logical

972 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in

973 * sync with ours, so that we don't get complaints from smgr.

974 *

975 * We do this by writing a page of zeroes at the end of the splitpoint range.

976 * We expect that the filesystem will ensure that the intervening pages read

977 * as zeroes too. On many filesystems this "hole" will not be allocated

978 * immediately, which means that the index file may end up more fragmented

979 * than if we forced it all to be allocated now; but since we don't scan

980 * hash indexes sequentially anyway, that probably doesn't matter.

981 *

982 * XXX It's annoying that this code is executed with the metapage lock held.

983 * We need to interlock against _hash_addovflpage() adding a new overflow page

984 * concurrently, but it'd likely be better to use LockRelationForExtension

985 * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,

986 * so it may not be worth worrying about.

987 *

988 * Returns true if successful, or false if allocation failed due to

989 * BlockNumber overflow.

990 */

991static bool

992 _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)

993{

994 BlockNumber lastblock;

995 PGIOAlignedBlock zerobuf;

996 Page page;

997 HashPageOpaque ovflopaque;

998

999 lastblock = firstblock + nblocks - 1;

1000

1001 /*

1002 * Check for overflow in block number calculation; if so, we cannot extend

1003 * the index anymore.

1004 */

1005 if (lastblock < firstblock || lastblock == InvalidBlockNumber)

1006 return false;

1007

1008 page = (Page) zerobuf.data;

1009

1010 /*

1011 * Initialize the page. Just zeroing the page won't work; see

1012 * _hash_freeovflpage for similar usage. We take care to make the special

1013 * space valid for the benefit of tools such as pageinspect.

1014 */

1015 _hash_pageinit(page, BLCKSZ);

1016

1017 ovflopaque = HashPageGetOpaque(page);

1018

1019 ovflopaque->hasho_prevblkno = InvalidBlockNumber;

1020 ovflopaque->hasho_nextblkno = InvalidBlockNumber;

1021 ovflopaque->hasho_bucket = InvalidBucket;

1022 ovflopaque->hasho_flag = LH_UNUSED_PAGE;

1023 ovflopaque->hasho_page_id = HASHO_PAGE_ID;

1024

1025 if (RelationNeedsWAL(rel))

1026 log_newpage(&rel->rd_locator,

1027 MAIN_FORKNUM,

1028 lastblock,

1029 zerobuf.data,

1030 true);

1031

1032 PageSetChecksumInplace(page, lastblock);

1033 smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data,

1034 false);

1035

1036 return true;

1037}

1038

1039

1040/*

1041 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'

1042 *

1043 * This routine is used to partition the tuples between old and new bucket and

1044 * is used to finish the incomplete split operations. To finish the previously

1045 * interrupted split operation, the caller needs to fill htab. If htab is set,

1046 * then we skip the movement of tuples that exists in htab, otherwise NULL

1047 * value of htab indicates movement of all the tuples that belong to the new

1048 * bucket.

1049 *

1050 * We are splitting a bucket that consists of a base bucket page and zero

1051 * or more overflow (bucket chain) pages. We must relocate tuples that

1052 * belong in the new bucket.

1053 *

1054 * The caller must hold cleanup locks on both buckets to ensure that

1055 * no one else is trying to access them (see README).

1056 *

1057 * The caller must hold a pin, but no lock, on the metapage buffer.

1058 * The buffer is returned in the same state. (The metapage is only

1059 * touched if it becomes necessary to add or remove overflow pages.)

1060 *

1061 * Split needs to retain pin on primary bucket pages of both old and new

1062 * buckets till end of operation. This is to prevent vacuum from starting

1063 * while a split is in progress.

1064 *

1065 * In addition, the caller must have created the new bucket's base page,

1066 * which is passed in buffer nbuf, pinned and write-locked. The lock will be

1067 * released here and pin must be released by the caller. (The API is set up

1068 * this way because we must do _hash_getnewbuf() before releasing the metapage

1069 * write lock. So instead of passing the new bucket's start block number, we

1070 * pass an actual buffer.)

1071 */

1072static void

1073 _hash_splitbucket(Relation rel,

1074 Buffer metabuf,

1075 Bucket obucket,

1076 Bucket nbucket,

1077 Buffer obuf,

1078 Buffer nbuf,

1079 HTAB *htab,

1080 uint32 maxbucket,

1081 uint32 highmask,

1082 uint32 lowmask)

1083{

1084 Buffer bucket_obuf;

1085 Buffer bucket_nbuf;

1086 Page opage;

1087 Page npage;

1088 HashPageOpaque oopaque;

1089 HashPageOpaque nopaque;

1090 OffsetNumber itup_offsets[MaxIndexTuplesPerPage];

1091 IndexTuple itups[MaxIndexTuplesPerPage];

1092 Size all_tups_size = 0;

1093 int i;

1094 uint16 nitups = 0;

1095

1096 bucket_obuf = obuf;

1097 opage = BufferGetPage(obuf);

1098 oopaque = HashPageGetOpaque(opage);

1099

1100 bucket_nbuf = nbuf;

1101 npage = BufferGetPage(nbuf);

1102 nopaque = HashPageGetOpaque(npage);

1103

1104 /* Copy the predicate locks from old bucket to new bucket. */

1105 PredicateLockPageSplit(rel,

1106 BufferGetBlockNumber(bucket_obuf),

1107 BufferGetBlockNumber(bucket_nbuf));

1108

1109 /*

1110 * Partition the tuples in the old bucket between the old bucket and the

1111 * new bucket, advancing along the old bucket's overflow bucket chain and

1112 * adding overflow pages to the new bucket as needed. Outer loop iterates

1113 * once per page in old bucket.

1114 */

1115 for (;;)

1116 {

1117 BlockNumber oblkno;

1118 OffsetNumber ooffnum;

1119 OffsetNumber omaxoffnum;

1120

1121 /* Scan each tuple in old page */

1122 omaxoffnum = PageGetMaxOffsetNumber(opage);

1123 for (ooffnum = FirstOffsetNumber;

1124 ooffnum <= omaxoffnum;

1125 ooffnum = OffsetNumberNext(ooffnum))

1126 {

1127 IndexTuple itup;

1128 Size itemsz;

1129 Bucket bucket;

1130 bool found = false;

1131

1132 /* skip dead tuples */

1133 if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))

1134 continue;

1135

1136 /*

1137 * Before inserting a tuple, probe the hash table containing TIDs

1138 * of tuples belonging to new bucket, if we find a match, then

1139 * skip that tuple, else fetch the item's hash key (conveniently

1140 * stored in the item) and determine which bucket it now belongs

1141 * in.

1142 */

1143 itup = (IndexTuple) PageGetItem(opage,

1144 PageGetItemId(opage, ooffnum));

1145

1146 if (htab)

1147 (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);

1148

1149 if (found)

1150 continue;

1151

1152 bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),

1153 maxbucket, highmask, lowmask);

1154

1155 if (bucket == nbucket)

1156 {

1157 IndexTuple new_itup;

1158

1159 /*

1160 * make a copy of index tuple as we have to scribble on it.

1161 */

1162 new_itup = CopyIndexTuple(itup);

1163

1164 /*

1165 * mark the index tuple as moved by split, such tuples are

1166 * skipped by scan if there is split in progress for a bucket.

1167 */

1168 new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;

1169

1170 /*

1171 * insert the tuple into the new bucket. if it doesn't fit on

1172 * the current page in the new bucket, we must allocate a new

1173 * overflow page and place the tuple on that page instead.

1174 */

1175 itemsz = IndexTupleSize(new_itup);

1176 itemsz = MAXALIGN(itemsz);

1177

1178 if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))

1179 {

1180 /*

1181 * Change the shared buffer state in critical section,

1182 * otherwise any error could make it unrecoverable.

1183 */

1184 START_CRIT_SECTION();

1185

1186 _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);

1187 MarkBufferDirty(nbuf);

1188 /* log the split operation before releasing the lock */

1189 log_split_page(rel, nbuf);

1190

1191 END_CRIT_SECTION();

1192

1193 /* drop lock, but keep pin */

1194 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);

1195

1196 /* be tidy */

1197 for (i = 0; i < nitups; i++)

1198 pfree(itups[i]);

1199 nitups = 0;

1200 all_tups_size = 0;

1201

1202 /* chain to a new overflow page */

1203 nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf));

1204 npage = BufferGetPage(nbuf);

1205 nopaque = HashPageGetOpaque(npage);

1206 }

1207

1208 itups[nitups++] = new_itup;

1209 all_tups_size += itemsz;

1210 }

1211 else

1212 {

1213 /*

1214 * the tuple stays on this page, so nothing to do.

1215 */

1216 Assert(bucket == obucket);

1217 }

1218 }

1219

1220 oblkno = oopaque->hasho_nextblkno;

1221

1222 /* retain the pin on the old primary bucket */

1223 if (obuf == bucket_obuf)

1224 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);

1225 else

1226 _hash_relbuf(rel, obuf);

1227

1228 /* Exit loop if no more overflow pages in old bucket */

1229 if (!BlockNumberIsValid(oblkno))

1230 {

1231 /*

1232 * Change the shared buffer state in critical section, otherwise

1233 * any error could make it unrecoverable.

1234 */

1235 START_CRIT_SECTION();

1236

1237 _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);

1238 MarkBufferDirty(nbuf);

1239 /* log the split operation before releasing the lock */

1240 log_split_page(rel, nbuf);

1241

1242 END_CRIT_SECTION();

1243

1244 if (nbuf == bucket_nbuf)

1245 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);

1246 else

1247 _hash_relbuf(rel, nbuf);

1248

1249 /* be tidy */

1250 for (i = 0; i < nitups; i++)

1251 pfree(itups[i]);

1252 break;

1253 }

1254

1255 /* Else, advance to next old page */

1256 obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);

1257 opage = BufferGetPage(obuf);

1258 oopaque = HashPageGetOpaque(opage);

1259 }

1260

1261 /*

1262 * We're at the end of the old bucket chain, so we're done partitioning

1263 * the tuples. Mark the old and new buckets to indicate split is

1264 * finished.

1265 *

1266 * To avoid deadlocks due to locking order of buckets, first lock the old

1267 * bucket and then the new bucket.

1268 */

1269 LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);

1270 opage = BufferGetPage(bucket_obuf);

1271 oopaque = HashPageGetOpaque(opage);

1272

1273 LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);

1274 npage = BufferGetPage(bucket_nbuf);

1275 nopaque = HashPageGetOpaque(npage);

1276

1277 START_CRIT_SECTION();

1278

1279 oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;

1280 nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;

1281

1282 /*

1283 * After the split is finished, mark the old bucket to indicate that it

1284 * contains deletable tuples. We will clear split-cleanup flag after

1285 * deleting such tuples either at the end of split or at the next split

1286 * from old bucket or at the time of vacuum.

1287 */

1288 oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;

1289

1290 /*

1291 * now write the buffers, here we don't release the locks as caller is

1292 * responsible to release locks.

1293 */

1294 MarkBufferDirty(bucket_obuf);

1295 MarkBufferDirty(bucket_nbuf);

1296

1297 if (RelationNeedsWAL(rel))

1298 {

1299 XLogRecPtr recptr;

1300 xl_hash_split_complete xlrec;

1301

1302 xlrec.old_bucket_flag = oopaque->hasho_flag;

1303 xlrec.new_bucket_flag = nopaque->hasho_flag;

1304

1305 XLogBeginInsert();

1306

1307 XLogRegisterData(&xlrec, SizeOfHashSplitComplete);

1308

1309 XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);

1310 XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);

1311

1312 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);

1313

1314 PageSetLSN(BufferGetPage(bucket_obuf), recptr);

1315 PageSetLSN(BufferGetPage(bucket_nbuf), recptr);

1316 }

1317

1318 END_CRIT_SECTION();

1319

1320 /*

1321 * If possible, clean up the old bucket. We might not be able to do this

1322 * if someone else has a pin on it, but if not then we can go ahead. This

1323 * isn't absolutely necessary, but it reduces bloat; if we don't do it

1324 * now, VACUUM will do it eventually, but maybe not until new overflow

1325 * pages have been allocated. Note that there's no need to clean up the

1326 * new bucket.

1327 */

1328 if (IsBufferCleanupOK(bucket_obuf))

1329 {

1330 LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);

1331 hashbucketcleanup(rel, obucket, bucket_obuf,

1332 BufferGetBlockNumber(bucket_obuf), NULL,

1333 maxbucket, highmask, lowmask, NULL, NULL, true,

1334 NULL, NULL);

1335 }

1336 else

1337 {

1338 LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);

1339 LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);

1340 }

1341}

1342

1343/*

1344 * _hash_finish_split() -- Finish the previously interrupted split operation

1345 *

1346 * To complete the split operation, we form the hash table of TIDs in new

1347 * bucket which is then used by split operation to skip tuples that are

1348 * already moved before the split operation was previously interrupted.

1349 *

1350 * The caller must hold a pin, but no lock, on the metapage and old bucket's

1351 * primary page buffer. The buffers are returned in the same state. (The

1352 * metapage is only touched if it becomes necessary to add or remove overflow

1353 * pages.)

1354 */

1355void

1356 _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,

1357 uint32 maxbucket, uint32 highmask, uint32 lowmask)

1358{

1359 HASHCTL hash_ctl;

1360 HTAB *tidhtab;

1361 Buffer bucket_nbuf = InvalidBuffer;

1362 Buffer nbuf;

1363 Page npage;

1364 BlockNumber nblkno;

1365 BlockNumber bucket_nblkno;

1366 HashPageOpaque npageopaque;

1367 Bucket nbucket;

1368 bool found;

1369

1370 /* Initialize hash tables used to track TIDs */

1371 hash_ctl.keysize = sizeof(ItemPointerData);

1372 hash_ctl.entrysize = sizeof(ItemPointerData);

1373 hash_ctl.hcxt = CurrentMemoryContext;

1374

1375 tidhtab =

1376 hash_create("bucket ctids",

1377 256, /* arbitrary initial size */

1378 &hash_ctl,

1379 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

1380

1381 bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);

1382

1383 /*

1384 * Scan the new bucket and build hash table of TIDs

1385 */

1386 for (;;)

1387 {

1388 OffsetNumber noffnum;

1389 OffsetNumber nmaxoffnum;

1390

1391 nbuf = _hash_getbuf(rel, nblkno, HASH_READ,

1392 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

1393

1394 /* remember the primary bucket buffer to acquire cleanup lock on it. */

1395 if (nblkno == bucket_nblkno)

1396 bucket_nbuf = nbuf;

1397

1398 npage = BufferGetPage(nbuf);

1399 npageopaque = HashPageGetOpaque(npage);

1400

1401 /* Scan each tuple in new page */

1402 nmaxoffnum = PageGetMaxOffsetNumber(npage);

1403 for (noffnum = FirstOffsetNumber;

1404 noffnum <= nmaxoffnum;

1405 noffnum = OffsetNumberNext(noffnum))

1406 {

1407 IndexTuple itup;

1408

1409 /* Fetch the item's TID and insert it in hash table. */

1410 itup = (IndexTuple) PageGetItem(npage,

1411 PageGetItemId(npage, noffnum));

1412

1413 (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);

1414

1415 Assert(!found);

1416 }

1417

1418 nblkno = npageopaque->hasho_nextblkno;

1419

1420 /*

1421 * release our write lock without modifying buffer and ensure to

1422 * retain the pin on primary bucket.

1423 */

1424 if (nbuf == bucket_nbuf)

1425 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);

1426 else

1427 _hash_relbuf(rel, nbuf);

1428

1429 /* Exit loop if no more overflow pages in new bucket */

1430 if (!BlockNumberIsValid(nblkno))

1431 break;

1432 }

1433

1434 /*

1435 * Conditionally get the cleanup lock on old and new buckets to perform

1436 * the split operation. If we don't get the cleanup locks, silently give

1437 * up and next insertion on old bucket will try again to complete the

1438 * split.

1439 */

1440 if (!ConditionalLockBufferForCleanup(obuf))

1441 {

1442 hash_destroy(tidhtab);

1443 return;

1444 }

1445 if (!ConditionalLockBufferForCleanup(bucket_nbuf))

1446 {

1447 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);

1448 hash_destroy(tidhtab);

1449 return;

1450 }

1451

1452 npage = BufferGetPage(bucket_nbuf);

1453 npageopaque = HashPageGetOpaque(npage);

1454 nbucket = npageopaque->hasho_bucket;

1455

1456 _hash_splitbucket(rel, metabuf, obucket,

1457 nbucket, obuf, bucket_nbuf, tidhtab,

1458 maxbucket, highmask, lowmask);

1459

1460 _hash_dropbuf(rel, bucket_nbuf);

1461 hash_destroy(tidhtab);

1462}

1463

1464/*

1465 * log_split_page() -- Log the split operation

1466 *

1467 * We log the split operation when the new page in new bucket gets full,

1468 * so we log the entire page.

1469 *

1470 * 'buf' must be locked by the caller which is also responsible for unlocking

1471 * it.

1472 */

1473static void

1474 log_split_page(Relation rel, Buffer buf)

1475{

1476 if (RelationNeedsWAL(rel))

1477 {

1478 XLogRecPtr recptr;

1479

1480 XLogBeginInsert();

1481

1482 XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);

1483

1484 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);

1485

1486 PageSetLSN(BufferGetPage(buf), recptr);

1487 }

1488}

1489

1490/*

1491 * _hash_getcachedmetap() -- Returns cached metapage data.

1492 *

1493 * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on

1494 * the metapage. If not set, we'll set it before returning if we have to

1495 * refresh the cache, and return with a pin but no lock on it; caller is

1496 * responsible for releasing the pin.

1497 *

1498 * We refresh the cache if it's not initialized yet or force_refresh is true.

1499 */

1500HashMetaPage

1501 _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)

1502{

1503 Page page;

1504

1505 Assert(metabuf);

1506 if (force_refresh || rel->rd_amcache == NULL)

1507 {

1508 char *cache = NULL;

1509

1510 /*

1511 * It's important that we don't set rd_amcache to an invalid value.

1512 * Either MemoryContextAlloc or _hash_getbuf could fail, so don't

1513 * install a pointer to the newly-allocated storage in the actual

1514 * relcache entry until both have succeeded.

1515 */

1516 if (rel->rd_amcache == NULL)

1517 cache = MemoryContextAlloc(rel->rd_indexcxt,

1518 sizeof(HashMetaPageData));

1519

1520 /* Read the metapage. */

1521 if (BufferIsValid(*metabuf))

1522 LockBuffer(*metabuf, BUFFER_LOCK_SHARE);

1523 else

1524 *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,

1525 LH_META_PAGE);

1526 page = BufferGetPage(*metabuf);

1527

1528 /* Populate the cache. */

1529 if (rel->rd_amcache == NULL)

1530 rel->rd_amcache = cache;

1531 memcpy(rel->rd_amcache, HashPageGetMeta(page),

1532 sizeof(HashMetaPageData));

1533

1534 /* Release metapage lock, but keep the pin. */

1535 LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);

1536 }

1537

1538 return (HashMetaPage) rel->rd_amcache;

1539}

1540

1541/*

1542 * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given

1543 * hashkey.

1544 *

1545 * Bucket pages do not move or get removed once they are allocated. This give

1546 * us an opportunity to use the previously saved metapage contents to reach

1547 * the target bucket buffer, instead of reading from the metapage every time.

1548 * This saves one buffer access every time we want to reach the target bucket

1549 * buffer, which is very helpful savings in bufmgr traffic and contention.

1550 *

1551 * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the

1552 * bucket buffer has to be locked for reading or writing.

1553 *

1554 * The out parameter cachedmetap is set with metapage contents used for

1555 * hashkey to bucket buffer mapping. Some callers need this info to reach the

1556 * old bucket in case of bucket split, see _hash_doinsert().

1557 */

1558Buffer

1559 _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,

1560 HashMetaPage *cachedmetap)

1561{

1562 HashMetaPage metap;

1563 Buffer buf;

1564 Buffer metabuf = InvalidBuffer;

1565 Page page;

1566 Bucket bucket;

1567 BlockNumber blkno;

1568 HashPageOpaque opaque;

1569

1570 /* We read from target bucket buffer, hence locking is must. */

1571 Assert(access == HASH_READ || access == HASH_WRITE);

1572

1573 metap = _hash_getcachedmetap(rel, &metabuf, false);

1574 Assert(metap != NULL);

1575

1576 /*

1577 * Loop until we get a lock on the correct target bucket.

1578 */

1579 for (;;)

1580 {

1581 /*

1582 * Compute the target bucket number, and convert to block number.

1583 */

1584 bucket = _hash_hashkey2bucket(hashkey,

1585 metap->hashm_maxbucket,

1586 metap->hashm_highmask,

1587 metap->hashm_lowmask);

1588

1589 blkno = BUCKET_TO_BLKNO(metap, bucket);

1590

1591 /* Fetch the primary bucket page for the bucket */

1592 buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);

1593 page = BufferGetPage(buf);

1594 opaque = HashPageGetOpaque(page);

1595 Assert(opaque->hasho_bucket == bucket);

1596 Assert(opaque->hasho_prevblkno != InvalidBlockNumber);

1597

1598 /*

1599 * If this bucket hasn't been split, we're done.

1600 */

1601 if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)

1602 break;

1603

1604 /* Drop lock on this buffer, update cached metapage, and retry. */

1605 _hash_relbuf(rel, buf);

1606 metap = _hash_getcachedmetap(rel, &metabuf, true);

1607 Assert(metap != NULL);

1608 }

1609

1610 if (BufferIsValid(metabuf))

1611 _hash_dropbuf(rel, metabuf);

1612

1613 if (cachedmetap)

1614 *cachedmetap = metap;

1615

1616 return buf;

1617}

BlockNumber

uint32 BlockNumber

Definition: block.h:31

InvalidBlockNumber

#define InvalidBlockNumber

Definition: block.h:33

BlockNumberIsValid

static bool BlockNumberIsValid(BlockNumber blockNumber)

Definition: block.h:71

Buffer

int Buffer

Definition: buf.h:23

InvalidBuffer

#define InvalidBuffer

Definition: buf.h:25

BufferGetBlockNumber

BlockNumber BufferGetBlockNumber(Buffer buffer)

Definition: bufmgr.c:4198

IsBufferCleanupOK

bool IsBufferCleanupOK(Buffer buffer)

Definition: bufmgr.c:5878

ExtendBufferedRel

Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)

Definition: bufmgr.c:858

RelationGetNumberOfBlocksInFork

BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)

Definition: bufmgr.c:4398

ReleaseBuffer

void ReleaseBuffer(Buffer buffer)

Definition: bufmgr.c:5338

UnlockReleaseBuffer

void UnlockReleaseBuffer(Buffer buffer)

Definition: bufmgr.c:5355

MarkBufferDirty

void MarkBufferDirty(Buffer buffer)

Definition: bufmgr.c:2921

LockBuffer

void LockBuffer(Buffer buffer, int mode)

Definition: bufmgr.c:5572

ReadBufferExtended

Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)

Definition: bufmgr.c:805

ReadBuffer

Buffer ReadBuffer(Relation reln, BlockNumber blockNum)

Definition: bufmgr.c:758

ConditionalLockBufferForCleanup

bool ConditionalLockBufferForCleanup(Buffer buffer)

Definition: bufmgr.c:5820

BUFFER_LOCK_UNLOCK

#define BUFFER_LOCK_UNLOCK

Definition: bufmgr.h:196

BUFFER_LOCK_SHARE

#define BUFFER_LOCK_SHARE

Definition: bufmgr.h:197

P_NEW

#define P_NEW

Definition: bufmgr.h:191

BufferGetPage

static Page BufferGetPage(Buffer buffer)

Definition: bufmgr.h:417

BufferGetPageSize

static Size BufferGetPageSize(Buffer buffer)

Definition: bufmgr.h:406

EB_SKIP_EXTENSION_LOCK

@ EB_SKIP_EXTENSION_LOCK

Definition: bufmgr.h:75

EB_LOCK_FIRST

@ EB_LOCK_FIRST

Definition: bufmgr.h:87

BUFFER_LOCK_EXCLUSIVE

#define BUFFER_LOCK_EXCLUSIVE

Definition: bufmgr.h:198

RBM_ZERO_AND_LOCK

@ RBM_ZERO_AND_LOCK

Definition: bufmgr.h:47

RBM_NORMAL

@ RBM_NORMAL

Definition: bufmgr.h:46

BMR_REL

#define BMR_REL(p_rel)

Definition: bufmgr.h:111

BufferIsValid

static bool BufferIsValid(Buffer bufnum)

Definition: bufmgr.h:368

PageGetFreeSpaceForMultipleTuples

Size PageGetFreeSpaceForMultipleTuples(const PageData *page, int ntups)

Definition: bufpage.c:933

PageSetChecksumInplace

void PageSetChecksumInplace(Page page, BlockNumber blkno)

Definition: bufpage.c:1541

PageInit

void PageInit(Page page, Size pageSize, Size specialSize)

Definition: bufpage.c:42

PageHeader

PageHeaderData * PageHeader

Definition: bufpage.h:174

PageGetItem

static Item PageGetItem(const PageData *page, const ItemIdData *itemId)

Definition: bufpage.h:354

PageGetItemId

static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)

Definition: bufpage.h:244

PageSetLSN

static void PageSetLSN(Page page, XLogRecPtr lsn)

Definition: bufpage.h:391

Page

PageData * Page

Definition: bufpage.h:82

PageGetMaxOffsetNumber

static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)

Definition: bufpage.h:372

MAXALIGN

#define MAXALIGN(LEN)

Definition: c.h:810

RegProcedure

regproc RegProcedure

Definition: c.h:655

int32

int32_t int32

Definition: c.h:534

uint16

uint16_t uint16

Definition: c.h:537

uint32

uint32_t uint32

Definition: c.h:538

MemSet

#define MemSet(start, val, len)

Definition: c.h:1019

Size

size_t Size

Definition: c.h:610

hash_search

void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)

Definition: dynahash.c:952

hash_create

HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)

Definition: dynahash.c:358

hash_destroy

void hash_destroy(HTAB *hashp)

Definition: dynahash.c:865

errcode

int errcode(int sqlerrcode)

Definition: elog.c:854

errmsg

int errmsg(const char *fmt,...)

Definition: elog.c:1071

ERROR

#define ERROR

Definition: elog.h:39

elog

#define elog(elevel,...)

Definition: elog.h:226

ereport

#define ereport(elevel,...)

Definition: elog.h:150

hashbucketcleanup

void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state)

Definition: hash.c:690

hash.h

HASH_NOLOCK

#define HASH_NOLOCK

Definition: hash.h:341

HashPageGetOpaque

#define HashPageGetOpaque(page)

Definition: hash.h:88

LH_BUCKET_PAGE

#define LH_BUCKET_PAGE

Definition: hash.h:55

HASH_MAX_BITMAPS

#define HASH_MAX_BITMAPS

Definition: hash.h:230

HASHSTANDARD_PROC

#define HASHSTANDARD_PROC

Definition: hash.h:355

BMPG_MASK

#define BMPG_MASK(metap)

Definition: hash.h:314

HASH_VERSION

#define HASH_VERSION

Definition: hash.h:201

HASH_MAX_SPLITPOINTS

#define HASH_MAX_SPLITPOINTS

Definition: hash.h:239

BYTE_TO_BIT

#define BYTE_TO_BIT

Definition: hash.h:301

HASH_WRITE

#define HASH_WRITE

Definition: hash.h:340

LH_UNUSED_PAGE

#define LH_UNUSED_PAGE

Definition: hash.h:53

H_BUCKET_BEING_SPLIT

#define H_BUCKET_BEING_SPLIT(opaque)

Definition: hash.h:91

LH_META_PAGE

#define LH_META_PAGE

Definition: hash.h:57

HASHO_PAGE_ID

#define HASHO_PAGE_ID

Definition: hash.h:101

HashPageGetMeta

#define HashPageGetMeta(page)

Definition: hash.h:323

LH_BUCKET_BEING_POPULATED

#define LH_BUCKET_BEING_POPULATED

Definition: hash.h:58

HASH_READ

#define HASH_READ

Definition: hash.h:339

BUCKET_TO_BLKNO

#define BUCKET_TO_BLKNO(metap, B)

Definition: hash.h:39

HashGetMaxBitmapSize

#define HashGetMaxBitmapSize(page)

Definition: hash.h:319

INDEX_MOVED_BY_SPLIT_MASK

#define INDEX_MOVED_BY_SPLIT_MASK

Definition: hash.h:293

HASH_METAPAGE

#define HASH_METAPAGE

Definition: hash.h:198

H_NEEDS_SPLIT_CLEANUP

#define H_NEEDS_SPLIT_CLEANUP(opaque)

Definition: hash.h:90

Bucket

uint32 Bucket

Definition: hash.h:35

LH_BUCKET_NEEDS_SPLIT_CLEANUP

#define LH_BUCKET_NEEDS_SPLIT_CLEANUP

Definition: hash.h:60

LH_BUCKET_BEING_SPLIT

#define LH_BUCKET_BEING_SPLIT

Definition: hash.h:59

HashGetTargetPageUsage

#define HashGetTargetPageUsage(relation)

Definition: hash.h:281

BMPG_SHIFT

#define BMPG_SHIFT(metap)

Definition: hash.h:313

HASH_MAGIC

#define HASH_MAGIC

Definition: hash.h:200

LH_OVERFLOW_PAGE

#define LH_OVERFLOW_PAGE

Definition: hash.h:54

InvalidBucket

#define InvalidBucket

Definition: hash.h:37

hash_xlog.h

SizeOfHashInitBitmapPage

#define SizeOfHashInitBitmapPage

Definition: hash_xlog.h:234

XLOG_HASH_INIT_BITMAP_PAGE

#define XLOG_HASH_INIT_BITMAP_PAGE

Definition: hash_xlog.h:28

SizeOfHashSplitComplete

#define SizeOfHashSplitComplete

Definition: hash_xlog.h:117

XLOG_HASH_SPLIT_ALLOCATE_PAGE

#define XLOG_HASH_SPLIT_ALLOCATE_PAGE

Definition: hash_xlog.h:31

XLOG_HASH_SPLIT_PAGE

#define XLOG_HASH_SPLIT_PAGE

Definition: hash_xlog.h:32

XLOG_HASH_INIT_META_PAGE

#define XLOG_HASH_INIT_META_PAGE

Definition: hash_xlog.h:27

XLOG_HASH_SPLIT_COMPLETE

#define XLOG_HASH_SPLIT_COMPLETE

Definition: hash_xlog.h:33

SizeOfHashSplitAllocPage

#define SizeOfHashSplitAllocPage

Definition: hash_xlog.h:100

SizeOfHashInitMetaPage

#define SizeOfHashInitMetaPage

Definition: hash_xlog.h:218

XLH_SPLIT_META_UPDATE_SPLITPOINT

#define XLH_SPLIT_META_UPDATE_SPLITPOINT

Definition: hash_xlog.h:46

XLH_SPLIT_META_UPDATE_MASKS

#define XLH_SPLIT_META_UPDATE_MASKS

Definition: hash_xlog.h:45

Assert

Assert(PointerIsAligned(start, uint64))

_hash_pgaddmultitup

void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, OffsetNumber *itup_offsets, uint16 nitups)

Definition: hashinsert.c:331

_hash_initbitmapbuffer

void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)

Definition: hashovfl.c:777

_hash_addovflpage

Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)

Definition: hashovfl.c:112

_hash_getinitbuf

Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno)

Definition: hashpage.c:135

_hash_getcachedmetap

HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)

Definition: hashpage.c:1501

_hash_initbuf

void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, bool initpage)

Definition: hashpage.c:157

_hash_relbuf

void _hash_relbuf(Relation rel, Buffer buf)

Definition: hashpage.c:266

_hash_getbuf_with_condlock_cleanup

Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)

Definition: hashpage.c:96

_hash_pageinit

void _hash_pageinit(Page page, Size size)

Definition: hashpage.c:596

_hash_splitbucket

static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, Buffer obuf, Buffer nbuf, HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask)

Definition: hashpage.c:1073

_hash_init

uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)

Definition: hashpage.c:327

_hash_dropbuf

void _hash_dropbuf(Relation rel, Buffer buf)

Definition: hashpage.c:277

_hash_dropscanbuf

void _hash_dropscanbuf(Relation rel, HashScanOpaque so)

Definition: hashpage.c:289

_hash_getbuf

Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)

Definition: hashpage.c:70

_hash_finish_split

void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, uint32 maxbucket, uint32 highmask, uint32 lowmask)

Definition: hashpage.c:1356

_hash_getbucketbuf_from_hashkey

Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access, HashMetaPage *cachedmetap)

Definition: hashpage.c:1559

_hash_init_metabuffer

void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, uint16 ffactor, bool initpage)

Definition: hashpage.c:498

log_split_page

static void log_split_page(Relation rel, Buffer buf)

Definition: hashpage.c:1474

_hash_alloc_buckets

static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)

Definition: hashpage.c:992

_hash_getbuf_with_strategy

Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy)

Definition: hashpage.c:239

_hash_getnewbuf

Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)

Definition: hashpage.c:198

_hash_expandtable

void _hash_expandtable(Relation rel, Buffer metabuf)

Definition: hashpage.c:614

_hash_spareindex

uint32 _hash_spareindex(uint32 num_bucket)

Definition: hashutil.c:142

_hash_get_newblock_from_oldbucket

BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)

Definition: hashutil.c:461

_hash_get_totalbuckets

uint32 _hash_get_totalbuckets(uint32 splitpoint_phase)

Definition: hashutil.c:174

_hash_get_indextuple_hashkey

uint32 _hash_get_indextuple_hashkey(IndexTuple itup)

Definition: hashutil.c:291

_hash_hashkey2bucket

Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)

Definition: hashutil.c:125

_hash_checkpage

void _hash_checkpage(Relation rel, Buffer buf, int flags)

Definition: hashutil.c:210

HASH_FIND

@ HASH_FIND

Definition: hsearch.h:113

HASH_ENTER

@ HASH_ENTER

Definition: hsearch.h:114

HASH_CONTEXT

#define HASH_CONTEXT

Definition: hsearch.h:102

HASH_ELEM

#define HASH_ELEM

Definition: hsearch.h:95

HASH_BLOBS

#define HASH_BLOBS

Definition: hsearch.h:97

index_getprocid

RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum)

Definition: indexam.c:883

CopyIndexTuple

IndexTuple CopyIndexTuple(IndexTuple source)

Definition: indextuple.c:547

i

int i

Definition: isn.c:77

ItemIdData

struct ItemIdData ItemIdData

ItemIdIsDead

#define ItemIdIsDead(itemId)

Definition: itemid.h:113

ItemPointerData

struct ItemPointerData ItemPointerData

IndexTuple

IndexTupleData * IndexTuple

Definition: itup.h:53

IndexTupleSize

static Size IndexTupleSize(const IndexTupleData *itup)

Definition: itup.h:71

MaxIndexTuplesPerPage

#define MaxIndexTuplesPerPage

Definition: itup.h:181

MemoryContextAlloc

void * MemoryContextAlloc(MemoryContext context, Size size)

Definition: mcxt.c:1229

pfree

void pfree(void *pointer)

Definition: mcxt.c:1594

CurrentMemoryContext

MemoryContext CurrentMemoryContext

Definition: mcxt.c:160

miscadmin.h

START_CRIT_SECTION

#define START_CRIT_SECTION()

Definition: miscadmin.h:149

CHECK_FOR_INTERRUPTS

#define CHECK_FOR_INTERRUPTS()

Definition: miscadmin.h:122

END_CRIT_SECTION

#define END_CRIT_SECTION()

Definition: miscadmin.h:151

OffsetNumberNext

#define OffsetNumberNext(offsetNumber)

Definition: off.h:52

OffsetNumber

uint16 OffsetNumber

Definition: off.h:24

FirstOffsetNumber

#define FirstOffsetNumber

Definition: off.h:27

pg_bitutils.h

pg_nextpower2_32

static uint32 pg_nextpower2_32(uint32 num)

Definition: pg_bitutils.h:189

pg_leftmost_one_pos32

static int pg_leftmost_one_pos32(uint32 word)

Definition: pg_bitutils.h:41

buf

static char * buf

Definition: pg_test_fsync.c:72

postgres.h

PredicateLockPageSplit

void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)

Definition: predicate.c:3144

predicate.h

access

short access

Definition: preproc-type.c:36

rel.h

RelationGetSmgr

static SMgrRelation RelationGetSmgr(Relation rel)

Definition: rel.h:576

RelationGetRelationName

#define RelationGetRelationName(relation)

Definition: rel.h:548

RelationNeedsWAL

#define RelationNeedsWAL(relation)

Definition: rel.h:637

ForkNumber

Definition: relpath.h:56

MAIN_FORKNUM

@ MAIN_FORKNUM

Definition: relpath.h:58

INIT_FORKNUM

@ INIT_FORKNUM

Definition: relpath.h:61

smgrextend

void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)

Definition: smgr.c:620

smgr.h

BufferAccessStrategyData

Definition: freelist.c:65

HASHCTL

Definition: hsearch.h:66

HASHCTL::keysize

Size keysize

Definition: hsearch.h:75

HASHCTL::entrysize

Size entrysize

Definition: hsearch.h:76

HASHCTL::hcxt

MemoryContext hcxt

Definition: hsearch.h:86

HTAB

Definition: dynahash.c:222

HashMetaPageData

Definition: hash.h:245

HashMetaPageData::hashm_version

uint32 hashm_version

Definition: hash.h:247

HashMetaPageData::hashm_mapp

BlockNumber hashm_mapp[HASH_MAX_BITMAPS]

Definition: hash.h:264

HashMetaPageData::hashm_lowmask

uint32 hashm_lowmask

Definition: hash.h:256

HashMetaPageData::hashm_maxbucket

uint32 hashm_maxbucket

Definition: hash.h:254

HashMetaPageData::hashm_procid

RegProcedure hashm_procid

Definition: hash.h:261

HashMetaPageData::hashm_spares

uint32 hashm_spares[HASH_MAX_SPLITPOINTS]

Definition: hash.h:262

HashMetaPageData::hashm_ntuples

double hashm_ntuples

Definition: hash.h:248

HashMetaPageData::hashm_firstfree

uint32 hashm_firstfree

Definition: hash.h:259

HashMetaPageData::hashm_bmsize

uint16 hashm_bmsize

Definition: hash.h:251

HashMetaPageData::hashm_bsize

uint16 hashm_bsize

Definition: hash.h:250

HashMetaPageData::hashm_ovflpoint

uint32 hashm_ovflpoint

Definition: hash.h:257

HashMetaPageData::hashm_highmask

uint32 hashm_highmask

Definition: hash.h:255

HashMetaPageData::hashm_magic

uint32 hashm_magic

Definition: hash.h:246

HashMetaPageData::hashm_bmshift

uint16 hashm_bmshift

Definition: hash.h:253

HashMetaPageData::hashm_nmaps

uint32 hashm_nmaps

Definition: hash.h:260

HashMetaPageData::hashm_ffactor

uint16 hashm_ffactor

Definition: hash.h:249

HashPageOpaqueData

Definition: hash.h:78

HashPageOpaqueData::hasho_nextblkno

BlockNumber hasho_nextblkno

Definition: hash.h:80

HashPageOpaqueData::hasho_flag

uint16 hasho_flag

Definition: hash.h:82

HashPageOpaqueData::hasho_prevblkno

BlockNumber hasho_prevblkno

Definition: hash.h:79

HashPageOpaqueData::hasho_page_id

uint16 hasho_page_id

Definition: hash.h:83

HashPageOpaqueData::hasho_bucket

Bucket hasho_bucket

Definition: hash.h:81

HashScanOpaqueData

Definition: hash.h:159

HashScanOpaqueData::hashso_buc_split

bool hashso_buc_split

Definition: hash.h:180

HashScanOpaqueData::currPos

HashScanPosData currPos

Definition: hash.h:189

HashScanOpaqueData::hashso_buc_populated

bool hashso_buc_populated

Definition: hash.h:174

HashScanOpaqueData::hashso_split_bucket_buf

Buffer hashso_split_bucket_buf

Definition: hash.h:171

HashScanOpaqueData::hashso_bucket_buf

Buffer hashso_bucket_buf

Definition: hash.h:164

HashScanPosData::buf

Buffer buf

Definition: hash.h:111

IndexTupleData

Definition: itup.h:36

IndexTupleData::t_tid

ItemPointerData t_tid

Definition: itup.h:37

IndexTupleData::t_info

unsigned short t_info

Definition: itup.h:49

RelationData

Definition: rel.h:56

RelationData::rd_amcache

void * rd_amcache

Definition: rel.h:229

RelationData::rd_indexcxt

MemoryContext rd_indexcxt

Definition: rel.h:204

RelationData::rd_locator

RelFileLocator rd_locator

Definition: rel.h:57

xl_hash_init_bitmap_page

Definition: hash_xlog.h:230

xl_hash_init_bitmap_page::bmsize

uint16 bmsize

Definition: hash_xlog.h:231

xl_hash_init_meta_page

Definition: hash_xlog.h:212

xl_hash_init_meta_page::procid

RegProcedure procid

Definition: hash_xlog.h:214

xl_hash_init_meta_page::num_tuples

double num_tuples

Definition: hash_xlog.h:213

xl_hash_init_meta_page::ffactor

uint16 ffactor

Definition: hash_xlog.h:215

xl_hash_split_allocate_page

Definition: hash_xlog.h:93

xl_hash_split_allocate_page::new_bucket_flag

uint16 new_bucket_flag

Definition: hash_xlog.h:96

xl_hash_split_allocate_page::flags

uint8 flags

Definition: hash_xlog.h:97

xl_hash_split_allocate_page::old_bucket_flag

uint16 old_bucket_flag

Definition: hash_xlog.h:95

xl_hash_split_allocate_page::new_bucket

uint32 new_bucket

Definition: hash_xlog.h:94

xl_hash_split_complete

Definition: hash_xlog.h:112

xl_hash_split_complete::new_bucket_flag

uint16 new_bucket_flag

Definition: hash_xlog.h:114

xl_hash_split_complete::old_bucket_flag

uint16 old_bucket_flag

Definition: hash_xlog.h:113

flag

char * flag(int b)

Definition: test-ctype.c:33

PGIOAlignedBlock

Definition: c.h:1132

PGIOAlignedBlock::data

char data[BLCKSZ]

Definition: c.h:1136

XLogRecPtr

uint64 XLogRecPtr

Definition: xlogdefs.h:21

XLogInsert

XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)

Definition: xloginsert.c:474

XLogRegisterBufData

void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)

Definition: xloginsert.c:405

XLogRegisterData

void XLogRegisterData(const void *data, uint32 len)

Definition: xloginsert.c:364

log_newpage

XLogRecPtr log_newpage(RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blkno, Page page, bool page_std)

Definition: xloginsert.c:1155

XLogRegisterBuffer

void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)

Definition: xloginsert.c:242

XLogBeginInsert

void XLogBeginInsert(void)

Definition: xloginsert.c:149

xloginsert.h

REGBUF_STANDARD

#define REGBUF_STANDARD

Definition: xloginsert.h:35

REGBUF_FORCE_IMAGE

#define REGBUF_FORCE_IMAGE

Definition: xloginsert.h:32

REGBUF_WILL_INIT

#define REGBUF_WILL_INIT

Definition: xloginsert.h:34

PostgreSQL Source Code: src/backend/access/hash/hashpage.c Source File