{
	bool		use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
	Buffer		buffer = InvalidBuffer;
	Page		page;
	Size		nearlyEmptyFreeSpace,
				pageFreeSpace = 0,
				saveFreeSpace = 0,
				targetFreeSpace = 0;
	BlockNumber targetBlock,
				otherBlock;
	bool		unlockedTargetBuffer;
	bool		recheckVmPins;

	len = MAXALIGN(len);		/* be conservative */

	/* if the caller doesn't know by how many pages to extend, extend by 1 */
	if (num_pages <= 0)
		num_pages = 1;

	/* Bulk insert is not supported for updates, only inserts. */
	Assert(otherBuffer == InvalidBuffer || !bistate);

	/*
	 * If we're going to fail for an oversize tuple, do it right away.
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %zu, maximum size %zu",
						len, MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
												   HEAP_DEFAULT_FILLFACTOR);

	/*
	 * Since pages without tuples can still have line pointers, we consider
	 * pages "empty" when the unavailable space is slight.  This threshold is
	 * somewhat arbitrary, but it should prevent most unnecessary relation
	 * extensions while inserting large tuples into low-fillfactor tables.
	 */
	nearlyEmptyFreeSpace = MaxHeapTupleSize -
		(MaxHeapTuplesPerPage / 8 * sizeof(ItemIdData));
	if (len + saveFreeSpace > nearlyEmptyFreeSpace)
		targetFreeSpace = Max(len, nearlyEmptyFreeSpace);
	else
		targetFreeSpace = len + saveFreeSpace;

	if (otherBuffer != InvalidBuffer)
		otherBlock = BufferGetBlockNumber(otherBuffer);
	else
		otherBlock = InvalidBlockNumber;	/* just to keep compiler quiet */

	/*
	 * We first try to put the tuple on the same page we last inserted a tuple
	 * on, as cached in the BulkInsertState or relcache entry.  If that
	 * doesn't work, we ask the Free Space Map to locate a suitable page.
	 * Since the FSM's info might be out of date, we have to be prepared to
	 * loop around and retry multiple times. (To ensure this isn't an infinite
	 * loop, we must update the FSM with the correct amount of free space on
	 * each page that proves not to be suitable.)  If the FSM has no record of
	 * a page with enough free space, we give up and extend the relation.
	 *
	 * When use_fsm is false, we either put the tuple onto the existing target
	 * page or extend the relation.
	 */
	if (bistate && bistate->current_buf != InvalidBuffer)
		targetBlock = BufferGetBlockNumber(bistate->current_buf);
	else
		targetBlock = RelationGetTargetBlock(relation);

	if (targetBlock == InvalidBlockNumber && use_fsm)
	{
		/*
		 * We have no cached target page, so ask the FSM for an initial
		 * target.
		 */
		targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace);
	}

	/*
	 * If the FSM knows nothing of the rel, try the last page before we give
	 * up and extend.  This avoids one-tuple-per-page syndrome during
	 * bootstrapping or in a recently-started system.
	 */
	if (targetBlock == InvalidBlockNumber)
	{
		BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

		if (nblocks > 0)
			targetBlock = nblocks - 1;
	}

loop:
	while (targetBlock != InvalidBlockNumber)
	{
		/*
		 * Read and exclusive-lock the target block, as well as the other
		 * block if one was given, taking suitable care with lock ordering and
		 * the possibility they are the same block.
		 *
		 * If the page-level all-visible flag is set, caller will need to
		 * clear both that and the corresponding visibility map bit.  However,
		 * by the time we return, we'll have x-locked the buffer, and we don't
		 * want to do any I/O while in that state.  So we check the bit here
		 * before taking the lock, and pin the page if it appears necessary.
		 * Checking without the lock creates a risk of getting the wrong
		 * answer, so we'll have to recheck after acquiring the lock.
		 */
		if (otherBuffer == InvalidBuffer)
		{
			/* easy case */
			buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);

			/*
			 * If the page is empty, pin vmbuffer to set all_frozen bit later.
			 */
			if ((options & HEAP_INSERT_FROZEN) &&
				(PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0))
				visibilitymap_pin(relation, targetBlock, vmbuffer);

			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock == targetBlock)
		{
			/* also easy case */
			buffer = otherBuffer;
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock < targetBlock)
		{
			/* lock other buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else
		{
			/* lock target buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}

		/*
		 * We now have the target page (and the other buffer, if any) pinned
		 * and locked.  However, since our initial PageIsAllVisible checks
		 * were performed before acquiring the lock, the results might now be
		 * out of date, either for the selected victim buffer, or for the
		 * other buffer passed by the caller.  In that case, we'll need to
		 * give up our locks, go get the pin(s) we failed to get earlier, and
		 * re-lock.  That's pretty painful, but hopefully shouldn't happen
		 * often.
		 *
		 * Note that there's a small possibility that we didn't pin the page
		 * above but still have the correct page pinned anyway, either because
		 * we've already made a previous pass through this loop, or because
		 * caller passed us the right page anyway.
		 *
		 * Note also that it's possible that by the time we get the pin and
		 * retake the buffer locks, the visibility map bit will have been
		 * cleared by some other backend anyway.  In that case, we'll have
		 * done a bit of extra work for no gain, but there's no real harm
		 * done.
		 */
		GetVisibilityMapPins(relation, buffer, otherBuffer,
							 targetBlock, otherBlock, vmbuffer,
							 vmbuffer_other);

		/*
		 * Now we can check to see if there's enough free space here. If so,
		 * we're done.
		 */
		page = BufferGetPage(buffer);

		/*
		 * If necessary initialize page, it'll be used soon.  We could avoid
		 * dirtying the buffer here, and rely on the caller to do so whenever
		 * it puts a tuple onto the page, but there seems not much benefit in
		 * doing so.
		 */
		if (PageIsNew(page))
		{
			PageInit(page, BufferGetPageSize(buffer), 0);
			MarkBufferDirty(buffer);
		}

		pageFreeSpace = PageGetHeapFreeSpace(page);
		if (targetFreeSpace <= pageFreeSpace)
		{
			/* use this page as future insert target, too */
			RelationSetTargetBlock(relation, targetBlock);
			return buffer;
		}

		/*
		 * Not enough space, so we must give up our page locks and pin (if
		 * any) and prepare to look elsewhere.  We don't care which order we
		 * unlock the two buffers in, so this can be slightly simpler than the
		 * code above.
		 */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (otherBuffer == InvalidBuffer)
			ReleaseBuffer(buffer);
		else if (otherBlock != targetBlock)
		{
			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
		}

		/* Is there an ongoing bulk extension? */
		if (bistate && bistate->next_free != InvalidBlockNumber)
		{
			Assert(bistate->next_free <= bistate->last_free);

			/*
			 * We bulk extended the relation before, and there are still some
			 * unused pages from that extension, so we don't need to look in
			 * the FSM for a new page. But do record the free space from the
			 * last page, somebody might insert narrower tuples later.
			 */
			if (use_fsm)
				RecordPageWithFreeSpace(relation, targetBlock, pageFreeSpace);

			targetBlock = bistate->next_free;
			if (bistate->next_free >= bistate->last_free)
			{
				bistate->next_free = InvalidBlockNumber;
				bistate->last_free = InvalidBlockNumber;
			}
			else
				bistate->next_free++;
		}
		else if (!use_fsm)
		{
			/* Without FSM, always fall out of the loop and extend */
			break;
		}
		else
		{
			/*
			 * Update FSM as to condition of this page, and ask for another
			 * page to try.
			 */
			targetBlock = RecordAndGetPageWithFreeSpace(relation,
														targetBlock,
														pageFreeSpace,
														targetFreeSpace);
		}
	}

	/* Have to extend the relation */
	buffer = RelationAddBlocks(relation, bistate, num_pages, use_fsm,
							   &unlockedTargetBuffer);

	targetBlock = BufferGetBlockNumber(buffer);
	page = BufferGetPage(buffer);

	/*
	 * The page is empty, pin vmbuffer to set all_frozen bit. We don't want to
	 * do IO while the buffer is locked, so we unlock the page first if IO is
	 * needed (necessitating checks below).
	 */
	if (options & HEAP_INSERT_FROZEN)
	{
		Assert(PageGetMaxOffsetNumber(page) == 0);

		if (!visibilitymap_pin_ok(targetBlock, *vmbuffer))
		{
			if (!unlockedTargetBuffer)
				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			unlockedTargetBuffer = true;
			visibilitymap_pin(relation, targetBlock, vmbuffer);
		}
	}

	/*
	 * Reacquire locks if necessary.
	 *
	 * If the target buffer was unlocked above, or is unlocked while
	 * reacquiring the lock on otherBuffer below, it's unlikely, but possible,
	 * that another backend used space on this page. We check for that below,
	 * and retry if necessary.
	 */
	recheckVmPins = false;
	if (unlockedTargetBuffer)
	{
		/* released lock on target buffer above */
		if (otherBuffer != InvalidBuffer)
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		recheckVmPins = true;
	}
	else if (otherBuffer != InvalidBuffer)
	{
		/*
		 * We did not release the target buffer, and otherBuffer is valid, so
		 * we need to lock the other buffer. It's guaranteed to be of a lower
		 * page number than the new page.  To conform with the deadlock
		 * prevention rules, we ought to lock otherBuffer first, but that
		 * would give other backends a chance to put tuples on our page. To
		 * reduce the likelihood of that, attempt to lock the other buffer
		 * conditionally, which is very likely to work.
		 *
		 * Alternatively, we could acquire the lock on otherBuffer before
		 * extending the relation, but that'd require holding the lock while
		 * performing IO, which seems worse than an unlikely retry.
		 */
		Assert(otherBuffer != buffer);
		Assert(targetBlock > otherBlock);

		if (unlikely(!ConditionalLockBuffer(otherBuffer)))
		{
			unlockedTargetBuffer = true;
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		recheckVmPins = true;
	}

	/*
	 * If one of the buffers was unlocked (always the case if otherBuffer is
	 * valid), it's possible, although unlikely, that an all-visible flag
	 * became set.  We can use GetVisibilityMapPins to deal with that. It's
	 * possible that GetVisibilityMapPins() might need to temporarily release
	 * buffer locks, in which case we'll need to check if there's still enough
	 * space on the page below.
	 */
	if (recheckVmPins)
	{
		if (GetVisibilityMapPins(relation, otherBuffer, buffer,
								 otherBlock, targetBlock, vmbuffer_other,
								 vmbuffer))
			unlockedTargetBuffer = true;
	}

	/*
	 * If the target buffer was temporarily unlocked since the relation
	 * extension, it's possible, although unlikely, that all the space on the
	 * page was already used. If so, we just retry from the start.  If we
	 * didn't unlock, something has gone wrong if there's not enough space -
	 * the test at the top should have prevented reaching this case.
	 */
	pageFreeSpace = PageGetHeapFreeSpace(page);
	if (len > pageFreeSpace)
	{
		if (unlockedTargetBuffer)
		{
			if (otherBuffer != InvalidBuffer)
				LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			UnlockReleaseBuffer(buffer);

			goto loop;
		}
		elog(PANIC, "tuple is too big: size %zu", len);
	}

	/*
	 * Remember the new page as our target for future insertions.
	 *
	 * XXX should we enter the new page into the free space map immediately,
	 * or just keep it for this backend's exclusive use in the short run
	 * (until VACUUM sees it)?  Seems to depend on whether you expect the
	 * current backend to make more insertions or not, which is probably a
	 * good bet most of the time.  So for now, don't add it to FSM yet.
	 */
	RelationSetTargetBlock(relation, targetBlock);

	return buffer;
}
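
/*
 * Editor's illustrative sketch (not part of the original file): assuming the
 * routine above is RelationGetBufferForTuple() from
 * src/backend/access/heap/hio.c, this shows roughly how a heap-insertion
 * path consumes its result.  The wrapper name example_place_tuple() is
 * hypothetical; the real callers (heap_insert() and friends) additionally
 * handle tuple header setup, WAL logging, and visibility-map maintenance,
 * all omitted here.
 */
#ifdef NOT_USED
static void
example_place_tuple(Relation relation, HeapTuple tup, int options,
					BulkInsertState bistate)
{
	Buffer		buffer;
	Buffer		vmbuffer = InvalidBuffer;

	/*
	 * Find (or create) a page with enough room; the buffer comes back pinned
	 * and exclusive-locked.  Passing 0 for num_pages lets the callee fall
	 * back to extending by a single page.
	 */
	buffer = RelationGetBufferForTuple(relation, tup->t_len,
									   InvalidBuffer, options, bistate,
									   &vmbuffer, NULL, 0);

	/* Place the tuple and dirty the buffer; WAL logging is omitted here. */
	RelationPutHeapTuple(relation, buffer, tup, false);
	MarkBufferDirty(buffer);

	UnlockReleaseBuffer(buffer);
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
}
#endif							/* NOT_USED */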