/*-------------------------------------------------------------------------
 *
 * ginpostinglist.c
 *	  routines for dealing with posting lists.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gin/ginpostinglist.c
 *
 *-------------------------------------------------------------------------
 */
#ifdef USE_ASSERT_CHECKING
#define CHECK_ENCODING_ROUNDTRIP
#endif
/*
 * For encoding purposes, item pointers are represented as 64-bit unsigned
 * integers. The lowest 11 bits represent the offset number, and the next
 * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
 * only 43 low bits are used. (A sketch of the conversion helpers follows
 * the defines below.)
 *
 * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
 * 2^11 on all supported block sizes. We are frugal with the bits, because
 * smaller integers use fewer bytes in the varbyte encoding, saving disk
 * space. (If we get a new table AM in the future that wants to use the full
 * range of possible offset numbers, we'll need to change this.)
 *
 * These 43-bit integers are encoded using varbyte encoding. In each byte,
 * the 7 low bits contain data, while the highest bit is a continuation bit.
 * When the continuation bit is set, the next byte is part of the same
 * integer, otherwise this is the last byte of this integer. 43 bits need
 * at most 7 bytes in this encoding:
 *
 * 0XXXXXXX
 * 1XXXXXXX 0XXXXYYY
 * 1XXXXXXX 1XXXXYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
 *
 * X = bits used for offset number
 * Y = bits used for block number
 * u = unused bit
 *
 * The bytes are stored in little-endian order. (A small worked example
 * follows this comment block.)
 *
 * An important property of this encoding is that removing an item from the
 * list never increases the size of the resulting compressed posting list.
 * Proof:
 *
 * Removing an item actually replaces two numbers (deltas) with their sum. We
 * have to prove that the varbyte encoding of a sum can't be longer than the
 * varbyte encoding of its summands. The sum of two numbers is at most one bit
 * wider than the larger of the summands. Widening a number by one bit enlarges
 * its length in varbyte encoding by at most one byte. Therefore, the varbyte
 * encoding of the sum is at most one byte longer than the varbyte encoding of
 * the larger summand. The lesser summand takes at least one byte, so the sum
 * cannot take more space than the summands, Q.E.D.
 *
 * This property greatly simplifies VACUUM, which can assume that posting
 * lists always fit on the same page after vacuuming. Note that even though
 * that holds for removing items from a posting list, you must also be
 * careful not to cause expansion, e.g. when merging uncompressed items on
 * the page into the compressed lists during vacuuming.
 */
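/*
 * Illustrative worked example (editorial addition, not part of the original
 * sources): the delta 300 is binary 1 0010 1100, which does not fit in 7
 * bits, so it is encoded as two bytes in little-endian order: 0xAC
 * (continuation bit set, low 7 bits 0x2C) followed by 0x02 (continuation bit
 * clear, remaining bits). For the removal proof above: if adjacent deltas
 * 200 (two bytes: 0xC8 0x01) and 100 (one byte: 0x64) are merged by removing
 * the item between them, their sum 300 needs only two bytes, so the
 * compressed list never grows.
 */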
/*
 * How many bits do you need to encode an offset number? OffsetNumber is a
 * 16-bit integer, but you can't fit that many items on a page. 11 ought to
 * be more than enough. It's tempting to derive this from
 * MaxHeapTuplesPerPage, and use the minimum number of bits, but that would
 * require changing the on-disk format if MaxHeapTuplesPerPage changes.
 * Better to leave some slack.
 */
#define MaxHeapTuplesPerPageBits		11
/* Max. number of bytes needed to encode the largest supported integer. */
#define MaxBytesPerInteger				7
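/*
 * Sketch of the conversion between ItemPointers and the 43-bit integer
 * layout described in the header comment: block number in the high bits,
 * offset number in the low MaxHeapTuplesPerPageBits bits. The Gin*
 * item-pointer accessors come from access/ginblock.h; treat this as an
 * illustration of the layout rather than a verbatim copy of the file's
 * helpers.
 */
static uint64
itemptr_to_uint64(const ItemPointer iptr)
{
    uint64      val;

    Assert(ItemPointerIsValid(iptr));
    Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));

    val = GinItemPointerGetBlockNumber(iptr);
    val <<= MaxHeapTuplesPerPageBits;
    val |= GinItemPointerGetOffsetNumber(iptr);

    return val;
}

static void
uint64_to_itemptr(uint64 val, ItemPointer iptr)
{
    GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
    val = val >> MaxHeapTuplesPerPageBits;
    GinItemPointerSetBlockNumber(iptr, val);

    Assert(ItemPointerIsValid(iptr));
}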
/* Varbyte-encode 'val' into *ptr. *ptr is incremented to the next integer. */
static void
encode_varbyte(uint64 val, unsigned char **ptr)
{
    unsigned char *p = *ptr;

    while (val > 0x7F)
    {
        *(p++) = 0x80 | (val & 0x7F);
        val >>= 7;
    }
    *(p++) = (unsigned char) val;
    *ptr = p;
}
/*
 * Decode varbyte-encoded integer at *ptr. *ptr is incremented to the next
 * integer.
 */
    unsigned char *p = *ptr;

    val |= (c & 0x7F) << 7;     /* 2nd byte */
    val |= (c & 0x7F) << 14;    /* 3rd byte */
    val |= (c & 0x7F) << 21;    /* 4th byte */
    val |= (c & 0x7F) << 28;    /* 5th byte */
    val |= (c & 0x7F) << 35;    /* 6th byte */
    /* 7th byte, should not have continuation bit */
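/*
 * The decoder unrolls the byte-by-byte steps excerpted above, one per
 * possible byte. The following is a compact, behaviorally equivalent sketch
 * (an illustration under an assumed name, not the verbatim implementation):
 * each test checks the continuation bit of the byte read most recently, so
 * decoding stops at the first byte whose high bit is clear.
 */
static uint64
decode_varbyte_sketch(unsigned char **ptr)
{
    unsigned char *p = *ptr;
    uint64      val;
    uint64      c;

    c = *(p++);
    val = c & 0x7F;                                         /* 1st byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 7; }   /* 2nd byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 14; }  /* 3rd byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 21; }  /* 4th byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 28; }  /* 5th byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 35; }  /* 6th byte */
    if (c & 0x80)
    {
        /* 7th byte, should not have a continuation bit */
        c = *(p++);
        val |= c << 42;
        Assert((c & 0x80) == 0);
    }

    *ptr = p;
    return val;
}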
/*
 * Encode a posting list.
 *
 * The encoded list is returned in a palloc'd struct, which will be at most
 * 'maxsize' bytes in size. The number of items in the returned segment is
 * returned in *nwritten. If it's not equal to nipd, not all the items fit
 * in 'maxsize', and only the first *nwritten were encoded.
 *
 * The allocated size of the returned struct is short-aligned, and the padding
 * byte at the end, if any, is zero.
 */
    unsigned char *endptr;

    /* Store the first special item */
    result->first = ipd[0];

    endptr = result->bytes + maxbytes;
    for (totalpacked = 1; totalpacked < nipd; totalpacked++)

            /*
             * There are fewer than 7 bytes left. Have to check if the next
             * item fits in that space before writing it out.
             */
            unsigned char *p = buf;

            if (p - buf > (endptr - ptr))
                break;          /* output is full */
            memcpy(ptr, buf, p - buf);

    /*
     * If we wrote an odd number of bytes, zero out the padding byte at the
     * end.
     */
    *nwritten = totalpacked;
    /*
     * Check that the encoded segment decodes back to the original items.
     */
#if defined (CHECK_ENCODING_ROUNDTRIP)
        Assert(ndecoded == totalpacked);
#endif
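/*
 * Hypothetical caller sketch (an editorial illustration, not from the
 * original file): compress a sorted array of item pointers into fixed-size
 * segments, looping until everything has been written. The 256-byte segment
 * size is an arbitrary example; it is assumed to be large enough that at
 * least one item fits per call, as the real GIN callers guarantee.
 */
static void
compress_all_items_example(ItemPointerData *items, int nitems)
{
    ItemPointerData *next = items;
    int         remaining = nitems;

    while (remaining > 0)
    {
        int         nwritten;
        GinPostingList *seg;

        /* encode as many of the remaining items as fit in 256 bytes */
        seg = ginCompressPostingList(next, remaining, 256, &nwritten);

        /* ... store SizeOfGinPostingList(seg) bytes of 'seg' somewhere ... */
        pfree(seg);

        next += nwritten;
        remaining -= nwritten;
    }
}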
/*
 * Decode a compressed posting list into an array of item pointers.
 * The number of items is returned in *ndecoded.
 */
/*
 * Decode multiple posting list segments into an array of item pointers.
 * The number of items is returned in *ndecoded_out. The segments are stored
 * one after another, with total size 'len' bytes.
 */
    char       *endseg = ((char *) segment) + len;
    unsigned char *endptr;

    /*
     * Guess an initial size of the array.
     */
    nallocated = segment->nbytes * 2 + 1;

    while ((char *) segment < endseg)

        /* enlarge output array if needed */
        if (ndecoded >= nallocated)

        /* copy the first item */
        result[ndecoded] = segment->first;

        ptr = segment->bytes;

            /* enlarge output array if needed */
            if (ndecoded >= nallocated)

    *ndecoded_out = ndecoded;
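/*
 * Hypothetical caller sketch (an editorial illustration, not from the
 * original file): decode a single segment back into an ItemPointerData array
 * and walk the items. The result array is palloc'd by the decode function
 * and must be freed by the caller.
 */
static void
decode_segment_example(GinPostingList *seg)
{
    int         ndecoded;
    ItemPointer items = ginPostingListDecode(seg, &ndecoded);

    for (int i = 0; i < ndecoded; i++)
    {
        BlockNumber blkno = ItemPointerGetBlockNumber(&items[i]);
        OffsetNumber offnum = ItemPointerGetOffsetNumber(&items[i]);

        /* ... use (blkno, offnum) ... */
        (void) blkno;
        (void) offnum;
    }
    pfree(items);
}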
/*
 * Add all item pointers from a bunch of posting lists to a TIDBitmap.
 */
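/*
 * Sketch of the likely shape of that routine (an illustration under an
 * assumed name, not the verbatim implementation): decode every segment, hand
 * the item pointers to the TIDBitmap, and free the temporary array.
 * tbm_add_tuples() is the regular TIDBitmap API from nodes/tidbitmap.h.
 */
static int
decode_segments_to_tbm_sketch(GinPostingList *ptr, int len, TIDBitmap *tbm)
{
    int         ndecoded;
    ItemPointer items;

    items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
    tbm_add_tuples(tbm, items, ndecoded, false);
    pfree(items);

    return ndecoded;
}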
/*
 * Merge two ordered arrays of itempointers, eliminating any duplicates.
 *
 * Returns a palloc'd array, and *nmerged is set to the number of items in
 * the result, after eliminating duplicates.
 */
    /*
     * If the argument arrays don't overlap, we can just append them to each
     * other.
     */

    while (aptr - a < na && bptr - b < nb)

            /* only keep one copy of the identical items */

    while (aptr - a < na)

    while (bptr - b < nb)

    *nmerged = dptr - dst;
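/*
 * Hypothetical caller sketch (an editorial illustration, not from the
 * original file): merge two sorted TID arrays and consume the result. Both
 * inputs must already be in ascending item pointer order; the returned array
 * is palloc'd and the caller is responsible for freeing it.
 */
static void
merge_example(ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb)
{
    int         nmerged;
    ItemPointer merged = ginMergeItemPointers(a, na, b, nb, &nmerged);

    /* duplicates are eliminated, so the result can only be smaller */
    Assert(nmerged <= (int) (na + nb));

    /* ... use 'merged' ... */
    pfree(merged);
}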