[フレーム]

syncscan.c

Go to the documentation of this file.

1/*-------------------------------------------------------------------------

2 *

3 * syncscan.c

4 * scan synchronization support

5 *

6 * When multiple backends run a sequential scan on the same table, we try

7 * to keep them synchronized to reduce the overall I/O needed. The goal is

8 * to read each page into shared buffer cache only once, and let all backends

9 * that take part in the shared scan process the page before it falls out of

10 * the cache.

11 *

12 * Since the "leader" in a pack of backends doing a seqscan will have to wait

13 * for I/O, while the "followers" don't, there is a strong self-synchronizing

14 * effect once we can get the backends examining approximately the same part

15 * of the table at the same time. Hence all that is really needed is to get

16 * a new backend beginning a seqscan to begin it close to where other backends

17 * are reading. We can scan the table circularly, from block X up to the

18 * end and then from block 0 to X-1, to ensure we visit all rows while still

19 * participating in the common scan.

20 *

21 * To accomplish that, we keep track of the scan position of each table, and

22 * start new scans close to where the previous scan(s) are. We don't try to

23 * do any extra synchronization to keep the scans together afterwards; some

24 * scans might progress much more slowly than others, for example if the

25 * results need to be transferred to the client over a slow network, and we

26 * don't want such queries to slow down others.

27 *

28 * There can realistically only be a few large sequential scans on different

29 * tables in progress at any time. Therefore we just keep the scan positions

30 * in a small LRU list which we scan every time we need to look up or update a

31 * scan position. The whole mechanism is only applied for tables exceeding

32 * a threshold size (but that is not the concern of this module).

33 *

34 * INTERFACE ROUTINES

35 * ss_get_location - return current scan location of a relation

36 * ss_report_location - update current scan location

37 *

38 *

41 *

42 * IDENTIFICATION

43 * src/backend/access/common/syncscan.c

44 *

45 *-------------------------------------------------------------------------

46 */

47#include "postgres.h"

48

49#include "access/syncscan.h"

50#include "miscadmin.h"

51#include "storage/lwlock.h"

52#include "storage/shmem.h"

53#include "utils/rel.h"

54

55

56/* GUC variables */

57#ifdef TRACE_SYNCSCAN

58bool trace_syncscan = false;

59#endif

60

61

62/*

63 * Size of the LRU list.

64 *

65 * Note: the code assumes that SYNC_SCAN_NELEM > 1.

66 *

67 * XXX: What's a good value? It should be large enough to hold the

68 * maximum number of large tables scanned simultaneously. But a larger value

69 * means more traversing of the LRU list when starting a new scan.

70 */

71 #define SYNC_SCAN_NELEM 20

72

73/*

74 * Interval between reports of the location of the current scan, in pages.

75 *

76 * Note: This should be smaller than the ring size (see buffer/freelist.c)

77 * we use for bulk reads. Otherwise a scan joining other scans might start

78 * from a page that's no longer in the buffer cache. This is a bit fuzzy;

79 * there's no guarantee that the new scan will read the page before it leaves

80 * the buffer cache anyway, and on the other hand the page is most likely

81 * still in the OS cache.

82 */

83 #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)

84

85

86/*

87 * The scan locations structure is essentially a doubly-linked LRU with head

88 * and tail pointer, but designed to hold a fixed maximum number of elements in

89 * fixed-size shared memory.

90 */

91 typedef struct ss_scan_location_t

92{

93 RelFileLocator relfilelocator; /* identity of a relation */

94 BlockNumber location; /* last-reported location in the relation */

95 } ss_scan_location_t;

96

97 typedef struct ss_lru_item_t

98{

99 struct ss_lru_item_t *prev;

100 struct ss_lru_item_t *next;

101 ss_scan_location_t location;

102 } ss_lru_item_t;

103

104 typedef struct ss_scan_locations_t

105{

106 ss_lru_item_t *head;

107 ss_lru_item_t *tail;

108 ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */

109 } ss_scan_locations_t;

110

111 #define SizeOfScanLocations(N) \

112 (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))

113

114/* Pointer to struct in shared memory */

115 static ss_scan_locations_t *scan_locations;

116

117/* prototypes for internal functions */

118static BlockNumber ss_search(RelFileLocator relfilelocator,

119 BlockNumber location, bool set);

120

121

122/*

123 * SyncScanShmemSize --- report amount of shared memory space needed

124 */

125Size

126 SyncScanShmemSize(void)

127{

128 return SizeOfScanLocations(SYNC_SCAN_NELEM);

129}

130

131/*

132 * SyncScanShmemInit --- initialize this module's shared memory

133 */

134void

135 SyncScanShmemInit(void)

136{

137 int i;

138 bool found;

139

140 scan_locations = (ss_scan_locations_t *)

141 ShmemInitStruct("Sync Scan Locations List",

142 SizeOfScanLocations(SYNC_SCAN_NELEM),

143 &found);

144

145 if (!IsUnderPostmaster)

146 {

147 /* Initialize shared memory area */

148 Assert(!found);

149

150 scan_locations->head = &scan_locations->items[0];

151 scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

152

153 for (i = 0; i < SYNC_SCAN_NELEM; i++)

154 {

155 ss_lru_item_t *item = &scan_locations->items[i];

156

157 /*

158 * Initialize all slots with invalid values. As scans are started,

159 * these invalid entries will fall off the LRU list and get

160 * replaced with real entries.

161 */

162 item->location.relfilelocator.spcOid = InvalidOid;

163 item->location.relfilelocator.dbOid = InvalidOid;

164 item->location.relfilelocator.relNumber = InvalidRelFileNumber;

165 item->location.location = InvalidBlockNumber;

166

167 item->prev = (i > 0) ?

168 (&scan_locations->items[i - 1]) : NULL;

169 item->next = (i < SYNC_SCAN_NELEM - 1) ?

170 (&scan_locations->items[i + 1]) : NULL;

171 }

172 }

173 else

174 Assert(found);

175}

176

177/*

178 * ss_search --- search the scan_locations structure for an entry with the

179 * given relfilelocator.

180 *

181 * If "set" is true, the location is updated to the given location. If no

182 * entry for the given relfilelocator is found, it will be created at the head

183 * of the list with the given location, even if "set" is false.

184 *

185 * In any case, the location after possible update is returned.

186 *

187 * Caller is responsible for having acquired suitable lock on the shared

188 * data structure.

189 */

190static BlockNumber

191 ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)

192{

193 ss_lru_item_t *item;

194

195 item = scan_locations->head;

196 for (;;)

197 {

198 bool match;

199

200 match = RelFileLocatorEquals(item->location.relfilelocator,

201 relfilelocator);

202

203 if (match || item->next == NULL)

204 {

205 /*

206 * If we reached the end of list and no match was found, take over

207 * the last entry

208 */

209 if (!match)

210 {

211 item->location.relfilelocator = relfilelocator;

212 item->location.location = location;

213 }

214 else if (set)

215 item->location.location = location;

216

217 /* Move the entry to the front of the LRU list */

218 if (item != scan_locations->head)

219 {

220 /* unlink */

221 if (item == scan_locations->tail)

222 scan_locations->tail = item->prev;

223 item->prev->next = item->next;

224 if (item->next)

225 item->next->prev = item->prev;

226

227 /* link */

228 item->prev = NULL;

229 item->next = scan_locations->head;

230 scan_locations->head->prev = item;

231 scan_locations->head = item;

232 }

233

234 return item->location.location;

235 }

236

237 item = item->next;

238 }

239

240 /* not reached */

241}

242

243/*

244 * ss_get_location --- get the optimal starting location for scan

245 *

246 * Returns the last-reported location of a sequential scan on the

247 * relation, or 0 if no valid location is found.

248 *

249 * We expect the caller has just done RelationGetNumberOfBlocks(), and

250 * so that number is passed in rather than computing it again. The result

251 * is guaranteed less than relnblocks (assuming that's > 0).

252 */

253BlockNumber

254 ss_get_location(Relation rel, BlockNumber relnblocks)

255{

256 BlockNumber startloc;

257

258 LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);

259 startloc = ss_search(rel->rd_locator, 0, false);

260 LWLockRelease(SyncScanLock);

261

262 /*

263 * If the location is not a valid block number for this scan, start at 0.

264 *

265 * This can happen if for instance a VACUUM truncated the table since the

266 * location was saved.

267 */

268 if (startloc >= relnblocks)

269 startloc = 0;

270

271#ifdef TRACE_SYNCSCAN

272 if (trace_syncscan)

273 elog(LOG,

274 "SYNC_SCAN: start \"%s\" (size %u) at %u",

275 RelationGetRelationName(rel), relnblocks, startloc);

276#endif

277

278 return startloc;

279}

280

281/*

282 * ss_report_location --- update the current scan location

283 *

284 * Writes an entry into the shared Sync Scan state of the form

285 * (relfilelocator, blocknumber), overwriting any existing entry for the

286 * same relfilelocator.

287 */

288void

289 ss_report_location(Relation rel, BlockNumber location)

290{

291#ifdef TRACE_SYNCSCAN

292 if (trace_syncscan)

293 {

294 if ((location % 1024) == 0)

295 elog(LOG,

296 "SYNC_SCAN: scanning \"%s\" at %u",

297 RelationGetRelationName(rel), location);

298 }

299#endif

300

301 /*

302 * To reduce lock contention, only report scan progress every N pages. For

303 * the same reason, don't block if the lock isn't immediately available.

304 * Missing a few updates isn't critical, it just means that a new scan

305 * that wants to join the pack will start a little bit behind the head of

306 * the scan. Hopefully the pages are still in OS cache and the scan

307 * catches up quickly.

308 */

309 if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)

310 {

311 if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))

312 {

313 (void) ss_search(rel->rd_locator, location, true);

314 LWLockRelease(SyncScanLock);

315 }

316#ifdef TRACE_SYNCSCAN

317 else if (trace_syncscan)

318 elog(LOG,

319 "SYNC_SCAN: missed update for \"%s\" at %u",

320 RelationGetRelationName(rel), location);

321#endif

322 }

323}

BlockNumber

uint32 BlockNumber

Definition: block.h:31

InvalidBlockNumber

#define InvalidBlockNumber

Definition: block.h:33

FLEXIBLE_ARRAY_MEMBER

#define FLEXIBLE_ARRAY_MEMBER

Definition: c.h:470

Size

size_t Size

Definition: c.h:610

LOG

#define LOG

Definition: elog.h:31

elog

#define elog(elevel,...)

Definition: elog.h:226

IsUnderPostmaster

bool IsUnderPostmaster

Definition: globals.c:120

Assert

Assert(PointerIsAligned(start, uint64))

i

int i

Definition: isn.c:77

LWLockAcquire

bool LWLockAcquire(LWLock *lock, LWLockMode mode)

Definition: lwlock.c:1174

LWLockRelease

void LWLockRelease(LWLock *lock)

Definition: lwlock.c:1894

LWLockConditionalAcquire

bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)

Definition: lwlock.c:1345

lwlock.h

LW_EXCLUSIVE

@ LW_EXCLUSIVE

Definition: lwlock.h:112

miscadmin.h

postgres.h

InvalidOid

#define InvalidOid

Definition: postgres_ext.h:37

rel.h

RelationGetRelationName

#define RelationGetRelationName(relation)

Definition: rel.h:548

RelFileLocatorEquals

#define RelFileLocatorEquals(locator1, locator2)

Definition: relfilelocator.h:89

InvalidRelFileNumber

#define InvalidRelFileNumber

Definition: relpath.h:26

ShmemInitStruct

void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)

Definition: shmem.c:387

shmem.h

RelFileLocator

Definition: relfilelocator.h:59

RelFileLocator::spcOid

Oid spcOid

Definition: relfilelocator.h:60

RelFileLocator::relNumber

RelFileNumber relNumber

Definition: relfilelocator.h:62

RelFileLocator::dbOid

Oid dbOid

Definition: relfilelocator.h:61

RelationData

Definition: rel.h:56

RelationData::rd_locator

RelFileLocator rd_locator

Definition: rel.h:57

ss_lru_item_t

Definition: syncscan.c:98

ss_lru_item_t::location

ss_scan_location_t location

Definition: syncscan.c:101

ss_lru_item_t::next

struct ss_lru_item_t * next

Definition: syncscan.c:100

ss_lru_item_t::prev

struct ss_lru_item_t * prev

Definition: syncscan.c:99

ss_scan_location_t

Definition: syncscan.c:92

ss_scan_location_t::relfilelocator

RelFileLocator relfilelocator

Definition: syncscan.c:93

ss_scan_location_t::location

BlockNumber location

Definition: syncscan.c:94

ss_scan_locations_t

Definition: syncscan.c:105

ss_scan_locations_t::head

ss_lru_item_t * head

Definition: syncscan.c:106

ss_scan_locations_t::items

ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]

Definition: syncscan.c:108

ss_scan_locations_t::tail

ss_lru_item_t * tail

Definition: syncscan.c:107

ss_lru_item_t

struct ss_lru_item_t ss_lru_item_t

ss_report_location

void ss_report_location(Relation rel, BlockNumber location)

Definition: syncscan.c:289

SyncScanShmemInit

void SyncScanShmemInit(void)

Definition: syncscan.c:135

scan_locations

static ss_scan_locations_t * scan_locations

Definition: syncscan.c:115

ss_get_location

BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks)

Definition: syncscan.c:254

ss_scan_locations_t

struct ss_scan_locations_t ss_scan_locations_t

SyncScanShmemSize

Size SyncScanShmemSize(void)

Definition: syncscan.c:126

ss_search

static BlockNumber ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)

Definition: syncscan.c:191

ss_scan_location_t

struct ss_scan_location_t ss_scan_location_t

SYNC_SCAN_NELEM

#define SYNC_SCAN_NELEM

Definition: syncscan.c:71

SizeOfScanLocations

#define SizeOfScanLocations(N)

Definition: syncscan.c:111

SYNC_SCAN_REPORT_INTERVAL

#define SYNC_SCAN_REPORT_INTERVAL

Definition: syncscan.c:83

syncscan.h

PostgreSQL Source Code: src/backend/access/common/syncscan.c Source File