1/*-------------------------------------------------------------------------
4 * routines for manipulating inversion fs large objects. This file
5 * contains the user-level large object application interface routines.
8 * Note: we access pg_largeobject.data using its C struct declaration.
9 * This is safe because it immediately follows pageno which is an int4 field,
10 * and therefore the data field will always be 4-byte aligned, even if it
11 * is in the short 1-byte-header format. We have to detoast it since it's
12 * quite likely to be in compressed or short format. We also need to check
13 * for NULLs, since initdb will mark loid and pageno but not data as NOT NULL.
15 * Note: many of these routines leak memory in CurrentMemoryContext, as indeed
16 * does most of the backend code. We expect that CurrentMemoryContext will
17 * be a short-lived context. Data that must persist across function calls
18 * is kept either in CacheMemoryContext (the Relation structs) or in the
19 * memory context given to inv_open (for LargeObjectDesc structs).
22 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
23 * Portions Copyright (c) 1994, Regents of the University of California
27 * src/backend/storage/large_object/inv_api.c
29 *-------------------------------------------------------------------------
48#include "utils/fmgroids.h"
54 * GUC: backwards-compatibility flag to suppress LO permission checks
59 * All accesses to pg_largeobject and its index make use of a single
60 * Relation reference. To guarantee that the relcache entry remains
61 * in the cache, on the first reference inside a subtransaction, we
62 * execute a slightly kludgy maneuver to assign ownership of the
63 * Relation reference to TopTransactionResourceOwner.
70 * Open pg_largeobject and its index, if not already done in current xact
78 return;
/* already open in current xact */
80 /* Arrange for the top xact to own these relation references */
84 /* Use RowExclusiveLock since we might either read or write */
94 * Clean up at main transaction end
102 * Only bother to close if committing; else abort cleanup will handle it.
126 * Extract data field from a pg_largeobject tuple, detoasting if needed
127 * and verifying that the length is sane. Returns data pointer (a bytea *),
128 * data length, and an indication of whether to pfree the data pointer.
140 datafield = &(tuple->data);
/* see note at top of file */
144 datafield = (
bytea *)
152 errmsg(
"pg_largeobject entry for OID %u, page %d has invalid data field size %d",
153 tuple->loid, tuple->pageno,
len)));
154 *pdatafield = datafield;
161 * inv_create -- create a new large object
164 * lobjId - OID to use for new large object, or InvalidOid to pick one
169 * If lobjId is not InvalidOid, then an error occurs if the OID is already in use.
178 * Create a new largeobject with empty data pages
183 * dependency on the owner of largeobject
185 * Note that LO dependencies are recorded using classId
186 * LargeObjectRelationId for backwards-compatibility reasons. Using
187 * LargeObjectMetadataRelationId instead would simplify matters for the
188 * backend, but it'd complicate pg_dump and possibly break other clients.
193 /* Post creation hook for new large object */
197 * Advance command counter to make new tuple visible to later operations.
205 * inv_open -- access an existing large object.
207 * Returns a large object descriptor, appropriately filled in.
208 * The descriptor and subsidiary data are allocated in the specified
209 * memory context, which must be suitably long-lived for the caller's
210 * purposes. If the returned descriptor has a snapshot associated
211 * with it, the caller must ensure that it also lives long enough,
212 * e.g. by calling RegisterSnapshotOnOwner.
222 * Historically, no difference is made between (INV_WRITE) and (INV_WRITE
223 * | INV_READ), the caller being allowed to read the large object
224 * descriptor in either case.
233 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
234 errmsg(
"invalid flags for opening a large object: %d",
237 /* Get snapshot. If write is requested, use an instantaneous snapshot. */
243 /* Can't use LargeObjectExists here because we need to specify snapshot */
246 (
errcode(ERRCODE_UNDEFINED_OBJECT),
247 errmsg(
"large object %u does not exist", lobjId)));
249 /* Apply permission checks, again specifying snapshot */
258 (
errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
259 errmsg(
"permission denied for large object %u",
270 (
errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
271 errmsg(
"permission denied for large object %u",
275 /* OK to create a descriptor */
280 retval->
flags = descflags;
282 /* caller sets if needed, not used by the functions in this file */
286 * The snapshot (if any) is just the currently active snapshot. The
287 * caller will replace it with a longer-lived copy if needed.
295 * Closes a large object descriptor previously made by inv_open(), and
296 * releases the long-term memory used by it.
306 * Destroys an existing large object (not to be confused with a descriptor!)
308 * Note we expect caller to have done any required permissions check.
316 * Delete any comments and dependencies on the large object
318 object.
classId = LargeObjectRelationId;
319 object.objectId = lobjId;
320 object.objectSubId = 0;
324 * Advance command counter so that tuple removal will be seen by later
325 * large-object operations in this transaction.
329 /* For historical reasons, we always return 1 on success. */
334 * Determine size of a large object
336 * NOTE: LOs can contain gaps, just like Unix files. We actually return
337 * the offset of the last byte + 1.
352 Anum_pg_largeobject_loid,
360 * Because the pg_largeobject index is on both loid and pageno, but we
361 * constrain only loid, a backwards scan should visit all pages of the
362 * large object in reverse pageno order. So, it's sufficient to examine
363 * the first valid tuple (== last valid page).
374 elog(
ERROR,
"null field found in pg_largeobject");
395 * We allow seek/tell if you have either read or write permission, so no
396 * need for a permission check here.
400 * Note: overflow in the additions is possible, but since we will reject
401 * negative results, we don't need any extra test for that.
409 newoffset = obj_desc->
offset + offset;
416 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
417 errmsg(
"invalid whence setting: %d", whence)));
418 newoffset = 0;
/* keep compiler quiet */
423 * use errmsg_internal here because we don't want to expose INT64_FORMAT
424 * in translatable strings; doing better is not worth the trouble
428 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
432 obj_desc->
offset = newoffset;
442 * We allow seek/tell if you have either read or write permission, so no
443 * need for a permission check here.
467 (
errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
468 errmsg(
"permission denied for large object %u",
477 Anum_pg_largeobject_loid,
482 Anum_pg_largeobject_pageno,
496 elog(
ERROR,
"null field found in pg_largeobject");
500 * We expect the indexscan will deliver pages in order. However,
501 * there may be missing pages if the LO contains unwritten "holes". We
502 * want missing sections to read out as zeroes.
505 if (pageoff > obj_desc->
offset)
507 n = pageoff - obj_desc->
offset;
508 n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
517 off = (int) (obj_desc->
offset - pageoff);
524 n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
525 memcpy(
buf + nread,
VARDATA(datafield) + off, n);
560 /* this is to make the union big enough for a LO data chunk: */
562 /* ensure union is aligned well enough: */
565 char *workb =
VARDATA(&workbuf.hdr);
568 bool nulls[Natts_pg_largeobject];
569 bool replace[Natts_pg_largeobject];
575 /* enforce writability because snapshot is probably wrong otherwise */
578 (
errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
579 errmsg(
"permission denied for large object %u",
585 /* this addition can't overflow because nbytes is only int32 */
588 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
589 errmsg(
"invalid large object write request size: %d",
597 Anum_pg_largeobject_loid,
602 Anum_pg_largeobject_pageno,
613 while (nwritten < nbytes)
616 * If possible, get next pre-existing page of the LO. We expect the
617 * indexscan will deliver these in order --- but there may be holes.
624 elog(
ERROR,
"null field found in pg_largeobject");
626 Assert(olddata->pageno >= pageno);
628 neednextpage =
false;
632 * If we have a pre-existing page, see if it is the page we want to
633 * write, or a later one.
635 if (olddata != NULL && olddata->pageno == pageno)
638 * Update an existing page with fresh data.
640 * First, load old data into workbuf
655 * Insert appropriate portion of new data
658 n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
659 memcpy(workb + off,
buf + nwritten, n);
663 /* compute valid length of new page */
668 * Form and insert updated tuple
671 memset(nulls,
false,
sizeof(nulls));
672 memset(replace,
false,
sizeof(replace));
674 replace[Anum_pg_largeobject_data - 1] =
true;
682 * We're done with this old page.
691 * Write a brand new page.
693 * First, fill any hole
700 * Insert appropriate portion of new data
703 n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
704 memcpy(workb + off,
buf + nwritten, n);
707 /* compute valid length of new page */
712 * Form and insert updated tuple
715 memset(nulls,
false,
sizeof(nulls));
731 * Advance command counter so that my tuple updates will be seen by later
732 * large-object operations in this transaction.
751 /* this is to make the union big enough for a LO data chunk: */
753 /* ensure union is aligned well enough: */
756 char *workb =
VARDATA(&workbuf.hdr);
759 bool nulls[Natts_pg_largeobject];
760 bool replace[Natts_pg_largeobject];
765 /* enforce writability because snapshot is probably wrong otherwise */
768 (
errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
769 errmsg(
"permission denied for large object %u",
773 * use errmsg_internal here because we don't want to expose INT64_FORMAT
774 * in translatable strings; doing better is not worth the trouble
778 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
787 * Set up to find all pages with desired loid and pageno >= target
790 Anum_pg_largeobject_loid,
795 Anum_pg_largeobject_pageno,
803 * If possible, get the page the truncation point is in. The truncation
804 * point may be beyond the end of the LO or in a hole.
810 elog(
ERROR,
"null field found in pg_largeobject");
812 Assert(olddata->pageno >= pageno);
816 * If we found the page of the truncation point we need to truncate the
817 * data in it. Otherwise if we're in a hole, we need to create a page to
818 * mark the end of data.
820 if (olddata != NULL && olddata->pageno == pageno)
822 /* First, load old data into workbuf */
828 memcpy(workb,
VARDATA(datafield), pagelen);
837 MemSet(workb + pagelen, 0, off - pagelen);
839 /* compute length of new page */
843 * Form and insert updated tuple
846 memset(nulls,
false,
sizeof(nulls));
847 memset(replace,
false,
sizeof(replace));
849 replace[Anum_pg_largeobject_data - 1] =
true;
859 * If the first page we found was after the truncation point, we're in
860 * a hole that we'll fill, but we need to delete the later page
861 * because the loop below won't visit it again.
865 Assert(olddata->pageno > pageno);
870 * Write a brand new page.
872 * Fill the hole up to the truncation point
878 /* compute length of new page */
882 * Form and insert new tuple
885 memset(nulls,
false,
sizeof(nulls));
895 * Delete any pages after the truncation point. If the initial search
896 * didn't find a page, then of course there's nothing more to do.
911 * Advance command counter so that tuple updates will be seen by later
912 * large-object operations in this transaction.
AclResult pg_largeobject_aclcheck_snapshot(Oid lobj_oid, Oid roleid, AclMode mode, Snapshot snapshot)
static Datum values[MAXATTR]
#define InvalidSubTransactionId
#define MemSet(start, val, len)
void performDeletion(const ObjectAddress *object, DropBehavior behavior, int flags)
struct varlena * detoast_attr(struct varlena *attr)
int errmsg_internal(const char *fmt,...)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
SysScanDesc systable_beginscan_ordered(Relation heapRelation, Relation indexRelation, Snapshot snapshot, int nkeys, ScanKey key)
void systable_endscan_ordered(SysScanDesc sysscan)
HeapTuple systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction)
Assert(PointerIsAligned(start, uint64))
HeapTuple heap_modify_tuple(HeapTuple tuple, TupleDesc tupleDesc, const Datum *replValues, const bool *replIsnull, const bool *doReplace)
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
void heap_freetuple(HeapTuple htup)
#define HeapTupleIsValid(tuple)
static bool HeapTupleHasNulls(const HeapTupleData *tuple)
static void * GETSTRUCT(const HeapTupleData *tuple)
void index_close(Relation relation, LOCKMODE lockmode)
Relation index_open(Oid relationId, LOCKMODE lockmode)
void CatalogTupleInsertWithInfo(Relation heapRel, HeapTuple tup, CatalogIndexState indstate)
void CatalogCloseIndexes(CatalogIndexState indstate)
CatalogIndexState CatalogOpenIndexes(Relation heapRel)
void CatalogTupleDelete(Relation heapRel, ItemPointer tid)
void CatalogTupleUpdateWithInfo(Relation heapRel, ItemPointer otid, HeapTuple tup, CatalogIndexState indstate)
LargeObjectDesc * inv_open(Oid lobjId, int flags, MemoryContext mcxt)
static Relation lo_index_r
void inv_truncate(LargeObjectDesc *obj_desc, int64 len)
int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
static void getdatafield(Form_pg_largeobject tuple, bytea **pdatafield, int *plen, bool *pfreeit)
Oid inv_create(Oid lobjId)
static Relation lo_heap_r
static void open_lo_relation(void)
int64 inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence)
void close_lo_relation(bool isCommit)
int inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes)
int64 inv_tell(LargeObjectDesc *obj_desc)
bool lo_compat_privileges
void inv_close(LargeObjectDesc *obj_desc)
static uint64 inv_getsize(LargeObjectDesc *obj_desc)
if(TABLE==NULL||TABLE_index==NULL)
#define MAX_LARGE_OBJECT_SIZE
void * MemoryContextAlloc(MemoryContext context, Size size)
void pfree(void *pointer)
#define InvokeObjectPostCreateHook(classId, objectId, subId)
#define ERRCODE_DATA_CORRUPTED
bool LargeObjectExistsWithSnapshot(Oid loid, Snapshot snapshot)
Oid LargeObjectCreate(Oid loid)
FormData_pg_largeobject * Form_pg_largeobject
void recordDependencyOnOwner(Oid classId, Oid objectId, Oid owner)
static Datum PointerGetDatum(const void *X)
static Datum ObjectIdGetDatum(Oid X)
static Datum Int32GetDatum(int32 X)
#define RelationGetDescr(relation)
ResourceOwner TopTransactionResourceOwner
ResourceOwner CurrentResourceOwner
void ScanKeyInit(ScanKey entry, AttrNumber attributeNumber, StrategyNumber strategy, RegProcedure procedure, Datum argument)
Snapshot GetActiveSnapshot(void)
#define BTEqualStrategyNumber
#define BTGreaterEqualStrategyNumber
void table_close(Relation relation, LOCKMODE lockmode)
Relation table_open(Oid relationId, LOCKMODE lockmode)
static bool VARATT_IS_EXTENDED(const void *PTR)
static Size VARSIZE(const void *PTR)
static char * VARDATA(const void *PTR)
static void SET_VARSIZE(void *PTR, Size len)
void CommandCounterIncrement(void)