1/*-------------------------------------------------------------------------
4 * Parse CSV/text/binary format for COPY FROM.
6 * This file contains routines to parse the text, CSV and binary input
7 * formats. The main entry point is NextCopyFrom(), which parses the
8 * next input line and returns it as Datums.
10 * In text/CSV mode, the parsing happens in multiple stages:
12 * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
15 * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 * places it into 'raw_buf'.
18 * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 * the data in 'raw_buf' from client to server encoding, placing the
20 * converted result in 'input_buf'.
22 * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 * It is responsible for finding the next newline marker, taking quote and
24 * escape characters into account according to the COPY options. The line
25 * is copied into 'line_buf', with quotes and escape characters still intact.
28 * 4. CopyReadAttributesText/CSV() function takes the input line from
29 * 'line_buf', and splits it into fields, unescaping the data as required.
30 * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 * pointers to each field.
33 * If encoding conversion is not required, a shortcut is taken in step 2 to
34 * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 * the data is valid in the current encoding.
39 * In binary mode, the pipeline is much simpler. Input is loaded into
40 * 'raw_buf', and encoding conversion is done in the datatype-specific
41 * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43 * data when it's passed to the receive function.
45 * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 * and 'attribute_buf' are expanded on demand, to hold the longest line
50 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
51 * Portions Copyright (c) 1994, Regents of the University of California
55 * src/backend/commands/copyfromparse.c
57 *-------------------------------------------------------------------------
/*
 * ISOCTAL(c): nonzero when c lies in the character range '0'..'7', i.e. c is
 * an octal digit.  The argument is expanded twice, so do not pass an
 * expression with side effects.
 */
78 #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
/*
 * OCTVALUE(c): c minus '0', which is the numeric value of c when c is an
 * octal digit character.  The macro performs no range check itself.
 */
79 #define OCTVALUE(c) ((c) - '0')
82 * These macros centralize code used to process line_buf and input_buf buffers.
83 * They are macros because they often do continue/break control and to avoid
84 * function call overhead in tight COPY loops.
86 * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
87 * prevent the continue/break processing from working. We end the "if (1)"
88 * with "else ((void) 0)" to ensure the "if" does not unintentionally match
89 * any "else" in the calling code, and to avoid any compiler warnings about
90 * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
94 * This keeps the character read at the top of the loop in the buffer
95 * even if there is more than one read-ahead.
97 #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
100 if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
102 input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
108/* This consumes the remainder of the buffer and breaks */
109 #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
112 if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
115 input_buf_ptr = copy_buf_len; /* consume the partial character */ \
116/* backslash just before EOF, treat as data char */ \
123 * Transfer any approved data to line_buf; must do this to be sure
124 * there is some room in input_buf.
126 #define REFILL_LINEBUF \
129 if (input_buf_ptr > cstate->input_buf_index) \
131 appendBinaryStringInfo(&cstate->line_buf, \
132 cstate->input_buf + cstate->input_buf_index, \
133 input_buf_ptr - cstate->input_buf_index); \
134 cstate->input_buf_index = input_buf_ptr; \
138/* NOTE: there's a copy of this in copyto.c */
142/* non-export function prototypes */
161/* Low-level communications functions */
163 int minread,
int maxread);
180 for (
i = 0;
i < natts;
i++)
185 /* We *must* flush here to ensure FE knows it can send. */
199 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
200 errmsg(
"COPY file signature not recognized")));
204 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
205 errmsg(
"invalid COPY file header (missing flags)")));
206 if ((tmp & (1 << 16)) != 0)
208 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
209 errmsg(
"invalid COPY file header (WITH OIDS)")));
211 if ((tmp >> 16) != 0)
213 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
214 errmsg(
"unrecognized critical flags in COPY file header")));
215 /* Header extension length */
219 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
220 errmsg(
"invalid COPY file header (missing length)")));
221 /* Skip extension header, if present */
226 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
227 errmsg(
"invalid COPY file header (wrong length)")));
232 * CopyGetData reads data from the source (file or frontend)
234 * We attempt to read at least minread, and at most maxread, bytes from
235 * the source. The actual number of bytes read is returned; if this is
236 * less than minread, EOF was detected.
238 * Note: when copying from the frontend, we expect a proper EOF mark per
239 * protocol; if the frontend simply drops the connection, we raise error.
240 * It seems unwise to allow the COPY IN to complete normally in that case.
242 * NB: no data conversion is applied here.
252 bytesread = fread(databuf, 1, maxread, cstate->
copy_file);
256 errmsg(
"could not read from COPY file: %m")));
261 while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
267 /* Try to receive another message */
277 (
errcode(ERRCODE_CONNECTION_FAILURE),
278 errmsg(
"unexpected EOF on client connection with an open transaction")));
279 /* Validate message type and set packet size limit */
293 (
errcode(ERRCODE_PROTOCOL_VIOLATION),
294 errmsg(
"unexpected message type 0x%02X during COPY from stdin",
296 maxmsglen = 0;
/* keep compiler quiet */
299 /* Now collect the message body */
302 (
errcode(ERRCODE_CONNECTION_FAILURE),
303 errmsg(
"unexpected EOF on client connection with an open transaction")));
305 /* ... and process it */
311 /* COPY IN correctly terminated by frontend */
316 (
errcode(ERRCODE_QUERY_CANCELED),
317 errmsg(
"COPY from stdin failed: %s",
324 * Ignore Flush/Sync for the convenience of client
325 * libraries (such as libpq) that may send those
326 * without noticing that the command they just
331 Assert(
false);
/* NOT REACHED */
338 databuf = (
void *) ((
char *) databuf + avail);
353 * These functions do apply some data conversion
357 * CopyGetInt32 reads an int32 that appears in network byte order
359 * Returns true if OK, false if EOF
368 *
val = 0;
/* suppress compiler warning */
376 * CopyGetInt16 reads an int16 that appears in network byte order
385 *
val = 0;
/* suppress compiler warning */
394 * Perform encoding conversion on data in 'raw_buf', writing the converted
395 * data into 'input_buf'.
397 * On entry, there must be some data to convert in 'raw_buf'.
403 * If the file and server encoding are the same, no encoding conversion is
404 * required. However, we still need to verify that the input is valid for
410 * When conversion is not required, input_buf and raw_buf are the
411 * same. raw_buf_len is the total number of bytes in the buffer, and
412 * input_buf_len tracks how many of those bytes have already been
419 if (unverifiedlen == 0)
422 * If no more raw data is coming, report the EOF to the caller.
430 * Verify the new data, including any residual unverified bytes from
434 cstate->
raw_buf + preverifiedlen,
439 * Could not verify anything.
441 * If there is no more raw input data coming, it means that there
442 * was an incomplete multi-byte sequence at the end. Also, if
443 * there's "enough" input left, we should be able to verify at
444 * least one character, and a failure to do so means that we've
445 * hit an invalid byte sequence.
456 * Encoding conversion is needed.
468 * If no more raw data is coming, report the EOF to the caller.
476 * First, copy down any unprocessed data.
492 * Do the conversion. This might stop short, if there is an invalid
493 * byte sequence in the input. We'll convert as much as we can in
496 * Note: Even if we hit an invalid byte sequence, we don't report the
497 * error until all the valid bytes have been consumed. The input
498 * might contain an end-of-input marker (\.), and we don't want to
499 * report an error if the invalid byte sequence is after the
500 * end-of-input marker. We might unnecessarily convert some data
501 * after the end-of-input marker as long as it's valid for the
502 * encoding, but that's harmless.
510 if (convertedlen == 0)
513 * Could not convert anything. If there is no more raw input data
514 * coming, it means that there was an incomplete multi-byte
515 * sequence at the end. Also, if there is plenty of input left,
516 * we should be able to convert at least one character, so a
517 * failure to do so must mean that we've hit a byte sequence
530 * Report an encoding or conversion error.
541 * Everything up to input_buf_len was successfully verified, and
542 * input_buf_len points to the invalid or incomplete character.
551 * raw_buf_index points to the invalid or untranslatable character. We
552 * let the conversion routine report the error, because it can provide
553 * a more specific error message than we could here. An earlier call
554 * to the conversion routine in CopyConvertBuf() detected that there
555 * is an error, now we call the conversion routine again with
556 * noError=false, to have it throw the error.
576 * The conversion routine should have reported an error, so this
577 * should not be reached.
579 elog(
ERROR,
"encoding conversion failed without error");
584 * Load more data from data source to raw_buf.
586 * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
587 * beginning of the buffer, and we load new data after that.
596 * In text mode, if encoding conversion is not required, raw_buf and
597 * input_buf point to the same buffer. Their len/index better agree, too.
607 * Copy down the unprocessed data if any.
617 * If raw_buf and input_buf are in fact the same buffer, adjust the
618 * input_buf variables, too.
630 cstate->
raw_buf[nbytes] =
'0円';
641 * CopyLoadInputBuf loads some more data into input_buf
643 * On return, at least one more input character is loaded into
644 * input_buf, or input_reached_eof is set.
646 * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
647 * of the buffer and then we load more data after that.
655 * The caller has updated input_buf_index to indicate how much of the
656 * input has been consumed and isn't needed anymore. If input_buf is the
657 * same physical area as raw_buf, update raw_buf_index accordingly.
668 /* If we now have some unconverted data, try to convert it */
671 /* If we now have some more input bytes ready, return them */
676 * If we reached an invalid byte sequence, or we're at an incomplete
677 * multi-byte character but there is no more raw input data, report
683 /* no more input, and everything has been converted */
687 /* Try to load more raw data */
696 * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
697 * and writes them to 'dest'. Returns the number of bytes read (which
698 * would be less than 'nbytes' only if we reach EOF).
703 int copied_bytes = 0;
707 /* Enough bytes are present in the buffer. */
710 copied_bytes = nbytes;
715 * Not enough bytes in the buffer, so must read from the file. Need
716 * to loop since 'nbytes' could be larger than the buffer size.
722 /* Load more data if buffer is empty. */
730 /* Transfer some bytes. */
735 copied_bytes += copy_bytes;
736 }
while (copied_bytes < nbytes);
743 * This function is exposed for use by extensions that read raw fields in the
744 * next line. See NextCopyFromRawFieldsInternal() for details.
754 * Workhorse for NextCopyFromRawFields().
756 * Read raw fields in the next line for COPY FROM in text or csv mode. Return
757 * false if no more lines.
759 * An internal temporary buffer is returned via 'fields'. It is valid until
760 * the next call of the function. Since the function returns all raw fields
761 * in the input file, 'nfields' could be different from the number of columns
764 * NOTE: the force_not_null option is not applied to the returned fields.
766 * We use pg_attribute_always_inline to reduce function call overhead
767 * and to help compilers to optimize away the 'is_csv' condition when called
768 * by internal functions such as CopyFromTextLikeOneRow().
776 /* only available for text or csv input */
779 /* on input check that the header line is correct if needed */
786 /* If set to "match", one header line is skipped */
792 for (
int i = 0;
i < lines_to_skip;
i++)
810 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
811 errmsg(
"wrong number of fields in header line: got %d, expected %d",
821 Assert(fldnum < cstate->max_fields);
826 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
827 errmsg(
"column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
833 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
834 errmsg(
"column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
835 fldnum, colName,
NameStr(attr->attname))));
846 /* Actually read the line into memory here */
850 * EOF at start of line means we're done. If we see EOF after some
851 * characters, we act as though it was newline followed by EOF, ie,
852 * process the line and then exit loop on next iteration.
857 /* Parse the line into de-escaped field values */
869 * Read next tuple from file for COPY FROM. Return false if no more tuples.
871 * 'econtext' is used to evaluate default expression for each column that is
872 * either not read from the file or is using the DEFAULT option of COPY FROM.
873 * It can be NULL when no default values are used, i.e. when all columns are
874 * read from the file, and DEFAULT option is unset.
876 * 'values' and 'nulls' arrays must be the same length as columns of the
877 * relation passed to BeginCopyFrom. This function fills the arrays.
887 int *defmap = cstate->
defmap;
891 num_phys_attrs = tupDesc->
natts;
893 /* Initialize all values for row to NULL */
895 MemSet(nulls,
true, num_phys_attrs *
sizeof(
bool));
898 /* Get one row from source */
903 * Now compute and insert any defaults available for the columns not
904 * provided by the input data. Anything not processed here or above will
907 for (
i = 0;
i < num_defaults;
i++)
910 * The caller must supply econtext and have switched into the
911 * per-tuple memory context in it.
923/* Implementation of the per-row callback for text format */
931/* Implementation of the per-row callback for CSV format */
940 * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
942 * We use pg_attribute_always_inline to reduce function call overhead
943 * and to help compilers to optimize away the 'is_csv' condition.
954 char **field_strings;
963 /* read raw fields in the next line */
967 /* check for overflowing fields */
968 if (attr_count > 0 && fldct > attr_count)
970 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
971 errmsg(
"extra data after last expected column")));
975 /* Loop to read the user attributes on the line. */
982 if (fieldno >= fldct)
984 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
985 errmsg(
"missing data for column \"%s\"",
987 string = field_strings[fieldno++];
992 /* ignore input field, leaving column as NULL */
998 if (
string == NULL &&
1002 * FORCE_NOT_NULL option is set and column is NULL - convert
1003 * it to the NULL string.
1011 * FORCE_NULL option is set and column matches the NULL
1012 * string. It must have been quoted, or otherwise the string
1013 * would already have been set to NULL. Convert it to NULL as
1028 /* We must have switched into the per-tuple memory context */
1029 Assert(econtext != NULL);
1036 * If ON_ERROR is specified with IGNORE, skip rows with soft errors
1052 * Since we emit line number and column info in the below
1053 * notice message, we suppress error context information other
1054 * than the relation name.
1065 errmsg(
"skipping row due to data type incompatibility at line %" PRIu64
" for column \"%s\": \"%s\"",
1073 errmsg(
"skipping row due to data type incompatibility at line %" PRIu64
" for column \"%s\": null input",
1077 /* reset relname_only */
1088 Assert(fieldno == attr_count);
1093/* Implementation of the per-row callback for binary format */
1112 /* EOF detected (end of file, or protocol-level EOF) */
1116 if (fld_count == -1)
1119 * Received EOF marker. Wait for the protocol-level EOF, and complain
1120 * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1121 * that we correctly handle CopyFail, if client chooses to send that
1122 * now. When copying from file, we could ignore the rest of the file
1123 * like in text mode, but we choose to be consistent with the COPY
1130 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1131 errmsg(
"received copy data after EOF marker")));
1135 if (fld_count != attr_count)
1137 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1138 errmsg(
"row field count is %d, expected %d",
1139 (
int) fld_count, attr_count)));
1160 * Read the next input line and stash it in line_buf.
1162 * Result is true if read was terminated by EOF, false if terminated
1163 * by newline. The terminating newline or EOF marker is not included
1164 * in the final value of line_buf.
1174 /* Parse data and transfer into line_buf */
1180 * Reached EOF. In protocol version 3, we should ignore anything
1181 * after \. up to the protocol end of copy data. (XXX maybe better
1182 * not to treat \. as special?)
1192 }
while (inbytes > 0);
1202 * If we didn't hit EOF, then we must have transferred the EOL marker
1203 * to line_buf along with the data. Get rid of it.
1227 /* shouldn't get here */
1233 /* Now it's safe to use the buffer in error messages */
1240 * CopyReadLineText - inner loop of CopyReadLine for text mode
1245 char *copy_input_buf;
1248 bool need_data =
false;
1249 bool hit_eof =
false;
1250 bool result =
false;
1253 bool in_quote =
false,
1254 last_was_esc =
false;
1256 char escapec =
'0円';
1262 /* ignore special escape processing if it's the same as quotec */
1263 if (quotec == escapec)
1268 * The objective of this loop is to transfer the entire next input line
1269 * into line_buf. Hence, we only care for detecting newlines (\r and/or
1270 * \n) and the end-of-copy marker (\.).
1272 * In CSV mode, \r and \n inside a quoted field are just part of the data
1273 * value and are put in line_buf. We keep just enough state to know if we
1274 * are currently in a quoted field or not.
1276 * The input has already been converted to the database encoding. All
1277 * supported server encodings have the property that all bytes in a
1278 * multi-byte sequence have the high bit set, so a multibyte character
1279 * cannot contain any newline or escape characters embedded in the
1280 * multibyte sequence. Therefore, we can process the input byte-by-byte,
1281 * regardless of the encoding.
1283 * For speed, we try to move data from input_buf to line_buf in chunks
1284 * rather than one character at a time. input_buf_ptr points to the next
1285 * character to examine; any characters from input_buf_index to
1286 * input_buf_ptr have been determined to be part of the line, but not yet
1287 * transferred to line_buf.
1289 * For a little extra speed within the loop, we copy input_buf and
1290 * input_buf_len into local variables.
1302 * Load more data if needed.
1304 * TODO: We could just force four bytes of read-ahead and avoid the
1305 * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(). That was
1306 * unsafe with the old v2 COPY protocol, but we don't support that
1309 if (input_buf_ptr >= copy_buf_len || need_data)
1314 /* update our local variables */
1320 * If we are completely out of data, break out of the loop,
1331 /* OK to fetch a character */
1332 prev_raw_ptr = input_buf_ptr;
1333 c = copy_input_buf[input_buf_ptr++];
1338 * If character is '\r', we may need to look ahead below. Force
1339 * fetch of the next character if we don't already have it. We
1340 * need to do this before changing CSV state, in case '\r' is also
1341 * the quote or escape character.
1349 * Dealing with quotes and escapes here is mildly tricky. If the
1350 * quote char is also the escape char, there's no problem - we
1351 * just use the char as a toggle. If they are different, we need
1352 * to ensure that we only take account of an escape inside a
1353 * quoted field and immediately preceding a quote char, and not
1354 * the second in an escape-escape sequence.
1356 if (in_quote &&
c == escapec)
1357 last_was_esc = !last_was_esc;
1358 if (
c == quotec && !last_was_esc)
1359 in_quote = !in_quote;
1361 last_was_esc =
false;
1364 * Updating the line count for embedded CR and/or LF chars is
1365 * necessarily a little fragile - this test is probably about the
1366 * best we can do. (XXX it's arguable whether we should do this
1367 * at all --- is cur_lineno a physical or logical count?)
1374 if (
c ==
'\r' && (!is_csv || !in_quote))
1376 /* Check for \r\n on first line, _and_ handle \r\n. */
1381 * If need more data, go back to loop top to load it.
1383 * Note that if we are at EOF, c will wind up as '\0' because
1384 * of the guaranteed pad of input_buf.
1389 c = copy_input_buf[input_buf_ptr];
1393 input_buf_ptr++;
/* eat newline */
1398 /* found \r, but no \n */
1401 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1403 errmsg(
"literal carriage return found in data") :
1404 errmsg(
"unquoted carriage return found in data"),
1406 errhint(
"Use \"\\r\" to represent carriage return.") :
1407 errhint(
"Use quoted CSV field to represent carriage return.")));
1410 * if we got here, it is the first line and we didn't find
1411 * \n, so don't consume the peeked character
1418 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1420 errmsg(
"literal carriage return found in data") :
1421 errmsg(
"unquoted carriage return found in data"),
1423 errhint(
"Use \"\\r\" to represent carriage return.") :
1424 errhint(
"Use quoted CSV field to represent carriage return.")));
1425 /* If reach here, we have found the line terminator */
1430 if (
c ==
'\n' && (!is_csv || !in_quote))
1434 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1436 errmsg(
"literal newline found in data") :
1437 errmsg(
"unquoted newline found in data"),
1439 errhint(
"Use \"\\n\" to represent newline.") :
1440 errhint(
"Use quoted CSV field to represent newline.")));
1442 /* If reach here, we have found the line terminator */
1447 * Process backslash, except in CSV mode where backslash is a normal
1450 if (
c ==
'\\' && !is_csv)
1458 * get next character
1459 * Note: we do not change c so if it isn't \., we can fall
1460 * through and continue processing.
1463 c2 = copy_input_buf[input_buf_ptr];
1467 input_buf_ptr++;
/* consume the '.' */
1470 /* Get the next character */
1472 /* if hit_eof, c2 will become '\0' */
1473 c2 = copy_input_buf[input_buf_ptr++];
1477 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1478 errmsg(
"end-of-copy marker does not match previous newline style")));
1479 else if (c2 !=
'\r')
1481 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1482 errmsg(
"end-of-copy marker is not alone on its line")));
1485 /* Get the next character */
1487 /* if hit_eof, c2 will become '\0' */
1488 c2 = copy_input_buf[input_buf_ptr++];
1490 if (c2 !=
'\r' && c2 !=
'\n')
1492 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1493 errmsg(
"end-of-copy marker is not alone on its line")));
1499 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1500 errmsg(
"end-of-copy marker does not match previous newline style")));
1503 * If there is any data on this line before the \., complain.
1508 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1509 errmsg(
"end-of-copy marker is not alone on its line")));
1512 * Discard the \. and newline, then report EOF.
1515 result =
true;
/* report EOF */
1521 * If we are here, it means we found a backslash followed by
1522 * something other than a period. In non-CSV mode, anything
1523 * after a backslash is special, so we skip over that second
1524 * character too. If we didn't do that \\. would be
1525 * considered an end-of-copy marker, while in non-CSV mode it is a
1526 * literal backslash followed by a period.
1531 }
/* end of outer loop */
1534 * Transfer any still-uncopied data to line_buf.
1542 * Return decimal value for a hexadecimal digit
1547 if (isdigit((
unsigned char) hex))
1554 * Parse the current line into separate attributes (fields),
1555 * performing de-escaping as needed.
1557 * The input is in line_buf. We use attribute_buf to hold the result
1558 * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1559 * string, or NULL when the input matches the null marker string.
1560 * This array is expanded as necessary.
1562 * (Note that the caller cannot check for nulls since the returned
1563 * string would be the post-de-escaping equivalent, which may look
1564 * the same as some valid data string.)
1566 * delim is the column delimiter string (must be just one byte for now).
1567 * null_print is the null marker string. Note that this is compared to
1568 * the pre-de-escaped input string.
1570 * The return value is the number of fields actually read.
1582 * We need a special case for zero-column tables: check that the input
1583 * line is empty, and return.
1589 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1590 errmsg(
"extra data after last expected column")));
1597 * The de-escaped attributes will certainly not be longer than the input
1598 * data line, so we can just force attribute_buf to be large enough and
1599 * then transfer data without any checks for enough space. We need to do
1600 * it this way because enlarging attribute_buf mid-stream would invalidate
1601 * pointers already stored into cstate->raw_fields[].
1607 /* set pointer variables for loop */
1611 /* Outer loop iterates over fields */
1615 bool found_delim =
false;
1619 bool saw_non_ascii =
false;
1621 /* Make sure there is enough space for the next value */
1629 /* Remember start of field on both input and output sides */
1630 start_ptr = cur_ptr;
1634 * Scan data for field.
1636 * Note that in this loop, we are scanning to locate the end of field
1637 * and also speculatively performing de-escaping. Once we find the
1638 * end-of-field, we can match the raw field contents against the null
1639 * marker string. Only after that comparison fails do we know that
1640 * de-escaping is actually the right thing to do; therefore we *must
1641 * not* throw any syntax errors before we've done the null-marker
1649 if (cur_ptr >= line_end_ptr)
1659 if (cur_ptr >= line_end_ptr)
1677 if (cur_ptr < line_end_ptr)
1684 if (cur_ptr < line_end_ptr)
1697 saw_non_ascii =
true;
1702 if (cur_ptr < line_end_ptr)
1704 char hexchar = *cur_ptr;
1706 if (isxdigit((
unsigned char) hexchar))
1711 if (cur_ptr < line_end_ptr)
1714 if (isxdigit((
unsigned char) hexchar))
1722 saw_non_ascii =
true;
1746 * in all other cases, take the char after '\'
1752 /* Add c to output string */
1756 /* Check whether raw input matched null marker */
1757 input_len = end_ptr - start_ptr;
1761 /* Check whether raw input matched default marker */
1767 /* fieldno is 0-indexed and attnum is 1-indexed */
1772 /* defaults contain entries for all physical attributes */
1781 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1782 errmsg(
"unexpected default marker in COPY data"),
1783 errdetail(
"Column \"%s\" has no default value.",
1790 * At this point we know the field is supposed to contain data.
1792 * If we de-escaped any non-7-bit-ASCII chars, make sure the
1793 * resulting string is valid data for the db encoding.
1803 /* Terminate attribute value in output area */
1804 *output_ptr++ =
'0円';
1807 /* Done if we hit EOL instead of a delim */
1812 /* Clean up state of attribute_buf */
1814 Assert(*output_ptr ==
'0円');
1821 * Parse the current line into separate attributes (fields),
1822 * performing de-escaping as needed. This has exactly the same API as
1823 * CopyReadAttributesText, except we parse the fields according to
1824 * "standard" (i.e. common) CSV usage.
1838 * We need a special case for zero-column tables: check that the input
1839 * line is empty, and return.
1845 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1846 errmsg(
"extra data after last expected column")));
1853 * The de-escaped attributes will certainly not be longer than the input
1854 * data line, so we can just force attribute_buf to be large enough and
1855 * then transfer data without any checks for enough space. We need to do
1856 * it this way because enlarging attribute_buf mid-stream would invalidate
1857 * pointers already stored into cstate->raw_fields[].
1863 /* set pointer variables for loop */
1867 /* Outer loop iterates over fields */
1871 bool found_delim =
false;
1872 bool saw_quote =
false;
1877 /* Make sure there is enough space for the next value */
1885 /* Remember start of field on both input and output sides */
1886 start_ptr = cur_ptr;
1890 * Scan data for field.
1892 * The loop starts in "not quote" mode and then toggles between that
1893 * and "in quote" mode. The loop exits normally if it is in "not
1894 * quote" mode and a delimiter or line end is seen.
1904 if (cur_ptr >= line_end_ptr)
1907 /* unquoted field delimiter */
1913 /* start of quoted field (or part of field) */
1919 /* Add c to output string */
1927 if (cur_ptr >= line_end_ptr)
1929 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1930 errmsg(
"unterminated CSV quoted field")));
1934 /* escape within a quoted field */
1938 * peek at the next char if available, and escape it if it
1939 * is an escape char or a quote char
1941 if (cur_ptr < line_end_ptr)
1943 char nextc = *cur_ptr;
1945 if (nextc == escapec || nextc == quotec)
1947 *output_ptr++ = nextc;
1955 * end of quoted field. Must do this test after testing for
1956 * escape in case quote char and escape char are the same
1957 * (which is the common case).
1962 /* Add c to output string */
1968 /* Terminate attribute value in output area */
1969 *output_ptr++ =
'0円';
1971 /* Check whether raw input matched null marker */
1972 input_len = end_ptr - start_ptr;
1976 /* Check whether raw input matched default marker */
1982 /* fieldno is 0-indexed and attnum is 1-indexed */
1987 /* defaults contain entries for all physical attributes */
1996 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1997 errmsg(
"unexpected default marker in COPY data"),
1998 errdetail(
"Column \"%s\" has no default value.",
2004 /* Done if we hit EOL instead of a delim */
2009 /* Clean up state of attribute_buf */
2011 Assert(*output_ptr ==
'0円');
2019 * Read a binary attribute
2031 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2032 errmsg(
"unexpected EOF in COPY data")));
2040 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2041 errmsg(
"invalid field size")));
2043 /* reset attribute_buf to empty, and load raw data in it */
2048 fld_size) != fld_size)
2050 (
errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2051 errmsg(
"unexpected EOF in COPY data")));
2056 /* Call the column type's binary input converter */
2058 typioparam, typmod);
2060 /* Trouble if it didn't eat the whole buffer */
2063 (
errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2064 errmsg(
"incorrect binary data format")));
void pgstat_progress_update_param(int index, int64 val)
static Datum values[MAXATTR]
#define IS_HIGHBIT_SET(ch)
#define pg_attribute_always_inline
#define MemSet(start, val, len)
char * CopyLimitPrintoutLength(const char *str)
#define RAW_BUF_BYTES(cstate)
#define INPUT_BUF_BYTES(cstate)
static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls, bool is_csv)
static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
bool CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
bool CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
static int CopyReadAttributesCSV(CopyFromState cstate)
static bool CopyGetInt16(CopyFromState cstate, int16 *val)
static void CopyConversionError(CopyFromState cstate)
static bool CopyGetInt32(CopyFromState cstate, int32 *val)
static void CopyLoadRawBuf(CopyFromState cstate)
static void CopyLoadInputBuf(CopyFromState cstate)
void ReceiveCopyBinaryHeader(CopyFromState cstate)
static int CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, Oid typioparam, int32 typmod, bool *isnull)
static int GetDecimalFromHex(char hex)
void ReceiveCopyBegin(CopyFromState cstate)
static bool CopyReadLineText(CopyFromState cstate, bool is_csv)
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen)
static int CopyReadAttributesText(CopyFromState cstate)
static const char BinarySignature[11]
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen)
static bool CopyReadLine(CopyFromState cstate, bool is_csv)
static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
bool CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
static void CopyConvertBuf(CopyFromState cstate)
bool NextCopyFrom(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
bool NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
int errcode_for_file_access(void)
int errdetail(const char *fmt,...)
int errhint(const char *fmt,...)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
static Datum ExecEvalExpr(ExprState *state, ExprContext *econtext, bool *isNull)
bool InputFunctionCallSafe(FmgrInfo *flinfo, char *str, Oid typioparam, int32 typmod, Node *escontext, Datum *result)
Datum ReceiveFunctionCall(FmgrInfo *flinfo, StringInfo buf, Oid typioparam, int32 typmod)
Assert(PointerIsAligned(start, uint64))
@ COPY_LOG_VERBOSITY_VERBOSE
#define COPY_HEADER_MATCH
#define COPY_HEADER_FALSE
if(TABLE==NULL||TABLE_index==NULL)
#define PQ_SMALL_MESSAGE_LIMIT
#define PQ_LARGE_MESSAGE_LIMIT
int GetDatabaseEncoding(void)
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
int pg_do_encoding_conversion_buf(Oid proc, int src_encoding, int dest_encoding, unsigned char *src, int srclen, unsigned char *dest, int destlen, bool noError)
void report_invalid_encoding(int encoding, const char *mbstr, int len)
void * repalloc(void *pointer, Size size)
void pfree(void *pointer)
MemoryContext CurrentMemoryContext
#define HOLD_CANCEL_INTERRUPTS()
#define RESUME_CANCEL_INTERRUPTS()
int namestrcmp(Name name, const char *str)
FormData_pg_attribute * Form_pg_attribute
static int list_length(const List *l)
static int list_nth_int(const List *list, int n)
#define MAX_CONVERSION_INPUT_LENGTH
unsigned char pg_ascii_tolower(unsigned char ch)
int pq_getmessage(StringInfo s, int maxlen)
void pq_startmsgread(void)
#define PROGRESS_COPY_BYTES_PROCESSED
#define PqMsg_CopyInResponse
#define RelationGetDescr(relation)
StringInfo makeStringInfo(void)
void resetStringInfo(StringInfo str)
void enlargeStringInfo(StringInfo str, int needed)
bool(* CopyFromOneRow)(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
copy_data_source_cb data_source_cb
const struct CopyFromRoutine * routine
StringInfoData attribute_buf
bool * convert_select_flags
ErrorSaveContext * escontext
MemoryContext ecxt_per_tuple_memory
static FormData_pg_attribute * TupleDescAttr(TupleDesc tupdesc, int i)
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
int pg_encoding_max_length(int encoding)