index 0543c574c67eb721fd65e8b9a07d83b035d7548d..fc279b3125b144448e2a1551767a562c1568a1f4 100644 (file)
@@ -1030,7 +1030,7 @@ pg_mbcliplen(const char *mbstr, int len, int limit)
}
/*
- * pg_mbcliplen with specified encoding
+ * pg_mbcliplen with specified encoding; string must be valid in encoding
*/
int
pg_encoding_mbcliplen(int encoding, const char *mbstr,
* report_invalid_encoding: complain about invalid multibyte character
*
* note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
+ * len must be greater than zero (or we'd neglect initializing "buf").
*/
void
report_invalid_encoding(int encoding, const char *mbstr, int len)
{
- int l = pg_encoding_mblen(encoding, mbstr);
+ int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
char buf[8 * 5 + 1];
char *p = buf;
int j,
@@ -1673,18 +1673,26 @@ report_invalid_encoding(int encoding, const char *mbstr, int len)
* report_untranslatable_char: complain about untranslatable character
*
* note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
+ * len must be greater than zero (or we'd neglect initializing "buf").
*/
void
report_untranslatable_char(int src_encoding, int dest_encoding,
const char *mbstr, int len)
{
- int l = pg_encoding_mblen(src_encoding, mbstr);
+ int l;
char buf[8 * 5 + 1];
char *p = buf;
int j,
jlimit;
+ /*
+ * We probably could use plain pg_encoding_mblen(), because
+ * gb18030_to_utf8() verifies before it converts. All conversions should.
+ * For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs. Even
+ * so, be defensive, since a buggy conversion might pass invalid data.
+ * This is not a performance-critical path.
+ */
+ l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
jlimit = Min(l, len);
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
index 2da0be0ee914bb3d1c428c5c2afac71d1b22abf2..808e065e876bcba125fd579e15b3ea0d7cc016a9 100644 (file)
} while (0)
#define FAIL_AT_CHAR_END(code) \
do { \
- char *term = s + pg_encoding_mblen(lex->input_encoding, s); \
- lex->token_terminator = (term <= end) ? term : end; \
+ ptrdiff_t remaining = end - s; \
+ int charlen; \
+ charlen = pg_encoding_mblen_or_incomplete(lex->input_encoding, \
+ s, remaining); \
+ lex->token_terminator = (charlen <= remaining) ? s + charlen : end; \
return code; \
} while (0)
index 4f3621fef126d6ea61b18e51d069060b9b62ac70..0d9eca700325c0bbaaf38bf1a8426df85fb6b2d8 100644 (file)
*/
#include "c.h"
+#include <limits.h>
+
#include "mb/pg_wchar.h"
#include "utils/ascii.h"
/*
* Returns the byte length of a multibyte character.
*
- * Caution: when dealing with text that is not certainly valid in the
- * specified encoding, the result may exceed the actual remaining
- * string length. Callers that are not prepared to deal with that
- * should use pg_encoding_mblen_bounded() instead.
+ * Choose "mblen" functions based on the input string characteristics.
+ * pg_encoding_mblen() can be used when ANY of these conditions are met:
+ *
+ * - The input string is zero-terminated
+ *
+ * - The input string is known to be valid in the encoding (e.g., string
+ * converted from database encoding)
+ *
+ * - The encoding is not GB18030 (e.g., when only database encodings are
+ * passed to 'encoding' parameter)
+ *
+ * encoding==GB18030 requires examining up to two bytes to determine character
+ * length. Therefore, callers satisfying none of those conditions must use
+ * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
+ * guaranteed to be within allocation bounds.
+ *
+ * When dealing with text that is not certainly valid in the specified
+ * encoding, the result may exceed the actual remaining string length.
+ * Callers that are not prepared to deal with that should use Min(remaining,
+ * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
+ * pg_encoding_mblen_bounded() are interchangeable.
*/
int
pg_encoding_mblen(int encoding, const char *mbstr)
}
/*
- * Returns the byte length of a multibyte character; but not more than
- * the distance to end of string.
+ * Returns the byte length of a multibyte character (possibly not
+ * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
+ */
+int
+pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
+ size_t remaining)
+{
+ /*
+ * Define zero remaining as too few, even for single-byte encodings.
+ * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
+ * zero; others read one.
+ */
+ if (remaining < 1 ||
+ (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
+ return INT_MAX;
+ return pg_encoding_mblen(encoding, mbstr);
+}
+
+/*
+ * Returns the byte length of a multibyte character; but not more than the
+ * distance to the terminating zero byte. For input that might lack a
+ * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
*/
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
index 0e9e412a82e532377cc528f72a0fb213fdb66e8b..45ef9cbba70c9b8476d48573580561f10f959f87 100644 (file)
@@ -575,6 +575,8 @@ extern int pg_valid_server_encoding_id(int encoding);
*/
extern void pg_encoding_set_invalid(int encoding, char *dst);
extern int pg_encoding_mblen(int encoding, const char *mbstr);
+extern int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
+ size_t remaining);
extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr);
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
extern int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len);
index 73c3a0f9025713ac8a05ca225ea065a4dfb6e900..350a33f431c14f693c2dd881b7c7c2e150949a34 100644 (file)
}
/* Slow path for possible multibyte characters */
- charlen = pg_encoding_mblen(encoding, source);
+ charlen = pg_encoding_mblen_or_incomplete(encoding,
+ source, remaining);
if (remaining < charlen ||
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
@@ -4149,7 +4150,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
int charlen;
/* Slow path for possible multibyte characters */
- charlen = pg_encoding_mblen(conn->client_encoding, s);
+ charlen = pg_encoding_mblen_or_incomplete(conn->client_encoding,
+ s, remaining);
if (charlen > remaining)
{
index 128f056825d6669b2f8cdf7f95d4ba266becce63..b8e3ace7483d261442a6061dc67c22804dcd188c 100644 (file)
@@ -1173,13 +1173,9 @@ pqSocketPoll(int sock, int forRead, int forWrite, time_t end_time)
*/
/*
- * Returns the byte length of the character beginning at s, using the
- * specified encoding.
- *
- * Caution: when dealing with text that is not certainly valid in the
- * specified encoding, the result may exceed the actual remaining
- * string length. Callers that are not prepared to deal with that
- * should use PQmblenBounded() instead.
+ * Like pg_encoding_mblen(). Use this in callers that want the
+ * dynamically-linked libpq's stance on encodings, even if that means
+ * different behavior in different startups of the executable.
*/
int
PQmblen(const char *s, int encoding)
}
/*
- * Returns the byte length of the character beginning at s, using the
- * specified encoding; but not more than the distance to end of string.
+ * Like pg_encoding_mblen_bounded(). Use this in callers that want the
+ * dynamically-linked libpq's stance on encodings, even if that means
+ * different behavior in different startups of the executable.
*/
int
PQmblenBounded(const char *s, int encoding)
index 454eb557c55605fea526ee145d201b1519368088..dddb6d6d623c39595139e685df7dbcbfecb64e84 100644 (file)
#include <string.h>
#include <stdio.h>
+#include "common/jsonapi.h"
#include "fe_utils/psqlscan.h"
#include "fe_utils/string_utils.h"
#include "getopt_long.h"
}
+/*
+ * Confirm escaping doesn't read past the end of an allocation. Consider the
+ * result of malloc(4096), in the absence of freelist entries satisfying the
+ * allocation. On OpenBSD, reading one byte past the end of that object
+ * yields SIGSEGV.
+ *
+ * Run this test before the program's other tests, so freelists are minimal.
+ * len=4096 didn't SIGSEGV, likely due to free() calls in libpq. len=8192
+ * did. Use 128 KiB, to somewhat insulate the outcome from distant new free()
+ * calls and libc changes.
+ */
+static void
+test_gb18030_page_multiple(pe_test_config *tc)
+{
+ PQExpBuffer testname;
+ size_t input_len = 0x20000;
+ char *input;
+
+ /* prepare input */
+ input = pg_malloc(input_len);
+ memset(input, '-', input_len - 1);
+ input[input_len - 1] = 0xfe;
+
+ /* name to describe the test */
+ testname = createPQExpBuffer();
+ appendPQExpBuffer(testname, ">repeat(%c, %zu)", input[0], input_len - 1);
+ escapify(testname, input + input_len - 1, 1);
+ appendPQExpBuffer(testname, "< - GB18030 - PQescapeLiteral");
+
+ /* test itself */
+ PQsetClientEncoding(tc->conn, "GB18030");
+ report_result(tc, PQescapeLiteral(tc->conn, input, input_len) == NULL,
+ testname->data, "",
+ "input validity vs escape success", "ok");
+
+ destroyPQExpBuffer(testname);
+ pg_free(input);
+}
+
+/*
+ * Confirm json parsing doesn't read past the end of an allocation. This
+ * exercises wchar.c infrastructure like the true "escape" tests do, but this
+ * isn't an "escape" test.
+ */
+static void
+test_gb18030_json(pe_test_config *tc)
+{
+ PQExpBuffer raw_buf;
+ PQExpBuffer testname;
+ const char input[] = "{\"\\u\xFE";
+ size_t input_len = sizeof(input) - 1;
+ JsonLexContext *lex;
+ JsonSemAction sem = {0}; /* no callbacks */
+ JsonParseErrorType json_error;
+ char *error_str;
+
+ /* prepare input like test_one_vector_escape() does */
+ raw_buf = createPQExpBuffer();
+ appendBinaryPQExpBuffer(raw_buf, input, input_len);
+ appendPQExpBufferStr(raw_buf, NEVER_ACCESS_STR);
+ VALGRIND_MAKE_MEM_NOACCESS(&raw_buf->data[input_len],
+ raw_buf->len - input_len);
+
+ /* name to describe the test */
+ testname = createPQExpBuffer();
+ appendPQExpBuffer(testname, ">");
+ escapify(testname, input, input_len);
+ appendPQExpBuffer(testname, "< - GB18030 - pg_parse_json");
+
+ /* test itself */
+ lex = makeJsonLexContextCstringLen(raw_buf->data, input_len,
+ PG_GB18030, false);
+ json_error = pg_parse_json(lex, &sem);
+ error_str = psprintf("JsonParseErrorType %d", json_error);
+ report_result(tc, json_error == JSON_UNICODE_ESCAPE_FORMAT,
+ testname->data, "",
+ "diagnosed", error_str);
+
+ pfree(error_str);
+ pfree(lex);
+ destroyPQExpBuffer(testname);
+ destroyPQExpBuffer(raw_buf);
+}
+
+
static bool
escape_literal(PGconn *conn, PQExpBuffer target,
const char *unescaped, size_t unescaped_len,
* Testcases that are not null terminated for the specified input length.
* That's interesting to verify that escape functions don't read beyond
* the intended input length.
+ *
+ * One interesting special case is GB18030, which has the odd behaviour
+ * needing to read beyond the first byte to determine the length of a
+ * multi-byte character.
*/
TV_LEN("gbk", "\x80", 1),
+ TV_LEN("GB18030", "\x80", 1),
+ TV_LEN("GB18030", "\x800円", 2),
+ TV_LEN("GB18030", "\x80\x30", 2),
+ TV_LEN("GB18030", "\x80\x300円", 3),
+ TV_LEN("GB18030", "\x80\x30\x30", 3),
+ TV_LEN("GB18030", "\x80\x30\x300円", 4),
TV_LEN("UTF-8", "\xC3\xb6 ", 1),
TV_LEN("UTF-8", "\xC3\xb6 ", 2),
};
exit(1);
}
+ test_gb18030_page_multiple(&tc);
+ test_gb18030_json(&tc);
+
for (int i = 0; i < lengthof(pe_test_vectors); i++)
{
test_one_vector(&tc, &pe_test_vectors[i]);
index d785f92561e5bd40b479be8b504aeef545213049..7dd1ef6161f0627dfe563121779f292a271f6398 100644 (file)
('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'),
('\x666f6f84309c', 'incomplete char '),
('\x666f6f84309c0a', 'incomplete char, followed by newline '),
+ ('\x666f6f84', 'incomplete char at end'),
('\x666f6f84309c3800', 'invalid, NUL byte'),
('\x666f6f84309c0038', 'invalid, NUL byte');
--- Test GB18030 verification
-select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs;
+-- Test GB18030 verification. Round-trip through text so the backing of the
+-- bytea values is palloc, not shared_buffers. This lets Valgrind detect
+-- reads past the end.
+select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs;
description | inbytes | result | errorat | error
------------------------------------------------+--------------------+------------------+--------------+-------------------------------------------------------------------
valid, pure ASCII | \x666f6f | \x666f6f | |
@@ -520,9 +523,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from g
valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6f84309c38 | |
incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
+ incomplete char at end | \x666f6f84 | \x666f6f | \x84 | invalid byte sequence for encoding "GB18030": 0x84
invalid, NUL byte | \x666f6f84309c3800 | \x666f6f84309c38 | \x00 | invalid byte sequence for encoding "GB18030": 0x00
invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
-(8 rows)
+(9 rows)
-- Test conversions from GB18030
select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;
@@ -534,9 +538,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18
valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6fefa8aa | |
incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
+ incomplete char at end | \x666f6f84 | \x666f6f | \x84 | invalid byte sequence for encoding "GB18030": 0x84
invalid, NUL byte | \x666f6f84309c3800 | \x666f6fefa8aa | \x00 | invalid byte sequence for encoding "GB18030": 0x00
invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
-(8 rows)
+(9 rows)
--
-- ISO-8859-5
index b567a1a57219beb3616073aa27d0bae133167243..a80d62367a20ae7a4e8bf7d5e030b195392df1fd 100644 (file)
('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'),
('\x666f6f84309c', 'incomplete char '),
('\x666f6f84309c0a', 'incomplete char, followed by newline '),
+ ('\x666f6f84', 'incomplete char at end'),
('\x666f6f84309c3800', 'invalid, NUL byte'),
('\x666f6f84309c0038', 'invalid, NUL byte');
--- Test GB18030 verification
-select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs;
+-- Test GB18030 verification. Round-trip through text so the backing of the
+-- bytea values is palloc, not shared_buffers. This lets Valgrind detect
+-- reads past the end.
+select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs;
-- Test conversions from GB18030
select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;