git.postgresql.org Git - postgresql.git/commitdiff

git projects / postgresql.git / commitdiff

author Jeff Davis <jdavis@postgresql.org>

Wed, 20 Mar 2024 16:40:57 +0000 (09:40 -0700)

committer Jeff Davis <jdavis@postgresql.org>

Wed, 20 Mar 2024 16:40:57 +0000 (09:40 -0700)

Inline basic UTF-8 functions.

Shows a measurable speedup when processing UTF-8 data, such as with
the new builtin collation provider.

Discussion: https://postgr.es/m/163f4e2190cdf67f67016044e503c5004547e5a9.camel@j-davis.com
Reviewed-by: Peter Eisentraut

src/common/wchar.c patch | blob | blame | history

src/include/mb/pg_wchar.h patch | blob | blame | history

diff --git a/src/common/wchar.c b/src/common/wchar.c

index a238c0106c6efcd7b1940d6f965e7e2cc4ee2c0a..76b7dfdfcb668704646bd03a5a8eb58424af3aa0 100644 (file)

--- a/src/common/wchar.c

+++ b/src/common/wchar.c

@@ -476,39 +476,6 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)

}

-/*

- * Map a Unicode code point to UTF-8. utf8string must have at least

- * unicode_utf8len(c) bytes available.

- */

-unsigned char *

-unicode_to_utf8(pg_wchar c, unsigned char *utf8string)

-{

- if (c <= 0x7F)

- {

- utf8string[0] = c;

- }

- else if (c <= 0x7FF)

- {

- utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);

- utf8string[1] = 0x80 | (c & 0x3F);

- }

- else if (c <= 0xFFFF)

- {

- utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);

- utf8string[1] = 0x80 | ((c >> 6) & 0x3F);

- utf8string[2] = 0x80 | (c & 0x3F);

- }

- else

- {

- utf8string[0] = 0xF0 | ((c >> 18) & 0x07);

- utf8string[1] = 0x80 | ((c >> 12) & 0x3F);

- utf8string[2] = 0x80 | ((c >> 6) & 0x3F);

- utf8string[3] = 0x80 | (c & 0x3F);

- }

- return utf8string;

-}

/*

* Trivial conversion from pg_wchar to UTF-8.

* caller should allocate enough space for "to"

@@ -670,34 +637,6 @@ ucs_wcwidth(pg_wchar ucs)

return 1;

}

-/*

- * Convert a UTF-8 character to a Unicode code point.

- * This is a one-character version of pg_utf2wchar_with_len.

- *

- * No error checks here, c must point to a long-enough string.

- */

-pg_wchar

-utf8_to_unicode(const unsigned char *c)

-{

- if ((*c & 0x80) == 0)

- return (pg_wchar) c[0];

- else if ((*c & 0xe0) == 0xc0)

- return (pg_wchar) (((c[0] & 0x1f) << 6) |

- (c[1] & 0x3f));

- else if ((*c & 0xf0) == 0xe0)

- return (pg_wchar) (((c[0] & 0x0f) << 12) |

- ((c[1] & 0x3f) << 6) |

- (c[2] & 0x3f));

- else if ((*c & 0xf8) == 0xf0)

- return (pg_wchar) (((c[0] & 0x07) << 18) |

- ((c[1] & 0x3f) << 12) |

- ((c[2] & 0x3f) << 6) |

- (c[3] & 0x3f));

- else

- /* that is an invalid code on purpose */

- return 0xffffffff;

-}

static int

pg_utf_dsplen(const unsigned char *s)

{

diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index 69a55b66f44d23ed1bf2d76e1224720556d10715..249cd18a35701f32a479de54d0d60ba895cbe369 100644 (file)

--- a/src/include/mb/pg_wchar.h

+++ b/src/include/mb/pg_wchar.h

@@ -555,6 +555,67 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)

return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);

}

+/*

+ * Convert a UTF-8 character to a Unicode code point.

+ * This is a one-character version of pg_utf2wchar_with_len.

+ *

+ * No error checks here, c must point to a long-enough string.

+ */

+static inline pg_wchar

+utf8_to_unicode(const unsigned char *c)

+{

+ if ((*c & 0x80) == 0)

+ return (pg_wchar) c[0];

+ else if ((*c & 0xe0) == 0xc0)

+ return (pg_wchar) (((c[0] & 0x1f) << 6) |

+ (c[1] & 0x3f));

+ else if ((*c & 0xf0) == 0xe0)

+ return (pg_wchar) (((c[0] & 0x0f) << 12) |

+ ((c[1] & 0x3f) << 6) |

+ (c[2] & 0x3f));

+ else if ((*c & 0xf8) == 0xf0)

+ return (pg_wchar) (((c[0] & 0x07) << 18) |

+ ((c[1] & 0x3f) << 12) |

+ ((c[2] & 0x3f) << 6) |

+ (c[3] & 0x3f));

+ else

+ /* that is an invalid code on purpose */

+ return 0xffffffff;

+}

+/*

+ * Map a Unicode code point to UTF-8. utf8string must have at least

+ * unicode_utf8len(c) bytes available.

+ */

+static inline unsigned char *

+unicode_to_utf8(pg_wchar c, unsigned char *utf8string)

+{

+ if (c <= 0x7F)

+ {

+ utf8string[0] = c;

+ }

+ else if (c <= 0x7FF)

+ {

+ utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);

+ utf8string[1] = 0x80 | (c & 0x3F);

+ }

+ else if (c <= 0xFFFF)

+ {

+ utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);

+ utf8string[1] = 0x80 | ((c >> 6) & 0x3F);

+ utf8string[2] = 0x80 | (c & 0x3F);

+ }

+ else

+ {

+ utf8string[0] = 0xF0 | ((c >> 18) & 0x07);

+ utf8string[1] = 0x80 | ((c >> 12) & 0x3F);

+ utf8string[2] = 0x80 | ((c >> 6) & 0x3F);

+ utf8string[3] = 0x80 | (c & 0x3F);

+ }

+ return utf8string;

+}

/*

* Number of bytes needed to represent the given char in UTF8.

This is the main PostgreSQL git repository.

RSS Atom