From ec0a69e49bf41a37b5c2d6f6be66d8abae00ee05 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: 2017年8月16日 16:51:56 -0400
Subject: [PATCH] Extend the default rules file for contrib/unaccent with Vietnamese letters.

Improve generate_unaccent_rules.py to handle composed characters whose base
is another composed character rather than a plain letter. The net effect
of this is to add a bunch of multi-accented Vietnamese characters to
unaccent.rules.

Original complaint from Kha Nguyen, diagnosis of the script's shortcoming
by Thomas Munro.

Dang Minh Huong and Michael Paquier

Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com
---
 contrib/unaccent/generate_unaccent_rules.py |  39 +++++--
 contrib/unaccent/unaccent.rules             | 114 ++++++++++++++++++++
 2 files changed, 145 insertions(+), 8 deletions(-)

diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index a5eb42f0b18..4b1b011861f 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -48,24 +48,47 @@ def is_mark(codepoint):
     return codepoint.general_category in ("Mn", "Me", "Mc")
 
 def is_letter_with_marks(codepoint, table):
-    """Returns true for plain letters combined with one or more marks."""
+    """Returns true for letters combined with one or more marks."""
     # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
-    return len(codepoint.combining_ids) > 1 and \
-        is_plain_letter(table[codepoint.combining_ids[0]]) and \
-        all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
+
+    # Letter may have no combining characters, in which case it has
+    # no marks.
+    if len(codepoint.combining_ids) == 1:
+        return False
+
+    # A letter without diacritical marks has none of them.
+    if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False:
+        return False
+
+    # Check if the base letter of this letter has marks.
+ codepoint_base = codepoint.combining_ids[0] + if (is_plain_letter(table[codepoint_base]) is False and \ + is_letter_with_marks(table[codepoint_base], table) is False): + return False + + return True def is_letter(codepoint, table): """Return true for letter with or without diacritical marks.""" return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table) def get_plain_letter(codepoint, table): - """Return the base codepoint without marks.""" + """Return the base codepoint without marks. If this codepoint has more + than one combining character, do a recursive lookup on the table to + find out its plain base letter.""" if is_letter_with_marks(codepoint, table): - return table[codepoint.combining_ids[0]] + if len(table[codepoint.combining_ids[0]].combining_ids)> 1: + return get_plain_letter(table[codepoint.combining_ids[0]], table) + elif is_plain_letter(table[codepoint.combining_ids[0]]): + return table[codepoint.combining_ids[0]] + + # Should not come here + assert(False) elif is_plain_letter(codepoint): return codepoint - else: - raise "mu" + + # Should not come here + assert(False) def is_ligature(codepoint, table): """Return true for letters combined with letters.""" diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 84886da587a..97f9ed47cfa 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -254,6 +254,18 @@ ǒ o Ǔ U ǔ u +Ǖ U +ǖ u +Ǘ U +ǘ u +Ǚ U +ǚ u +Ǜ U +ǜ u +Ǟ A +ǟ a +Ç A +Ç¡ a Ǥ G Ç\ g Ǧ G @@ -262,6 +274,8 @@ Ç© k Ça O Ç« o +Ǭ O +Ç­ o ǰ j DZ DZ Ç2 Dz @@ -270,6 +284,8 @@ Çμ g Ç ̧ N Ç1 n +Ço A +Ç» a Ȁ A ȁ a Ȃ A @@ -307,8 +323,14 @@ ȧ a È ̈ E È© e +Èa O +È« o +Ȭ O +È­ o È® O È ̄ o +Ȱ O +ȱ o È2 Y È3 y È ́ l @@ -441,6 +463,8 @@ á ̧… b á ̧† B á ̧‡ b +á ̧ˆ C +á ̧‰ c á ̧Š D á ̧‹ d á ̧Œ D @@ -451,10 +475,16 @@ á ̧‘ d á ̧’ D á ̧“ d +á ̧” E +á ̧• e +á ̧– E +á ̧— e á ̧˜ E á ̧™ e á ̧š E á ̧› e +á ̧œ E +á ̧ e á ̧ž F á ̧Ÿ f á ̧ G @@ -471,6 +501,8 @@ á ̧« h á ̧¬ I á ̧­ i +á ̧® I +á ̧ ̄ i 
á ̧° K á ̧± k á ̧2 K @@ -479,6 +511,8 @@ á ̧μ k á ̧¶ L á ̧· l +á ̧ ̧ L +á ̧1 l á ̧o L á ̧» l á ̧1⁄4 L @@ -497,6 +531,14 @@ á1‰ n á1Š N á1‹ n +á1Œ O +á1 o +á1Ž O +á1 o +á1 O +á1‘ o +á1’ O +á1“ o á1” P á1• p á1– P @@ -505,12 +547,20 @@ á1™ r á1š R á1› r +á1œ R +á1 r á1ž R á1Ÿ r á1 S á1¡ s á1¢ S á1£ s +á1¤ S +á1\ s +á1¦ S +á1§ s +á1 ̈ S +á1© s á1a T á1« t á1¬ T @@ -525,6 +575,10 @@ á1μ u á1¶ U á1· u +á1 ̧ U +á11 u +á1o U +á1» u á11⁄4 V á11⁄2 v á13⁄4 V @@ -563,12 +617,42 @@ áo¡ a áo¢ A áo£ a +áo¤ A +áo\ a +áo¦ A +áo§ a +áo ̈ A +áo© a +áoa A +áo« a +áo¬ A +áo­ a +áo® A +áo ̄ a +áo° A +áo± a +áo2 A +áo3 a +áo ́ A +áoμ a +áo¶ A +áo· a áo ̧ E áo1 e áoo E áo» e áo1⁄4 E áo1⁄2 e +áo3⁄4 E +áo¿ e +Ề E +ề e +Ể E +ể e +Ễ E +ễ e +Ệ E +ệ e Ỉ I ỉ i Ị I @@ -577,10 +661,40 @@ ọ o Ỏ O ỏ o +Ố O +ố o +Ồ O +ồ o +Ổ O +ổ o +Ỗ O +ỗ o +Ộ O +ộ o +Ớ O +ớ o +Ờ O +ờ o +Ở O +ở o +á» O +ỡ o +Ợ O +ợ o Ụ U á»\ u Ủ U á»§ u +á» ̈ U +ứ u +á»a U +ừ u +Ử U +á»­ u +á»® U +á» ̄ u +á»° U +á»± u á»2 Y á»3 y á» ́ Y -- 2.39.5

AltStyle によって変換されたページ (->オリジナル) /