From 5e8d670c313531c0dca245943fb84c94a477ddc4 Mon Sep 17 00:00:00 2001
From: Thomas Munro
Date: Sun, 2 Sep 2018 07:12:24 +1200
Subject: [PATCH] Add Greek characters to unaccent.rules.

Author: Tasos Maschalidis
Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/153495048900.1368.11566580687623014380%40wrigleys.postgresql.org
Discussion: https://postgr.es/m/VI1PR01MB38537EBD529FE5EE3FE9A5FEB5370%40VI1PR01MB3853.eurprd01.prod.exchangelabs.com
---
 contrib/unaccent/generate_unaccent_rules.py |  19 +-
 contrib/unaccent/unaccent.rules             | 221 ++++++++++++++++++++
 2 files changed, 236 insertions(+), 4 deletions(-)

diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index 4b1b011861f..859cac40fa1 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -29,6 +29,15 @@ import argparse
 import sys
 import xml.etree.ElementTree as ET
 
+# The ranges of Unicode characters that we consider to be "plain letters".
+# For now we are being conservative by including only Latin and Greek. This
+# could be extended in future based on feedback from people with relevant
+# language knowledge.
+PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case + (ord('A'), ord('Z')), # Latin upper case + (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA + (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA + def print_record(codepoint, letter): print (unichr(codepoint) + "\t" + letter).encode("UTF-8") @@ -39,9 +48,11 @@ class Codepoint: self.combining_ids = combining_ids def is_plain_letter(codepoint): - """Return true if codepoint represents a plain ASCII letter.""" - return (codepoint.id>= ord('a') and codepoint.id <= ord('z')) or \ - (codepoint.id>= ord('A') and codepoint.id <= ord('Z')) + """Return true if codepoint represents a "plain letter".""" + for begin, end in PLAIN_LETTER_RANGES: + if codepoint.id>= begin and codepoint.id <= end: + return True + return False def is_mark(codepoint): """Returns true for diacritical marks (combining codepoints).""" @@ -184,7 +195,7 @@ def main(args): len(codepoint.combining_ids)> 1: if is_letter_with_marks(codepoint, table): charactersSet.add((codepoint.id, - chr(get_plain_letter(codepoint, table).id))) + unichr(get_plain_letter(codepoint, table).id))) elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): charactersSet.add((codepoint.id, "".join(unichr(combining_codepoint.id) diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 97f9ed47cfa..76e4e69bebb 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -399,6 +399,26 @@ ʦ ts Êa ls Ê« lz +Ά Α +Έ Ε +Ή Η +Ί Ι +Ό Ο +Ύ Î\ +Ώ Ω +ΐ Î1 +Îa Ι +Ϋ Î\ +ά α +έ Îμ +ή η +Î ̄ Î1 +ΰ υ +ϊ Î1 +ϋ υ +ό ο +ύ υ +ώ ω Ё Е ё Ðμ á ́€ A @@ -709,6 +729,207 @@ á»1⁄2 v á»3⁄4 Y ỿ y +á1⁄4€ α +á1⁄4 α +á1⁄4‚ α +á1⁄4ƒ α +á1⁄4„ α +á1⁄4… α +á1⁄4† α +á1⁄4‡ α +á1⁄4ˆ Α +á1⁄4‰ Α +á1⁄4Š Α +á1⁄4‹ Α +á1⁄4Œ Α +á1⁄4 Α +á1⁄4Ž Α +á1⁄4 Α +á1⁄4 Îμ +á1⁄4‘ Îμ +á1⁄4’ Îμ +á1⁄4“ Îμ +á1⁄4” Îμ +á1⁄4• Îμ +á1⁄4˜ Ε +á1⁄4™ Ε +á1⁄4š Ε +á1⁄4› Ε +á1⁄4œ Ε +á1⁄4 Ε +á1⁄4 η +á1⁄4¡ η +á1⁄4¢ η +á1⁄4£ 
η +á1⁄4¤ η +á1⁄4\ η +á1⁄4¦ η +á1⁄4§ η +á1⁄4 ̈ Η +á1⁄4© Η +á1⁄4a Η +á1⁄4« Η +á1⁄4¬ Η +á1⁄4­ Η +á1⁄4® Η +á1⁄4 ̄ Η +á1⁄4° Î1 +á1⁄4± Î1 +á1⁄42 Î1 +á1⁄43 Î1 +á1⁄4 ́ Î1 +á1⁄4μ Î1 +á1⁄4¶ Î1 +á1⁄4· Î1 +á1⁄4 ̧ Ι +á1⁄41 Ι +á1⁄4o Ι +á1⁄4» Ι +á1⁄41⁄4 Ι +á1⁄41⁄2 Ι +á1⁄43⁄4 Ι +á1⁄4¿ Ι +á1⁄2€ ο +á1⁄2 ο +á1⁄2‚ ο +á1⁄2ƒ ο +á1⁄2„ ο +á1⁄2… ο +á1⁄2ˆ Ο +á1⁄2‰ Ο +á1⁄2Š Ο +á1⁄2‹ Ο +á1⁄2Œ Ο +á1⁄2 Ο +á1⁄2 υ +á1⁄2‘ υ +á1⁄2’ υ +á1⁄2“ υ +á1⁄2” υ +á1⁄2• υ +á1⁄2– υ +á1⁄2— υ +á1⁄2™ Î\ +á1⁄2› Î\ +á1⁄2 Î\ +á1⁄2Ÿ Î\ +á1⁄2 ω +á1⁄2¡ ω +á1⁄2¢ ω +á1⁄2£ ω +á1⁄2¤ ω +á1⁄2\ ω +á1⁄2¦ ω +á1⁄2§ ω +á1⁄2 ̈ Ω +á1⁄2© Ω +á1⁄2a Ω +á1⁄2« Ω +á1⁄2¬ Ω +á1⁄2­ Ω +á1⁄2® Ω +á1⁄2 ̄ Ω +á1⁄2° α +á1⁄22 Îμ +á1⁄2 ́ η +á1⁄2¶ Î1 +á1⁄2 ̧ ο +á1⁄2o υ +á1⁄21⁄4 ω +á3⁄4€ α +á3⁄4 α +á3⁄4‚ α +á3⁄4ƒ α +á3⁄4„ α +á3⁄4… α +á3⁄4† α +á3⁄4‡ α +á3⁄4ˆ Α +á3⁄4‰ Α +á3⁄4Š Α +á3⁄4‹ Α +á3⁄4Œ Α +á3⁄4 Α +á3⁄4Ž Α +á3⁄4 Α +á3⁄4 η +á3⁄4‘ η +á3⁄4’ η +á3⁄4“ η +á3⁄4” η +á3⁄4• η +á3⁄4– η +á3⁄4— η +á3⁄4˜ Η +á3⁄4™ Η +á3⁄4š Η +á3⁄4› Η +á3⁄4œ Η +á3⁄4 Η +á3⁄4ž Η +á3⁄4Ÿ Η +á3⁄4 ω +á3⁄4¡ ω +á3⁄4¢ ω +á3⁄4£ ω +á3⁄4¤ ω +á3⁄4\ ω +á3⁄4¦ ω +á3⁄4§ ω +á3⁄4 ̈ Ω +á3⁄4© Ω +á3⁄4a Ω +á3⁄4« Ω +á3⁄4¬ Ω +á3⁄4­ Ω +á3⁄4® Ω +á3⁄4 ̄ Ω +á3⁄4° α +á3⁄4± α +á3⁄42 α +á3⁄43 α +á3⁄4 ́ α +á3⁄4¶ α +á3⁄4· α +á3⁄4 ̧ Α +á3⁄41 Α +á3⁄4o Α +á3⁄41⁄4 Α +ῂ η +ῃ η +ῄ η +ῆ η +ῇ η +Ὲ Ε +Ὴ Η +ῌ Η +ῐ Î1 +ῑ Î1 +ῒ Î1 +ῖ Î1 +ῗ Î1 +Ῐ Ι +Ῑ Ι +Ὶ Ι +ῠυ +á¿¡ υ +á¿¢ υ +ῤ ρ +á¿\ ρ +ῦ υ +á¿§ υ +á¿ ̈ Î\ +á¿© Î\ +á¿a Î\ +Ῥ Ρ +á¿2 ω +á¿3 ω +á¿ ́ ω +á¿¶ ω +á¿· ω +á¿ ̧ Ο +á¿o Ω +á¿1⁄4 Ω ‐ - ‑ - ‒ - -- 2.39.5

AltStyle によって変換されたページ (->オリジナル) /