{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE CPP, NoImplicitPrelude, StandaloneDeriving #-}
{-# OPTIONS_HADDOCK hide #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.Unicode
-- Copyright   :  (c) The University of Glasgow, 2003
-- License     :  see libraries/base/LICENSE
--
-- Maintainer  :  cvs-ghc@haskell.org
-- Stability   :  internal
-- Portability :  non-portable (GHC extensions)
--
-- Character predicates ('isLower', 'isUpper', etc.) and case conversions
-- ('toUpper', 'toLower', 'toTitle').  The Unicode-aware functions are
-- implemented on top of the @u_*@ C routines imported at the bottom of
-- this module; the ASCII-only predicates are plain comparisons.
--
-----------------------------------------------------------------------------

module GHC.Unicode
  ( GeneralCategory (..)
  , generalCategory
  , isAscii
  , isLatin1
  , isControl
  , isAsciiUpper
  , isAsciiLower
  , isPrint
  , isSpace
  , isUpper
  , isLower
  , isAlpha
  , isDigit
  , isOctDigit
  , isHexDigit
  , isAlphaNum
  , isPunctuation
  , isSymbol
  , toUpper
  , toLower
  , toTitle
  , wgencat
  ) where

import GHC.Base
import GHC.Char (chr)
import GHC.Real
import GHC.Enum (Enum (..), Bounded (..))
import GHC.Arr (Ix (..))
import GHC.Num

-- Data.Char.chr already imports this and we need to define a Show instance
-- for GeneralCategory
import GHC.Show (Show)

#include "HsBaseConfig.h"

-- | Unicode General Categories (column 2 of the UnicodeData table), listed
-- in the same order as in the Unicode standard (more precisely, the Unicode
-- Character Database).
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> :t OtherLetter
-- OtherLetter :: GeneralCategory
--
-- 'Eq' instance:
--
-- >>> UppercaseLetter == UppercaseLetter
-- True
-- >>> UppercaseLetter == LowercaseLetter
-- False
--
-- 'Ord' instance:
--
-- >>> NonSpacingMark <= MathSymbol
-- True
--
-- 'Enum' instance:
--
-- >>> enumFromTo ModifierLetter SpacingCombiningMark
-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
--
-- 'Read' instance:
--
-- >>> read "DashPunctuation" :: GeneralCategory
-- DashPunctuation
-- >>> read "17" :: GeneralCategory
-- *** Exception: Prelude.read: no parse
--
-- 'Show' instance:
--
-- >>> show EnclosingMark
-- "EnclosingMark"
--
-- 'Bounded' instance:
--
-- >>> minBound :: GeneralCategory
-- UppercaseLetter
-- >>> maxBound :: GeneralCategory
-- NotAssigned
--
-- 'Ix' instance:
--
-- >>> import Data.Ix ( index )
-- >>> index (OtherLetter,Control) FinalQuote
-- 12
-- >>> index (OtherLetter,Control) Format
-- *** Exception: Error in array index
--
data GeneralCategory
  = UppercaseLetter       -- ^ Lu: Letter, Uppercase
  | LowercaseLetter       -- ^ Ll: Letter, Lowercase
  | TitlecaseLetter       -- ^ Lt: Letter, Titlecase
  | ModifierLetter        -- ^ Lm: Letter, Modifier
  | OtherLetter           -- ^ Lo: Letter, Other
  | NonSpacingMark        -- ^ Mn: Mark, Non-Spacing
  | SpacingCombiningMark  -- ^ Mc: Mark, Spacing Combining
  | EnclosingMark         -- ^ Me: Mark, Enclosing
  | DecimalNumber         -- ^ Nd: Number, Decimal
  | LetterNumber          -- ^ Nl: Number, Letter
  | OtherNumber           -- ^ No: Number, Other
  | ConnectorPunctuation  -- ^ Pc: Punctuation, Connector
  | DashPunctuation       -- ^ Pd: Punctuation, Dash
  | OpenPunctuation       -- ^ Ps: Punctuation, Open
  | ClosePunctuation      -- ^ Pe: Punctuation, Close
  | InitialQuote          -- ^ Pi: Punctuation, Initial quote
  | FinalQuote            -- ^ Pf: Punctuation, Final quote
  | OtherPunctuation      -- ^ Po: Punctuation, Other
  | MathSymbol            -- ^ Sm: Symbol, Math
  | CurrencySymbol        -- ^ Sc: Symbol, Currency
  | ModifierSymbol        -- ^ Sk: Symbol, Modifier
  | OtherSymbol           -- ^ So: Symbol, Other
  | Space                 -- ^ Zs: Separator, Space
  | LineSeparator         -- ^ Zl: Separator, Line
  | ParagraphSeparator    -- ^ Zp: Separator, Paragraph
  | Control               -- ^ Cc: Other, Control
  | Format                -- ^ Cf: Other, Format
  | Surrogate             -- ^ Cs: Other, Surrogate
  | PrivateUse            -- ^ Co: Other, Private Use
  | NotAssigned           -- ^ Cn: Other, Not Assigned
  deriving (Show, Eq, Ord, Enum, Bounded, Ix)

-- | The Unicode general category of the character.  The result is obtained
-- by feeding the code returned by 'wgencat' to 'toEnum', so the constructors
-- of 'GeneralCategory' must stay in the same order as the categories are
-- presented in the Unicode standard.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> generalCategory 'a'
-- LowercaseLetter
-- >>> generalCategory 'A'
-- UppercaseLetter
-- >>> generalCategory '0'
-- DecimalNumber
-- >>> generalCategory '%'
-- OtherPunctuation
-- >>> generalCategory '♥'
-- OtherSymbol
-- >>> generalCategory '\31'
-- Control
-- >>> generalCategory ' '
-- Space
--
generalCategory :: Char -> GeneralCategory
generalCategory c = toEnum (wgencat (ord c))

-- | Selects the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.
isAscii :: Char -> Bool
isAscii c = c <= '\x7f'

-- | Selects the first 256 characters of the Unicode character set,
-- corresponding to the ISO 8859-1 (Latin-1) character set.
isLatin1 :: Char -> Bool
isLatin1 c = c < '\x100'

-- | Selects ASCII lower-case letters,
-- i.e. characters satisfying both 'isAscii' and 'isLower'.
isAsciiLower :: Char -> Bool
isAsciiLower c = 'a' <= c && c <= 'z'

-- | Selects ASCII upper-case letters,
-- i.e. characters satisfying both 'isAscii' and 'isUpper'.
isAsciiUpper :: Char -> Bool
isAsciiUpper c = 'A' <= c && c <= 'Z'

-- | Selects control characters, which are the non-printing characters of
-- the Latin-1 subset of Unicode.
isControl :: Char -> Bool

-- | Selects printable Unicode characters
-- (letters, numbers, marks, punctuation, symbols and spaces).
isPrint :: Char -> Bool

-- | Returns 'True' for any Unicode space character, and the control
-- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@.
isSpace :: Char -> Bool
-- isSpace includes non-breaking space.
-- The magic 0x377 isn't really that magical.  As of 2014, all the codepoints
-- at or below 0x377 have been assigned, so we shouldn't have to worry about
-- any new spaces appearing below there.  It would probably be best to
-- use branchless ||, but currently the eqLit transformation will undo that,
-- so we'll do it like this until there's a way around that.
isSpace c
  | code > 0x377 = cTrue (iswspace (ord c))
    -- Fast path: space, the run \t..\r (0x9..0xd), or non-breaking space.
    -- The subtraction is on Word, so code points below 0x9 wrap around to
    -- a huge value and fail the "<= 4" test.
  | otherwise    = code == 32 || code - 0x9 <= 4 || code == 0xa0
  where
    code = fromIntegral (ord c) :: Word

-- | Selects upper-case or title-case alphabetic Unicode characters (letters).
-- Title case is used by a small number of letter ligatures like the
-- single-character form of /Lj/.
isUpper :: Char -> Bool

-- | Selects lower-case alphabetic Unicode characters (letters).
isLower :: Char -> Bool

-- | Selects alphabetic Unicode characters (lower-case, upper-case and
-- title-case letters, plus letters of caseless scripts and modifiers letters).
-- This function is equivalent to 'Data.Char.isLetter'.
isAlpha :: Char -> Bool

-- | Selects alphabetic or numeric digit Unicode characters.
--
-- Note that numeric digits outside the ASCII range are selected by this
-- function but not by 'isDigit'.  Such digits may be part of identifiers
-- but are not used by the printer and reader to represent numbers.
isAlphaNum :: Char -> Bool

-- | Selects ASCII digits, i.e. @\'0\'@..@\'9\'@.
isDigit :: Char -> Bool
isDigit c = offset <= 9
  where
    offset = fromIntegral (ord c - ord '0') :: Word

-- We use a subtraction and an unsigned comparison instead of two signed
-- comparisons because it's usually faster and puts less strain on branch
-- prediction.  It likely also enables some CSE when combined with functions
-- that follow up with an actual conversion.

-- | Selects ASCII octal digits, i.e. @\'0\'@..@\'7\'@.
isOctDigit :: Char -> Bool
isOctDigit c = offset <= 7
  where
    offset = fromIntegral (ord c - ord '0') :: Word

-- | Selects ASCII hexadecimal digits,
-- i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@.
isHexDigit :: Char -> Bool
isHexDigit c = isDigit c || inRun 'A' || inRun 'a'
  where
    -- True when c is one of the six characters starting at 'start'; same
    -- unsigned-comparison trick as in 'isDigit'.
    inRun start = (fromIntegral (ord c - ord start) :: Word) <= 5

-- | Selects Unicode punctuation characters, including various kinds
-- of connectors, brackets and quotes.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'ConnectorPunctuation'
-- * 'DashPunctuation'
-- * 'OpenPunctuation'
-- * 'ClosePunctuation'
-- * 'InitialQuote'
-- * 'FinalQuote'
-- * 'OtherPunctuation'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard.  The same document defines what is
-- and is not a \"Punctuation\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isPunctuation 'a'
-- False
-- >>> isPunctuation '7'
-- False
-- >>> isPunctuation '♥'
-- False
-- >>> isPunctuation '"'
-- True
-- >>> isPunctuation '?'
-- True
-- >>> isPunctuation '—'
-- True
--
isPunctuation :: Char -> Bool
isPunctuation c = isPunct (generalCategory c)
  where
    isPunct ConnectorPunctuation = True
    isPunct DashPunctuation      = True
    isPunct OpenPunctuation      = True
    isPunct ClosePunctuation     = True
    isPunct InitialQuote         = True
    isPunct FinalQuote           = True
    isPunct OtherPunctuation     = True
    isPunct _                    = False

-- | Selects Unicode symbol characters, including mathematical and
-- currency symbols.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'MathSymbol'
-- * 'CurrencySymbol'
-- * 'ModifierSymbol'
-- * 'OtherSymbol'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard.  The same document defines what is
-- and is not a \"Symbol\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSymbol 'a'
-- False
-- >>> isSymbol '6'
-- False
-- >>> isSymbol '='
-- True
--
-- The definition of \"math symbol\" may be a little
-- counter-intuitive depending on one's background:
--
-- >>> isSymbol '+'
-- True
-- >>> isSymbol '-'
-- False
--
isSymbol :: Char -> Bool
isSymbol c = isSym (generalCategory c)
  where
    isSym MathSymbol     = True
    isSym CurrencySymbol = True
    isSym ModifierSymbol = True
    isSym OtherSymbol    = True
    isSym _              = False

-- | Convert a letter to the corresponding upper-case letter, if any.
-- Any other character is returned unchanged.
toUpper :: Char -> Char

-- | Convert a letter to the corresponding lower-case letter, if any.
-- Any other character is returned unchanged.
toLower :: Char -> Char

-- | Convert a letter to the corresponding title-case or upper-case
-- letter, if any.  (Title case differs from upper case only for a small
-- number of ligature letters.)
-- Any other character is returned unchanged.
toTitle :: Char -> Char

-- -----------------------------------------------------------------------------
-- Implementation with the supplied auto-generated Unicode character properties
-- table

-- Regardless of the O/S and Library, use the functions contained in WCsubst.c

-- Interpret the Int returned by one of the u_isw* routines: any non-zero
-- value counts as True.
cTrue :: Int -> Bool
cTrue r = r /= 0

isAlpha    c = cTrue (iswalpha (ord c))
isAlphaNum c = cTrue (iswalnum (ord c))
isControl  c = cTrue (iswcntrl (ord c))
isPrint    c = cTrue (iswprint (ord c))
isUpper    c = cTrue (iswupper (ord c))
isLower    c = cTrue (iswlower (ord c))

-- The conversions go through the character's code point and back.
toLower c = chr $ towlower $ ord c
toUpper c = chr $ towupper $ ord c
toTitle c = chr $ towtitle $ ord c

foreign import ccall unsafe "u_iswalpha"
  iswalpha :: Int -> Int

foreign import ccall unsafe "u_iswalnum"
  iswalnum :: Int -> Int

foreign import ccall unsafe "u_iswcntrl"
  iswcntrl :: Int -> Int

foreign import ccall unsafe "u_iswspace"
  iswspace :: Int -> Int

foreign import ccall unsafe "u_iswprint"
  iswprint :: Int -> Int

foreign import ccall unsafe "u_iswlower"
  iswlower :: Int -> Int

foreign import ccall unsafe "u_iswupper"
  iswupper :: Int -> Int

foreign import ccall unsafe "u_towlower"
  towlower :: Int -> Int

foreign import ccall unsafe "u_towupper"
  towupper :: Int -> Int

foreign import ccall unsafe "u_towtitle"
  towtitle :: Int -> Int

foreign import ccall unsafe "u_gencat"
  wgencat :: Int -> Int