{-# LANGUAGE Trustworthy #-}{-# LANGUAGE CPP, NoImplicitPrelude #-}{-# OPTIONS_GHC -funbox-strict-fields #-}------------------------------------------------------------------------------- |-- Module : GHC.IO.Encoding-- Copyright : (c) The University of Glasgow, 2008-2009-- License : see libraries/base/LICENSE---- Maintainer : libraries@haskell.org-- Stability : internal-- Portability : non-portable---- Text codecs for I/O-------------------------------------------------------------------------------moduleGHC.IO.Encoding(BufferCodec (..),TextEncoding (..),TextEncoder ,TextDecoder ,CodingProgress (..),latin1 ,latin1_encode ,latin1_decode ,utf8 ,utf8_bom ,utf16 ,utf16le ,utf16be ,utf32 ,utf32le ,utf32be ,initLocaleEncoding ,getLocaleEncoding ,getFileSystemEncoding ,getForeignEncoding ,setLocaleEncoding ,setFileSystemEncoding ,setForeignEncoding ,char8 ,mkTextEncoding ,argvEncoding )whereimportGHC.Base importGHC.IO.Exception importGHC.IO.Buffer importGHC.IO.Encoding.Failure importGHC.IO.Encoding.Types 
#if !defined(mingw32_HOST_OS)
importqualifiedGHC.IO.Encoding.Iconv asIconv
#else
importqualifiedGHC.IO.Encoding.CodePageasCodePageimportText.Read(reads)
#endif
importqualifiedGHC.IO.Encoding.Latin1 asLatin1importqualifiedGHC.IO.Encoding.UTF8 asUTF8importqualifiedGHC.IO.Encoding.UTF16 asUTF16importqualifiedGHC.IO.Encoding.UTF32 asUTF32importGHC.List importGHC.Word importData.IORef importData.Char (toUpper )importSystem.IO.Unsafe (unsafePerformIO )-- ------------------------------------------------------------------------------- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes-- directly to the first 256 Unicode code points, and is thus not a-- complete Unicode encoding. An attempt to write a character greater than-- @\'\255円\'@ to a 'System.IO.Handle' using the 'latin1' encoding will result in an-- error.latin1 ::TextEncoding latin1 :: TextEncoding
latin1 =TextEncoding
Latin1.latin1_checked -- | The UTF-8 Unicode encodingutf8 ::TextEncoding utf8 :: TextEncoding
utf8 =TextEncoding
UTF8.utf8 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte-- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8',-- except that on input, the BOM sequence is ignored at the beginning-- of the stream, and on output, the BOM sequence is prepended.---- The byte-order-mark is strictly unnecessary in UTF-8, but is-- sometimes used to identify the encoding of a file.--utf8_bom ::TextEncoding utf8_bom :: TextEncoding
utf8_bom =TextEncoding
UTF8.utf8_bom -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to-- indicate endianness).utf16 ::TextEncoding utf16 :: TextEncoding
utf16 =TextEncoding
UTF16.utf16 -- | The UTF-16 Unicode encoding (little-endian)utf16le ::TextEncoding utf16le :: TextEncoding
utf16le =TextEncoding
UTF16.utf16le -- | The UTF-16 Unicode encoding (big-endian)utf16be ::TextEncoding utf16be :: TextEncoding
utf16be =TextEncoding
UTF16.utf16be -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to-- indicate endianness).utf32 ::TextEncoding utf32 :: TextEncoding
utf32 =TextEncoding
UTF32.utf32 -- | The UTF-32 Unicode encoding (little-endian)utf32le ::TextEncoding utf32le :: TextEncoding
utf32le =TextEncoding
UTF32.utf32le -- | The UTF-32 Unicode encoding (big-endian)utf32be ::TextEncoding utf32be :: TextEncoding
utf32be =TextEncoding
UTF32.utf32be -- | The Unicode encoding of the current locale---- @since 4.5.0.0getLocaleEncoding ::IO TextEncoding {-# NOINLINEgetLocaleEncoding #-}-- | The encoding of the current locale, but allowing arbitrary-- undecodable bytes to be round-tripped through it.---- Do not expect the encoding to be Unicode-compatible: it could appear to be ASCII or anything else.---- This 'TextEncoding' is used to decode and encode command line arguments-- and environment variables on non-Windows platforms.---- On Windows, this encoding *should not* be used if possible because-- the use of code pages is deprecated: Strings should be retrieved-- via the "wide" W-family of UTF-16 APIs instead---- @since 4.5.0.0getFileSystemEncoding ::IO TextEncoding {-# NOINLINEgetFileSystemEncoding #-}-- | The Unicode encoding of the current locale, but where undecodable-- bytes are replaced with their closest visual match. Used for-- the 'Foreign.C.String.CString' marshalling functions in "Foreign.C.String"---- @since 4.5.0.0getForeignEncoding ::IO TextEncoding {-# NOINLINEgetForeignEncoding #-}-- | Set locale encoding for your program. The locale affects-- how 'Char's are encoded and decoded when serialized to bytes: e. g.,-- when you read or write files ('System.IO.readFile'', 'System.IO.writeFile')-- or use standard input/output ('System.IO.getLine', 'System.IO.putStrLn').-- For instance, if your program prints non-ASCII characters, it is prudent to execute---- > setLocaleEncoding utf8---- This is necessary, but not enough on Windows, where console is-- a stateful device, which needs to be configured using-- @System.Win32.Console.setConsoleOutputCP@ and restored back afterwards.-- These intricacies are covered by-- <https://hackage.haskell.org/package/code-page code-page> package,-- which offers a crossplatform @System.IO.CodePage.withCodePage@ bracket.---- Wrong locale encoding typically causes error messages like-- "invalid argument (cannot decode byte sequence starting from ...)"-- or "invalid argument (cannot encode character ...)".---- @since 4.5.0.0setLocaleEncoding ::TextEncoding ->IO (){-# NOINLINEsetLocaleEncoding #-}-- | @since 4.5.0.0setFileSystemEncoding ::TextEncoding ->IO (){-# NOINLINEsetFileSystemEncoding #-}-- | @since 4.5.0.0setForeignEncoding ::TextEncoding ->IO (){-# NOINLINEsetForeignEncoding #-}(IO TextEncoding
getLocaleEncoding ,TextEncoding -> IO ()
setLocaleEncoding )=TextEncoding -> (IO TextEncoding, TextEncoding -> IO ())
forall a. a -> (IO a, a -> IO ())
mkGlobal TextEncoding
initLocaleEncoding (IO TextEncoding
getFileSystemEncoding ,TextEncoding -> IO ()
setFileSystemEncoding )=TextEncoding -> (IO TextEncoding, TextEncoding -> IO ())
forall a. a -> (IO a, a -> IO ())
mkGlobal TextEncoding
initFileSystemEncoding (IO TextEncoding
getForeignEncoding ,TextEncoding -> IO ()
setForeignEncoding )=TextEncoding -> (IO TextEncoding, TextEncoding -> IO ())
forall a. a -> (IO a, a -> IO ())
mkGlobal TextEncoding
initForeignEncoding mkGlobal ::a ->(IO a ,a ->IO ())mkGlobal :: forall a. a -> (IO a, a -> IO ())
mkGlobal a
x =IO (IO a, a -> IO ()) -> (IO a, a -> IO ())
forall a. IO a -> a
unsafePerformIO (IO (IO a, a -> IO ()) -> (IO a, a -> IO ()))
-> IO (IO a, a -> IO ()) -> (IO a, a -> IO ())
forall a b. (a -> b) -> a -> b
$ doIORef a
x_ref <-a -> IO (IORef a)
forall a. a -> IO (IORef a)
newIORef a
x (IO a, a -> IO ()) -> IO (IO a, a -> IO ())
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (IORef a -> IO a
forall a. IORef a -> IO a
readIORef IORef a
x_ref ,IORef a -> a -> IO ()
forall a. IORef a -> a -> IO ()
writeIORef IORef a
x_ref ){-# NOINLINEmkGlobal #-}-- | @since 4.5.0.0initLocaleEncoding ,initFileSystemEncoding ,initForeignEncoding ::TextEncoding {-# NOINLINEinitLocaleEncoding #-}-- N.B. initLocaleEncoding is exported for use in System.IO.localeEncoding.-- NOINLINE ensures that this result is shared.
#if !defined(mingw32_HOST_OS)
-- It is rather important that we don't just call Iconv.mkIconvEncoding here-- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode-- lone surrogates without complaint.---- By going through our Haskell implementations of those encodings, we are-- guaranteed to catch such errors.---- FIXME: this is not a complete solution because if the locale encoding is one-- which we don't have a Haskell-side decoder for, iconv might still ignore the-- lone surrogate in the input.initLocaleEncoding :: TextEncoding
initLocaleEncoding =IO TextEncoding -> TextEncoding
forall a. IO a -> a
unsafePerformIO (IO TextEncoding -> TextEncoding)
-> IO TextEncoding -> TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> String -> IO TextEncoding
mkTextEncoding' CodingFailureMode
ErrorOnCodingFailure String
Iconv.localeEncodingName initFileSystemEncoding :: TextEncoding
initFileSystemEncoding =IO TextEncoding -> TextEncoding
forall a. IO a -> a
unsafePerformIO (IO TextEncoding -> TextEncoding)
-> IO TextEncoding -> TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> String -> IO TextEncoding
mkTextEncoding' CodingFailureMode
RoundtripFailure String
Iconv.localeEncodingName initForeignEncoding :: TextEncoding
initForeignEncoding =IO TextEncoding -> TextEncoding
forall a. IO a -> a
unsafePerformIO (IO TextEncoding -> TextEncoding)
-> IO TextEncoding -> TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> String -> IO TextEncoding
mkTextEncoding' CodingFailureMode
IgnoreCodingFailure String
Iconv.localeEncodingName 
#else
initLocaleEncoding=CodePage.localeEncodinginitFileSystemEncoding=CodePage.mkLocaleEncodingRoundtripFailureinitForeignEncoding=CodePage.mkLocaleEncodingIgnoreCodingFailure
#endif
-- See Note [Windows Unicode Arguments] in rts/RtsFlags.c-- On Windows we assume hs_init argv is in utf8 encoding.-- | Internal encoding of argvargvEncoding ::IO TextEncoding 
#if defined(mingw32_HOST_OS)
argvEncoding=returnutf8
#else
argvEncoding :: IO TextEncoding
argvEncoding =IO TextEncoding
getFileSystemEncoding 
#endif
-- | An encoding in which Unicode code points are translated to bytes-- by taking the code point modulo 256. When decoding, bytes are-- translated directly into the equivalent code point.---- This encoding never fails in either direction. However, encoding-- discards information, so encode followed by decode is not the-- identity.---- @since 4.4.0.0char8 ::TextEncoding char8 :: TextEncoding
char8 =TextEncoding
Latin1.latin1 -- | Look up the named Unicode encoding. May fail with---- * 'System.IO.Error.isDoesNotExistError' if the encoding is unknown---- The set of known encodings is system-dependent, but includes at least:---- * @UTF-8@---- * @UTF-16@, @UTF-16BE@, @UTF-16LE@---- * @UTF-32@, @UTF-32BE@, @UTF-32LE@---- There is additional notation (borrowed from GNU iconv) for specifying-- how illegal characters are handled:---- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause-- all illegal sequences on input to be ignored, and on output-- will drop all code points that have no representation in the-- target encoding.---- * a suffix of @\/\/TRANSLIT@ will choose a replacement character-- for illegal sequences or code points.---- * a suffix of @\/\/ROUNDTRIP@ will use a PEP383-style escape mechanism-- to represent any invalid bytes in the input as Unicode codepoints (specifically,-- as lone surrogates, which are normally invalid in UTF-32).-- Upon output, these special codepoints are detected and turned back into the-- corresponding original byte.---- In theory, this mechanism allows arbitrary data to be roundtripped via-- a 'String' with no loss of data. In practice, there are two limitations-- to be aware of:---- 1. This only stands a chance of working for an encoding which is an ASCII-- superset, as for security reasons we refuse to escape any bytes smaller-- than 128. Many encodings of interest are ASCII supersets (in particular,-- you can assume that the locale encoding is an ASCII superset) but many-- (such as UTF-16) are not.---- 2. If the underlying encoding is not itself roundtrippable, this mechanism-- can fail. Roundtrippable encodings are those which have an injective mapping-- into Unicode. Almost all encodings meet this criteria, but some do not. Notably,-- Shift-JIS (CP932) and Big5 contain several different encodings of the same-- Unicode codepoint.---- On Windows, you can access supported code pages with the prefix-- @CP@; for example, @\"CP1250\"@.--mkTextEncoding ::String ->IO TextEncoding mkTextEncoding :: String -> IO TextEncoding
mkTextEncoding String
e =caseMaybe CodingFailureMode
mb_coding_failure_mode ofMaybe CodingFailureMode
Nothing ->String -> IO TextEncoding
forall a. String -> IO a
unknownEncodingErr String
e Just CodingFailureMode
cfm ->CodingFailureMode -> String -> IO TextEncoding
mkTextEncoding' CodingFailureMode
cfm String
enc where(String
enc ,String
suffix )=(Char -> Bool) -> String -> (String, String)
forall a. (a -> Bool) -> [a] -> ([a], [a])
span (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'/')String
e mb_coding_failure_mode :: Maybe CodingFailureMode
mb_coding_failure_mode =caseString
suffix ofString
""->CodingFailureMode -> Maybe CodingFailureMode
forall a. a -> Maybe a
Just CodingFailureMode
ErrorOnCodingFailure String
"//IGNORE"->CodingFailureMode -> Maybe CodingFailureMode
forall a. a -> Maybe a
Just CodingFailureMode
IgnoreCodingFailure String
"//TRANSLIT"->CodingFailureMode -> Maybe CodingFailureMode
forall a. a -> Maybe a
Just CodingFailureMode
TransliterateCodingFailure String
"//ROUNDTRIP"->CodingFailureMode -> Maybe CodingFailureMode
forall a. a -> Maybe a
Just CodingFailureMode
RoundtripFailure String
_->Maybe CodingFailureMode
forall a. Maybe a
Nothing mkTextEncoding' ::CodingFailureMode ->String ->IO TextEncoding mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
mkTextEncoding' CodingFailureMode
cfm String
enc =case[Char -> Char
toUpper Char
c |Char
c <-String
enc ,Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'-']of-- UTF-8 and friends we can handle ourselvesString
"UTF8"->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (TextEncoding -> IO TextEncoding)
-> TextEncoding -> IO TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> TextEncoding
UTF8.mkUTF8 CodingFailureMode
cfm String
"UTF16"->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (TextEncoding -> IO TextEncoding)
-> TextEncoding -> IO TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> TextEncoding
UTF16.mkUTF16 CodingFailureMode
cfm String
"UTF16LE"->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (TextEncoding -> IO TextEncoding)
-> TextEncoding -> IO TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> TextEncoding
UTF16.mkUTF16le CodingFailureMode
cfm String
"UTF16BE"->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (TextEncoding -> IO TextEncoding)
-> TextEncoding -> IO TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> TextEncoding
UTF16.mkUTF16be CodingFailureMode
cfm String
"UTF32"->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (TextEncoding -> IO TextEncoding)
-> TextEncoding -> IO TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> TextEncoding
UTF32.mkUTF32 CodingFailureMode
cfm String
"UTF32LE"->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (TextEncoding -> IO TextEncoding)
-> TextEncoding -> IO TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> TextEncoding
UTF32.mkUTF32le CodingFailureMode
cfm String
"UTF32BE"->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (TextEncoding -> IO TextEncoding)
-> TextEncoding -> IO TextEncoding
forall a b. (a -> b) -> a -> b
$ CodingFailureMode -> TextEncoding
UTF32.mkUTF32be CodingFailureMode
cfm -- On AIX, we want to avoid iconv, because it is either-- a) totally broken, or b) non-reentrant, or c) actually works.-- Detecting b) is difficult as you'd have to trigger the reentrancy-- corruption.-- Therefore, on AIX, we handle the popular ASCII and latin1 encodings-- ourselves. For consistency, we do the same on other platforms.-- We use `mkLatin1_checked` instead of `mkLatin1`, since the latter-- completely ignores the CodingFailureMode (TEST=encoding005).String
_|Bool
isAscii ->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (CodingFailureMode -> TextEncoding
Latin1.mkAscii CodingFailureMode
cfm )String
_|Bool
isLatin1 ->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (CodingFailureMode -> TextEncoding
Latin1.mkLatin1_checked CodingFailureMode
cfm )
#if defined(mingw32_HOST_OS)
'C':'P':n|[(cp,"")]<-readsn->return$CodePage.mkCodePageEncodingcfmcp_->unknownEncodingErr(enc++codingFailureModeSuffixcfm)
#else
-- Otherwise, handle other encoding needs via iconv.-- Unfortunately there is no good way to determine whether iconv is actually-- functional without telling it to do something.String
_->doMaybe TextEncoding
res <-CodingFailureMode -> String -> IO (Maybe TextEncoding)
Iconv.mkIconvEncoding CodingFailureMode
cfm String
enc caseMaybe TextEncoding
res ofJust TextEncoding
e ->TextEncoding -> IO TextEncoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return TextEncoding
e Maybe TextEncoding
Nothing ->String -> IO TextEncoding
forall a. String -> IO a
unknownEncodingErr (String
enc String -> String -> String
forall a. [a] -> [a] -> [a]
++ CodingFailureMode -> String
codingFailureModeSuffix CodingFailureMode
cfm )
#endif
whereisAscii :: Bool
isAscii =String
enc String -> [String] -> Bool
forall a. Eq a => a -> [a] -> Bool
`elem` [String]
asciiEncNames isLatin1 :: Bool
isLatin1 =String
enc String -> [String] -> Bool
forall a. Eq a => a -> [a] -> Bool
`elem` [String]
latin1EncNames asciiEncNames :: [String]
asciiEncNames =-- ASCII aliases specified by RFC 1345 and RFC 3808.[String
"ANSI_X3.4-1968",String
"iso-ir-6",String
"ANSI_X3.4-1986",String
"ISO_646.irv:1991",String
"US-ASCII",String
"us",String
"IBM367",String
"cp367",String
"csASCII",String
"ASCII",String
"ISO646-US"]latin1EncNames :: [String]
latin1EncNames =-- latin1 aliases specified by RFC 1345 and RFC 3808.[String
"ISO_8859-1:1987",String
"iso-ir-100",String
"ISO_8859-1",String
"ISO-8859-1",String
"latin1",String
"l1",String
"IBM819",String
"CP819",String
"csISOLatin1"]latin1_encode ::CharBuffer ->Buffer Word8 ->IO (CharBuffer ,Buffer Word8 )latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
latin1_encode CharBuffer
input Buffer Word8
output =((CodingProgress, CharBuffer, Buffer Word8)
 -> (CharBuffer, Buffer Word8))
-> IO (CodingProgress, CharBuffer, Buffer Word8)
-> IO (CharBuffer, Buffer Word8)
forall a b. (a -> b) -> IO a -> IO b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\(CodingProgress
_why ,CharBuffer
input' ,Buffer Word8
output' )->(CharBuffer
input' ,Buffer Word8
output' ))(IO (CodingProgress, CharBuffer, Buffer Word8)
 -> IO (CharBuffer, Buffer Word8))
-> IO (CodingProgress, CharBuffer, Buffer Word8)
-> IO (CharBuffer, Buffer Word8)
forall a b. (a -> b) -> a -> b
$ EncodeBuffer
Latin1.latin1_encode CharBuffer
input Buffer Word8
output -- unchecked, used for char8--latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encodelatin1_decode ::Buffer Word8 ->CharBuffer ->IO (Buffer Word8 ,CharBuffer )latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
latin1_decode Buffer Word8
input CharBuffer
output =((CodingProgress, Buffer Word8, CharBuffer)
 -> (Buffer Word8, CharBuffer))
-> IO (CodingProgress, Buffer Word8, CharBuffer)
-> IO (Buffer Word8, CharBuffer)
forall a b. (a -> b) -> IO a -> IO b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\(CodingProgress
_why ,Buffer Word8
input' ,CharBuffer
output' )->(Buffer Word8
input' ,CharBuffer
output' ))(IO (CodingProgress, Buffer Word8, CharBuffer)
 -> IO (Buffer Word8, CharBuffer))
-> IO (CodingProgress, Buffer Word8, CharBuffer)
-> IO (Buffer Word8, CharBuffer)
forall a b. (a -> b) -> a -> b
$ DecodeBuffer
Latin1.latin1_decode Buffer Word8
input CharBuffer
output --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encodeunknownEncodingErr ::String ->IO a unknownEncodingErr :: forall a. String -> IO a
unknownEncodingErr String
e =IOException -> IO a
forall a. IOException -> IO a
ioException (Maybe Handle
-> IOErrorType
-> String
-> String
-> Maybe CInt
-> Maybe String
-> IOException
IOError Maybe Handle
forall a. Maybe a
Nothing IOErrorType
NoSuchThing String
"mkTextEncoding"(String
"unknown encoding:"String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
e )Maybe CInt
forall a. Maybe a
Nothing Maybe String
forall a. Maybe a
Nothing )