GHC/IO/Encoding.hs

{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}

-----------------------------------------------------------------------------
-- |
-- Module : GHC.IO.Encoding
-- Copyright : (c) The University of Glasgow, 2008-2009
-- License : see libraries/base/LICENSE
-- 
-- Maintainer : libraries@haskell.org
-- Stability : internal
-- Portability : non-portable
--
-- Text codecs for I/O
--
-----------------------------------------------------------------------------

module GHC.IO.Encoding (
 BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
 latin1, latin1_encode, latin1_decode,
 utf8, utf8_bom,
 utf16, utf16le, utf16be,
 utf32, utf32le, utf32be, 
 localeEncoding, fileSystemEncoding, foreignEncoding,
 char8,
 mkTextEncoding,
 ) where

import GHC.Base
import GHC.IO.Exception
import GHC.IO.Buffer
import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
#if !defined(mingw32_HOST_OS)
import qualified GHC.IO.Encoding.Iconv as Iconv
#else
import qualified GHC.IO.Encoding.CodePage as CodePage
import Text.Read (reads)
#endif
import qualified GHC.IO.Encoding.Latin1 as Latin1
import qualified GHC.IO.Encoding.UTF8 as UTF8
import qualified GHC.IO.Encoding.UTF16 as UTF16
import qualified GHC.IO.Encoding.UTF32 as UTF32

import Data.List
import Data.Maybe

-- -----------------------------------------------------------------------------

-- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes
-- directly to the first 256 Unicode code points, and is thus not a
-- complete Unicode encoding. An attempt to write a character greater than
-- '255円' to a 'Handle' using the 'latin1' encoding will result in an error.
latin1 :: TextEncoding
latin1 = Latin1.latin1_checked

-- | The UTF-8 Unicode encoding
utf8 :: TextEncoding
utf8 = UTF8.utf8

-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
-- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8',
-- except that on input, the BOM sequence is ignored at the beginning
-- of the stream, and on output, the BOM sequence is prepended.
--
-- The byte-order-mark is strictly unnecessary in UTF-8, but is
-- sometimes used to identify the encoding of a file.
--
utf8_bom :: TextEncoding
utf8_bom = UTF8.utf8_bom

-- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf16 :: TextEncoding
utf16 = UTF16.utf16

-- | The UTF-16 Unicode encoding (litte-endian)
utf16le :: TextEncoding
utf16le = UTF16.utf16le

-- | The UTF-16 Unicode encoding (big-endian)
utf16be :: TextEncoding
utf16be = UTF16.utf16be

-- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf32 :: TextEncoding
utf32 = UTF32.utf32

-- | The UTF-32 Unicode encoding (litte-endian)
utf32le :: TextEncoding
utf32le = UTF32.utf32le

-- | The UTF-32 Unicode encoding (big-endian)
utf32be :: TextEncoding
utf32be = UTF32.utf32be

-- | The Unicode encoding of the current locale
localeEncoding :: TextEncoding

-- | The Unicode encoding of the current locale, but allowing arbitrary
-- undecodable bytes to be round-tripped through it.
--
-- This 'TextEncoding' is used to decode and encode command line arguments
-- and environment variables on non-Windows platforms.
--
-- On Windows, this encoding *should not* be used if possible because
-- the use of code pages is deprecated: Strings should be retrieved
-- via the "wide" W-family of UTF-16 APIs instead
fileSystemEncoding :: TextEncoding

-- | The Unicode encoding of the current locale, but where undecodable
-- bytes are replaced with their closest visual match. Used for
-- the 'CString' marshalling functions in "Foreign.C.String"
foreignEncoding :: TextEncoding

#if !defined(mingw32_HOST_OS)
localeEncoding = Iconv.localeEncoding
fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
#else
localeEncoding = CodePage.localeEncoding
fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
#endif

-- | An encoding in which Unicode code points are translated to bytes
-- by taking the code point modulo 256. When decoding, bytes are
-- translated directly into the equivalent code point.
--
-- This encoding never fails in either direction. However, encoding
-- discards information, so encode followed by decode is not the
-- identity.
char8 :: TextEncoding
char8 = Latin1.latin1

-- | Look up the named Unicode encoding. May fail with 
--
-- * 'isDoesNotExistError' if the encoding is unknown
--
-- The set of known encodings is system-dependent, but includes at least:
--
-- * @UTF-8@
--
-- * @UTF-16@, @UTF-16BE@, @UTF-16LE@
--
-- * @UTF-32@, @UTF-32BE@, @UTF-32LE@
--
-- On systems using GNU iconv (e.g. Linux), there is additional
-- notation for specifying how illegal characters are handled:
--
-- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause 
-- all illegal sequences on input to be ignored, and on output
-- will drop all code points that have no representation in the
-- target encoding.
--
-- * a suffix of @\/\/TRANSLIT@ will choose a replacement character
-- for illegal sequences or code points.
--
-- On Windows, you can access supported code pages with the prefix
-- @CP@; for example, @\"CP1250\"@.
--
mkTextEncoding :: String -> IO TextEncoding
mkTextEncoding e = case mb_coding_failure_mode of
 Nothing -> unknown_encoding
 Just cfm -> case enc of
 "UTF-8" -> return $ UTF8.mkUTF8 cfm
 "UTF-16" -> return $ UTF16.mkUTF16 cfm
 "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
 "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
 "UTF-32" -> return $ UTF32.mkUTF32 cfm
 "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
 "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
#if defined(mingw32_HOST_OS)
 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
 _ -> unknown_encoding
#else
 _ -> Iconv.mkIconvEncoding cfm enc
#endif
 where
 -- The only problem with actually documenting //IGNORE and //TRANSLIT as
 -- supported suffixes is that they are not necessarily supported with non-GNU iconv
 (enc, suffix) = span (/= '/') e
 mb_coding_failure_mode = case suffix of
 "" -> Just ErrorOnCodingFailure
 "//IGNORE" -> Just IgnoreCodingFailure
 "//TRANSLIT" -> Just TransliterateCodingFailure
 "//ROUNDTRIP" -> Just RoundtripFailure
 _ -> Nothing
 
 unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
 ("unknown encoding:" ++ e) Nothing Nothing)

latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8
--latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode

latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
--latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode