GHC/IO/Encoding/UTF32.hs

{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE NoImplicitPrelude
 , BangPatterns
 , NondecreasingIndentation
 , MagicHash
 #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}

-----------------------------------------------------------------------------
-- |
-- Module : GHC.IO.Encoding.UTF32
-- Copyright : (c) The University of Glasgow, 2009
-- License : see libraries/base/LICENSE
-- 
-- Maintainer : libraries@haskell.org
-- Stability : internal
-- Portability : non-portable
--
-- UTF-32 Codecs for the IO library
--
-- Portions Copyright : (c) Tom Harper 2008-2009,
-- (c) Bryan O'Sullivan 2009,
-- (c) Duncan Coutts 2009
--
-----------------------------------------------------------------------------

module GHC.IO.Encoding.UTF32 (
 utf32, mkUTF32,
 utf32_decode,
 utf32_encode,

 utf32be, mkUTF32be,
 utf32be_decode,
 utf32be_encode,

 utf32le, mkUTF32le,
 utf32le_decode,
 utf32le_encode,
 ) where

import GHC.Base
import GHC.Real
import GHC.Num
-- import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
import Data.Maybe
import GHC.IORef

-- -----------------------------------------------------------------------------
-- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM

utf32 :: TextEncoding
utf32 = mkUTF32 ErrorOnCodingFailure

-- | /Since: 4.4.0.0/
mkUTF32 :: CodingFailureMode -> TextEncoding
mkUTF32 cfm = TextEncoding { textEncodingName = "UTF-32",
 mkTextDecoder = utf32_DF cfm,
 mkTextEncoder = utf32_EF cfm }

utf32_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
utf32_DF cfm = do
 seen_bom <- newIORef Nothing
 return (BufferCodec {
 encode = utf32_decode seen_bom,
 recover = recoverDecode cfm,
 close = return (),
 getState = readIORef seen_bom,
 setState = writeIORef seen_bom
 })

utf32_EF :: CodingFailureMode -> IO (TextEncoder Bool)
utf32_EF cfm = do
 done_bom <- newIORef False
 return (BufferCodec {
 encode = utf32_encode done_bom,
 recover = recoverEncode cfm,
 close = return (),
 getState = readIORef done_bom,
 setState = writeIORef done_bom
 })

utf32_encode :: IORef Bool -> EncodeBuffer
utf32_encode done_bom input
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
 = do
 b <- readIORef done_bom
 if b then utf32_native_encode input output
 else if os - ow < 4
 then return (OutputUnderflow, input,output)
 else do
 writeIORef done_bom True
 writeWord8Buf oraw ow bom0
 writeWord8Buf oraw (ow+1) bom1
 writeWord8Buf oraw (ow+2) bom2
 writeWord8Buf oraw (ow+3) bom3
 utf32_native_encode input output{ bufR = ow+4 }

utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
utf32_decode seen_bom
 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
 output
 = do
 mb <- readIORef seen_bom
 case mb of
 Just decode -> decode input output
 Nothing ->
 if iw - ir < 4 then return (InputUnderflow, input,output) else do
 c0 <- readWord8Buf iraw ir
 c1 <- readWord8Buf iraw (ir+1)
 c2 <- readWord8Buf iraw (ir+2)
 c3 <- readWord8Buf iraw (ir+3)
 case () of
 _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do
 writeIORef seen_bom (Just utf32be_decode)
 utf32be_decode input{ bufL= ir+4 } output
 _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do
 writeIORef seen_bom (Just utf32le_decode)
 utf32le_decode input{ bufL= ir+4 } output
 | otherwise -> do
 writeIORef seen_bom (Just utf32_native_decode)
 utf32_native_decode input output


bom0, bom1, bom2, bom3 :: Word8
bom0 = 0
bom1 = 0
bom2 = 0xfe
bom3 = 0xff

-- choose UTF-32BE by default for UTF-32 output
utf32_native_decode :: DecodeBuffer
utf32_native_decode = utf32be_decode

utf32_native_encode :: EncodeBuffer
utf32_native_encode = utf32be_encode

-- -----------------------------------------------------------------------------
-- UTF32LE and UTF32BE

utf32be :: TextEncoding
utf32be = mkUTF32be ErrorOnCodingFailure

-- | /Since: 4.4.0.0/
mkUTF32be :: CodingFailureMode -> TextEncoding
mkUTF32be cfm = TextEncoding { textEncodingName = "UTF-32BE",
 mkTextDecoder = utf32be_DF cfm,
 mkTextEncoder = utf32be_EF cfm }

utf32be_DF :: CodingFailureMode -> IO (TextDecoder ())
utf32be_DF cfm =
 return (BufferCodec {
 encode = utf32be_decode,
 recover = recoverDecode cfm,
 close = return (),
 getState = return (),
 setState = const $ return ()
 })

utf32be_EF :: CodingFailureMode -> IO (TextEncoder ())
utf32be_EF cfm =
 return (BufferCodec {
 encode = utf32be_encode,
 recover = recoverEncode cfm,
 close = return (),
 getState = return (),
 setState = const $ return ()
 })


utf32le :: TextEncoding
utf32le = mkUTF32le ErrorOnCodingFailure

-- | /Since: 4.4.0.0/
mkUTF32le :: CodingFailureMode -> TextEncoding
mkUTF32le cfm = TextEncoding { textEncodingName = "UTF-32LE",
 mkTextDecoder = utf32le_DF cfm,
 mkTextEncoder = utf32le_EF cfm }

utf32le_DF :: CodingFailureMode -> IO (TextDecoder ())
utf32le_DF cfm =
 return (BufferCodec {
 encode = utf32le_decode,
 recover = recoverDecode cfm,
 close = return (),
 getState = return (),
 setState = const $ return ()
 })

utf32le_EF :: CodingFailureMode -> IO (TextEncoder ())
utf32le_EF cfm =
 return (BufferCodec {
 encode = utf32le_encode,
 recover = recoverEncode cfm,
 close = return (),
 getState = return (),
 setState = const $ return ()
 })


utf32be_decode :: DecodeBuffer
utf32be_decode 
 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
 = let 
 loop !ir !ow
 | ow >= os = done OutputUnderflow ir ow
 | iw - ir < 4 = done InputUnderflow ir ow
 | otherwise = do
 c0 <- readWord8Buf iraw ir
 c1 <- readWord8Buf iraw (ir+1)
 c2 <- readWord8Buf iraw (ir+2)
 c3 <- readWord8Buf iraw (ir+3)
 let x1 = chr4 c0 c1 c2 c3
 if not (validate x1) then invalid else do
 ow' <- writeCharBuf oraw ow x1
 loop (ir+4) ow'
 where
 invalid = done InvalidSequence ir ow

 -- lambda-lifted, to avoid thunks being built in the inner-loop:
 done why !ir !ow = return (why,
 if ir == iw then input{ bufL=0, bufR=0 }
 else input{ bufL=ir },
 output{ bufR=ow })
 in
 loop ir0 ow0

utf32le_decode :: DecodeBuffer
utf32le_decode 
 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
 = let 
 loop !ir !ow
 | ow >= os = done OutputUnderflow ir ow
 | iw - ir < 4 = done InputUnderflow ir ow
 | otherwise = do
 c0 <- readWord8Buf iraw ir
 c1 <- readWord8Buf iraw (ir+1)
 c2 <- readWord8Buf iraw (ir+2)
 c3 <- readWord8Buf iraw (ir+3)
 let x1 = chr4 c3 c2 c1 c0
 if not (validate x1) then invalid else do
 ow' <- writeCharBuf oraw ow x1
 loop (ir+4) ow'
 where
 invalid = done InvalidSequence ir ow

 -- lambda-lifted, to avoid thunks being built in the inner-loop:
 done why !ir !ow = return (why,
 if ir == iw then input{ bufL=0, bufR=0 }
 else input{ bufL=ir },
 output{ bufR=ow })
 in
 loop ir0 ow0

utf32be_encode :: EncodeBuffer
utf32be_encode
 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
 = let 
 done why !ir !ow = return (why,
 if ir == iw then input{ bufL=0, bufR=0 }
 else input{ bufL=ir },
 output{ bufR=ow })
 loop !ir !ow
 | ir >= iw = done InputUnderflow ir ow
 | os - ow < 4 = done OutputUnderflow ir ow
 | otherwise = do
 (c,ir') <- readCharBuf iraw ir
 if isSurrogate c then done InvalidSequence ir ow else do
 let (c0,c1,c2,c3) = ord4 c
 writeWord8Buf oraw ow c0
 writeWord8Buf oraw (ow+1) c1
 writeWord8Buf oraw (ow+2) c2
 writeWord8Buf oraw (ow+3) c3
 loop ir' (ow+4)
 in
 loop ir0 ow0

utf32le_encode :: EncodeBuffer
utf32le_encode
 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
 = let
 done why !ir !ow = return (why,
 if ir == iw then input{ bufL=0, bufR=0 }
 else input{ bufL=ir },
 output{ bufR=ow })
 loop !ir !ow
 | ir >= iw = done InputUnderflow ir ow
 | os - ow < 4 = done OutputUnderflow ir ow
 | otherwise = do
 (c,ir') <- readCharBuf iraw ir
 if isSurrogate c then done InvalidSequence ir ow else do
 let (c0,c1,c2,c3) = ord4 c
 writeWord8Buf oraw ow c3
 writeWord8Buf oraw (ow+1) c2
 writeWord8Buf oraw (ow+2) c1
 writeWord8Buf oraw (ow+3) c0
 loop ir' (ow+4)
 in
 loop ir0 ow0

chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
 C# (chr# (z1# +# z2# +# z3# +# z4#))
 where
 !y1# = word2Int# x1#
 !y2# = word2Int# x2#
 !y3# = word2Int# x3#
 !y4# = word2Int# x4#
 !z1# = uncheckedIShiftL# y1# 24#
 !z2# = uncheckedIShiftL# y2# 16#
 !z3# = uncheckedIShiftL# y3# 8#
 !z4# = y4#
{-# INLINE chr4 #-}

ord4 :: Char -> (Word8,Word8,Word8,Word8)
ord4 c = (fromIntegral (x `shiftR` 24), 
 fromIntegral (x `shiftR` 16), 
 fromIntegral (x `shiftR` 8),
 fromIntegral x)
 where
 x = ord c
{-# INLINE ord4 #-}


validate :: Char -> Bool
validate c = (x1 >= 0x0 && x1 < 0xD800) || (x1 > 0xDFFF && x1 <= 0x10FFFF)
 where x1 = ord c
{-# INLINE validate #-}

AltStyle によって変換されたページ (->オリジナル) /