GHC/IO/Encoding/UTF16.hs

{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE CPP
 , NoImplicitPrelude
 , BangPatterns
 , NondecreasingIndentation
 , MagicHash
 #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}

-----------------------------------------------------------------------------
-- |
-- Module : GHC.IO.Encoding.UTF16
-- Copyright : (c) The University of Glasgow, 2009
-- License : see libraries/base/LICENSE
-- 
-- Maintainer : libraries@haskell.org
-- Stability : internal
-- Portability : non-portable
--
-- UTF-16 Codecs for the IO library
--
-- Portions Copyright : (c) Tom Harper 2008-2009,
-- (c) Bryan O'Sullivan 2009,
-- (c) Duncan Coutts 2009
--
-----------------------------------------------------------------------------

module GHC.IO.Encoding.UTF16 (
 utf16, mkUTF16,
 utf16_decode,
 utf16_encode,

 utf16be, mkUTF16be,
 utf16be_decode,
 utf16be_encode,

 utf16le, mkUTF16le,
 utf16le_decode,
 utf16le_encode,
 ) where

import GHC.Base
import GHC.Real
import GHC.Num
-- import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
import Data.Maybe
import GHC.IORef

-- -----------------------------------------------------------------------------
-- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM

utf16 :: TextEncoding
utf16 = mkUTF16 ErrorOnCodingFailure

mkUTF16 :: CodingFailureMode -> TextEncoding
mkUTF16 cfm = TextEncoding { textEncodingName = "UTF-16",
 mkTextDecoder = utf16_DF cfm,
 mkTextEncoder = utf16_EF cfm }

utf16_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
utf16_DF cfm = do
 seen_bom <- newIORef Nothing
 return (BufferCodec {
 encode = utf16_decode seen_bom,
 recover = recoverDecode cfm,
 close = return (),
 getState = readIORef seen_bom,
 setState = writeIORef seen_bom
 })

utf16_EF :: CodingFailureMode -> IO (TextEncoder Bool)
utf16_EF cfm = do
 done_bom <- newIORef False
 return (BufferCodec {
 encode = utf16_encode done_bom,
 recover = recoverEncode cfm,
 close = return (),
 getState = readIORef done_bom,
 setState = writeIORef done_bom
 })

utf16_encode :: IORef Bool -> EncodeBuffer
utf16_encode done_bom input
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
 = do
 b <- readIORef done_bom
 if b then utf16_native_encode input output
 else if os - ow < 2
 then return (OutputUnderflow,input,output)
 else do
 writeIORef done_bom True
 writeWord8Buf oraw ow bom1
 writeWord8Buf oraw (ow+1) bom2
 utf16_native_encode input output{ bufR = ow+2 }

utf16_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
utf16_decode seen_bom
 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
 output
 = do
 mb <- readIORef seen_bom
 case mb of
 Just decode -> decode input output
 Nothing ->
 if iw - ir < 2 then return (InputUnderflow,input,output) else do
 c0 <- readWord8Buf iraw ir
 c1 <- readWord8Buf iraw (ir+1)
 case () of
 _ | c0 == bomB && c1 == bomL -> do
 writeIORef seen_bom (Just utf16be_decode)
 utf16be_decode input{ bufL= ir+2 } output
 | c0 == bomL && c1 == bomB -> do
 writeIORef seen_bom (Just utf16le_decode)
 utf16le_decode input{ bufL= ir+2 } output
 | otherwise -> do
 writeIORef seen_bom (Just utf16_native_decode)
 utf16_native_decode input output


bomB, bomL, bom1, bom2 :: Word8
bomB = 0xfe
bomL = 0xff

-- choose UTF-16BE by default for UTF-16 output
utf16_native_decode :: DecodeBuffer
utf16_native_decode = utf16be_decode

utf16_native_encode :: EncodeBuffer
utf16_native_encode = utf16be_encode

bom1 = bomB
bom2 = bomL

-- -----------------------------------------------------------------------------
-- UTF16LE and UTF16BE

utf16be :: TextEncoding
utf16be = mkUTF16be ErrorOnCodingFailure

mkUTF16be :: CodingFailureMode -> TextEncoding
mkUTF16be cfm = TextEncoding { textEncodingName = "UTF-16BE",
 mkTextDecoder = utf16be_DF cfm,
 mkTextEncoder = utf16be_EF cfm }

utf16be_DF :: CodingFailureMode -> IO (TextDecoder ())
utf16be_DF cfm =
 return (BufferCodec {
 encode = utf16be_decode,
 recover = recoverDecode cfm,
 close = return (),
 getState = return (),
 setState = const $ return ()
 })

utf16be_EF :: CodingFailureMode -> IO (TextEncoder ())
utf16be_EF cfm =
 return (BufferCodec {
 encode = utf16be_encode,
 recover = recoverEncode cfm,
 close = return (),
 getState = return (),
 setState = const $ return ()
 })

utf16le :: TextEncoding
utf16le = mkUTF16le ErrorOnCodingFailure

mkUTF16le :: CodingFailureMode -> TextEncoding
mkUTF16le cfm = TextEncoding { textEncodingName = "UTF16-LE",
 mkTextDecoder = utf16le_DF cfm,
 mkTextEncoder = utf16le_EF cfm }

utf16le_DF :: CodingFailureMode -> IO (TextDecoder ())
utf16le_DF cfm =
 return (BufferCodec {
 encode = utf16le_decode,
 recover = recoverDecode cfm,
 close = return (),
 getState = return (),
 setState = const $ return ()
 })

utf16le_EF :: CodingFailureMode -> IO (TextEncoder ())
utf16le_EF cfm =
 return (BufferCodec {
 encode = utf16le_encode,
 recover = recoverEncode cfm,
 close = return (),
 getState = return (),
 setState = const $ return ()
 })


utf16be_decode :: DecodeBuffer
utf16be_decode 
 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
 = let 
 loop !ir !ow
 | ow >= os = done OutputUnderflow ir ow
 | ir >= iw = done InputUnderflow ir ow
 | ir + 1 == iw = done InputUnderflow ir ow
 | otherwise = do
 c0 <- readWord8Buf iraw ir
 c1 <- readWord8Buf iraw (ir+1)
 let x1 = fromIntegral c0 `shiftL` 8 + fromIntegral c1
 if validate1 x1
 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
 loop (ir+2) ow'
 else if iw - ir < 4 then done InputUnderflow ir ow else do
 c2 <- readWord8Buf iraw (ir+2)
 c3 <- readWord8Buf iraw (ir+3)
 let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
 if not (validate2 x1 x2) then invalid else do
 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
 loop (ir+4) ow'
 where
 invalid = done InvalidSequence ir ow

 -- lambda-lifted, to avoid thunks being built in the inner-loop:
 done why !ir !ow = return (why,
 if ir == iw then input{ bufL=0, bufR=0 }
 else input{ bufL=ir },
 output{ bufR=ow })
 in
 loop ir0 ow0

utf16le_decode :: DecodeBuffer
utf16le_decode 
 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
 = let 
 loop !ir !ow
 | ow >= os = done OutputUnderflow ir ow
 | ir >= iw = done InputUnderflow ir ow
 | ir + 1 == iw = done InputUnderflow ir ow
 | otherwise = do
 c0 <- readWord8Buf iraw ir
 c1 <- readWord8Buf iraw (ir+1)
 let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
 if validate1 x1
 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
 loop (ir+2) ow'
 else if iw - ir < 4 then done InputUnderflow ir ow else do
 c2 <- readWord8Buf iraw (ir+2)
 c3 <- readWord8Buf iraw (ir+3)
 let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
 if not (validate2 x1 x2) then invalid else do
 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
 loop (ir+4) ow'
 where
 invalid = done InvalidSequence ir ow

 -- lambda-lifted, to avoid thunks being built in the inner-loop:
 done why !ir !ow = return (why,
 if ir == iw then input{ bufL=0, bufR=0 }
 else input{ bufL=ir },
 output{ bufR=ow })
 in
 loop ir0 ow0

utf16be_encode :: EncodeBuffer
utf16be_encode
 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
 = let 
 done why !ir !ow = return (why,
 if ir == iw then input{ bufL=0, bufR=0 }
 else input{ bufL=ir },
 output{ bufR=ow })
 loop !ir !ow
 | ir >= iw = done InputUnderflow ir ow
 | os - ow < 2 = done OutputUnderflow ir ow
 | otherwise = do
 (c,ir') <- readCharBuf iraw ir
 case ord c of
 x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
 writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
 writeWord8Buf oraw (ow+1) (fromIntegral x)
 loop ir' (ow+2)
 | otherwise -> do
 if os - ow < 4 then done OutputUnderflow ir ow else do
 let 
 n1 = x - 0x10000
 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
 c2 = fromIntegral (n1 `shiftR` 10)
 n2 = n1 .&. 0x3FF
 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
 c4 = fromIntegral n2
 --
 writeWord8Buf oraw ow c1
 writeWord8Buf oraw (ow+1) c2
 writeWord8Buf oraw (ow+2) c3
 writeWord8Buf oraw (ow+3) c4
 loop ir' (ow+4)
 in
 loop ir0 ow0

utf16le_encode :: EncodeBuffer
utf16le_encode
 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
 = let
 done why !ir !ow = return (why,
 if ir == iw then input{ bufL=0, bufR=0 }
 else input{ bufL=ir },
 output{ bufR=ow })
 loop !ir !ow
 | ir >= iw = done InputUnderflow ir ow
 | os - ow < 2 = done OutputUnderflow ir ow
 | otherwise = do
 (c,ir') <- readCharBuf iraw ir
 case ord c of
 x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
 writeWord8Buf oraw ow (fromIntegral x)
 writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
 loop ir' (ow+2)
 | otherwise ->
 if os - ow < 4 then done OutputUnderflow ir ow else do
 let 
 n1 = x - 0x10000
 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
 c2 = fromIntegral (n1 `shiftR` 10)
 n2 = n1 .&. 0x3FF
 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
 c4 = fromIntegral n2
 --
 writeWord8Buf oraw ow c2
 writeWord8Buf oraw (ow+1) c1
 writeWord8Buf oraw (ow+2) c4
 writeWord8Buf oraw (ow+3) c3
 loop ir' (ow+4)
 in
 loop ir0 ow0

chr2 :: Word16 -> Word16 -> Char
chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
 where
 !x# = word2Int# a#
 !y# = word2Int# b#
 !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
 !lower# = y# -# 0xDC00#
{-# INLINE chr2 #-}

validate1 :: Word16 -> Bool
validate1 x1 = (x1 >= 0 && x1 < 0xD800) || x1 > 0xDFFF
{-# INLINE validate1 #-}

validate2 :: Word16 -> Word16 -> Bool
validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
 x2 >= 0xDC00 && x2 <= 0xDFFF
{-# INLINE validate2 #-}

AltStyle によって変換されたページ (->オリジナル) /