{-# LANGUAGE Trustworthy #-}{-# LANGUAGE NoImplicitPrelude
 , BangPatterns
 , NondecreasingIndentation
 , MagicHash
 #-}{-# OPTIONS_GHC -funbox-strict-fields #-}------------------------------------------------------------------------------- |-- Module : GHC.IO.Encoding.UTF8-- Copyright : (c) The University of Glasgow, 2009-- License : see libraries/base/LICENSE-- -- Maintainer : libraries@haskell.org-- Stability : internal-- Portability : non-portable---- UTF-8 Codec for the IO library---- Portions Copyright : (c) Tom Harper 2008-2009,-- (c) Bryan O'Sullivan 2009,-- (c) Duncan Coutts 2009-------------------------------------------------------------------------------moduleGHC.IO.Encoding.UTF8(utf8 ,mkUTF8 ,utf8_bom ,mkUTF8_bom )whereimportGHC.Base importGHC.Real importGHC.Num importGHC.IORef -- import GHC.IOimportGHC.IO.Buffer importGHC.IO.Encoding.Failure importGHC.IO.Encoding.Types importGHC.Word importData.Bits utf8::TextEncoding utf8 =mkUTF8 ErrorOnCodingFailure -- | @since 4.4.0.0mkUTF8::CodingFailureMode ->TextEncoding mkUTF8 cfm =TextEncoding {textEncodingName="UTF-8",mkTextDecoder=utf8_DF cfm ,mkTextEncoder=utf8_EF cfm }utf8_DF::CodingFailureMode ->IO(TextDecoder ())utf8_DF cfm =return (BufferCodec {encode=utf8_decode ,recover=recoverDecode cfm ,close=return (),getState=return (),setState=const $ return ()})utf8_EF::CodingFailureMode ->IO(TextEncoder ())utf8_EF cfm =return (BufferCodec {encode=utf8_encode ,recover=recoverEncode cfm ,close=return (),getState=return (),setState=const $ return ()})utf8_bom::TextEncoding utf8_bom =mkUTF8_bom ErrorOnCodingFailure mkUTF8_bom::CodingFailureMode ->TextEncoding mkUTF8_bom cfm =TextEncoding {textEncodingName="UTF-8BOM",mkTextDecoder=utf8_bom_DF cfm ,mkTextEncoder=utf8_bom_EF cfm }utf8_bom_DF::CodingFailureMode ->IO(TextDecoder Bool)utf8_bom_DF cfm =doref <-newIORef Truereturn (BufferCodec {encode=utf8_bom_decode ref ,recover=recoverDecode cfm ,close=return (),getState=readIORef ref ,setState=writeIORef ref })utf8_bom_EF::CodingFailureMode ->IO(TextEncoder Bool)utf8_bom_EF cfm =doref <-newIORef Truereturn (BufferCodec {encode=utf8_bom_encode ref ,recover=recoverEncode cfm ,close=return (),getState=readIORef ref ,setState=writeIORef ref })utf8_bom_decode::IORef Bool->DecodeBuffer utf8_bom_decode ref input @Buffer {bufRaw=iraw ,bufL=ir ,bufR=iw ,bufSize=_}output =dofirst <-readIORef ref ifnotfirst thenutf8_decode input output elsedoletno_bom =dowriteIORef ref False;utf8_decode input output ifiw -ir <1thenreturn (InputUnderflow ,input ,output )elsedoc0 <-readWord8Buf iraw ir if(c0 /=bom0 )thenno_bom elsedoifiw -ir <2thenreturn (InputUnderflow ,input ,output )elsedoc1 <-readWord8Buf iraw (ir + 1)if(c1 /=bom1 )thenno_bom elsedoifiw -ir <3thenreturn (InputUnderflow ,input ,output )elsedoc2 <-readWord8Buf iraw (ir + 2)if(c2 /=bom2 )thenno_bom elsedo-- found a BOM, ignore it and carry onwriteIORef ref Falseutf8_decode input {bufL=ir + 3}output utf8_bom_encode::IORef Bool->EncodeBuffer utf8_bom_encode ref input output @Buffer {bufRaw=oraw ,bufL=_,bufR=ow ,bufSize=os }=dob <-readIORef ref ifnotb thenutf8_encode input output elseifos -ow <3thenreturn (OutputUnderflow ,input ,output )elsedowriteIORef ref FalsewriteWord8Buf oraw ow bom0 writeWord8Buf oraw (ow + 1)bom1 writeWord8Buf oraw (ow + 2)bom2 utf8_encode input output {bufR=ow + 3}bom0,bom1,bom2::Word8 bom0 =0xefbom1 =0xbbbom2 =0xbfutf8_decode::DecodeBuffer utf8_decode input @Buffer {bufRaw=iraw ,bufL=ir0 ,bufR=iw ,bufSize=_}output @Buffer {bufRaw=oraw ,bufL=_,bufR=ow0 ,bufSize=os }=letloop !ir !ow |ow >=os =done OutputUnderflow ir ow |ir >=iw =done InputUnderflow ir ow |otherwise =doc0 <-readWord8Buf iraw ir casec0 of_|c0 <=0x7f->doow' <-writeCharBuf oraw ow (unsafeChr (fromIntegral c0 ))loop (ir + 1)ow' |c0 >=0xc0&&c0 <=0xc1->invalid -- Overlong forms|c0 >=0xc2&&c0 <=0xdf->ifiw -ir <2thendone InputUnderflow ir ow elsedoc1 <-readWord8Buf iraw (ir + 1)if(c1 <0x80||c1 >=0xc0)theninvalid elsedoow' <-writeCharBuf oraw ow (chr2 c0 c1 )loop (ir + 2)ow' |c0 >=0xe0&&c0 <=0xef->caseiw -ir of1->done InputUnderflow ir ow 2->do-- check for an error even when we don't have-- the full sequence yet (#3341)c1 <-readWord8Buf iraw (ir + 1)ifnot(validate3 c0 c1 0x80)theninvalid elsedone InputUnderflow ir ow _->doc1 <-readWord8Buf iraw (ir + 1)c2 <-readWord8Buf iraw (ir + 2)ifnot(validate3 c0 c1 c2 )theninvalid elsedoow' <-writeCharBuf oraw ow (chr3 c0 c1 c2 )loop (ir + 3)ow' |c0 >=0xf0->caseiw -ir of1->done InputUnderflow ir ow 2->do-- check for an error even when we don't have-- the full sequence yet (#3341)c1 <-readWord8Buf iraw (ir + 1)ifnot(validate4 c0 c1 0x800x80)theninvalid elsedone InputUnderflow ir ow 3->doc1 <-readWord8Buf iraw (ir + 1)c2 <-readWord8Buf iraw (ir + 2)ifnot(validate4 c0 c1 c2 0x80)theninvalid elsedone InputUnderflow ir ow _->doc1 <-readWord8Buf iraw (ir + 1)c2 <-readWord8Buf iraw (ir + 2)c3 <-readWord8Buf iraw (ir + 3)ifnot(validate4 c0 c1 c2 c3 )theninvalid elsedoow' <-writeCharBuf oraw ow (chr4 c0 c1 c2 c3 )loop (ir + 4)ow' |otherwise ->invalid whereinvalid =done InvalidSequence ir ow -- lambda-lifted, to avoid thunks being built in the inner-loop:done why !ir !ow =return (why ,ifir ==iw theninput {bufL=0,bufR=0}elseinput {bufL=ir },output {bufR=ow })inloop ir0 ow0 utf8_encode::EncodeBuffer utf8_encode input @Buffer {bufRaw=iraw ,bufL=ir0 ,bufR=iw ,bufSize=_}output @Buffer {bufRaw=oraw ,bufL=_,bufR=ow0 ,bufSize=os }=letdone why !ir !ow =return (why ,ifir ==iw theninput {bufL=0,bufR=0}elseinput {bufL=ir },output {bufR=ow })loop !ir !ow |ow >=os =done OutputUnderflow ir ow |ir >=iw =done InputUnderflow ir ow |otherwise =do(c ,ir' )<-readCharBuf iraw ir caseord c ofx |x <=0x7F->dowriteWord8Buf oraw ow (fromIntegral x )loop ir' (ow + 1)|x <=0x07FF->ifos -ow <2thendone OutputUnderflow ir ow elsedolet(c1 ,c2 )=ord2 c writeWord8Buf oraw ow c1 writeWord8Buf oraw (ow + 1)c2 loop ir' (ow + 2)|x <=0xFFFF->ifisSurrogate c thendone InvalidSequence ir ow elsedoifos -ow <3thendone OutputUnderflow ir ow elsedolet(c1 ,c2 ,c3 )=ord3 c writeWord8Buf oraw ow c1 writeWord8Buf oraw (ow + 1)c2 writeWord8Buf oraw (ow + 2)c3 loop ir' (ow + 3)|otherwise ->doifos -ow <4thendone OutputUnderflow ir ow elsedolet(c1 ,c2 ,c3 ,c4 )=ord4 c writeWord8Buf oraw ow c1 writeWord8Buf oraw (ow + 1)c2 writeWord8Buf oraw (ow + 2)c3 writeWord8Buf oraw (ow + 3)c4 loop ir' (ow + 4)inloop ir0 ow0 -- ------------------------------------------------------------------------------- UTF-8 primitives, lifted from Data.Text.Fusion.Utf8ord2::Char->(Word8 ,Word8 )ord2 c =assert (n >=0x80&&n <=0x07ff)(x1 ,x2 )wheren =ord c x1 =fromIntegral $ (n `shiftR `6)+ 0xC0x2 =fromIntegral $ (n .&. 0x3F)+ 0x80ord3::Char->(Word8 ,Word8 ,Word8 )ord3 c =assert (n >=0x0800&&n <=0xffff)(x1 ,x2 ,x3 )wheren =ord c x1 =fromIntegral $ (n `shiftR `12)+ 0xE0x2 =fromIntegral $ ((n `shiftR `6).&. 0x3F)+ 0x80x3 =fromIntegral $ (n .&. 0x3F)+ 0x80ord4::Char->(Word8 ,Word8 ,Word8 ,Word8 )ord4 c =assert (n >=0x10000)(x1 ,x2 ,x3 ,x4 )wheren =ord c x1 =fromIntegral $ (n `shiftR `18)+ 0xF0x2 =fromIntegral $ ((n `shiftR `12).&. 0x3F)+ 0x80x3 =fromIntegral $ ((n `shiftR `6).&. 0x3F)+ 0x80x4 =fromIntegral $ (n .&. 0x3F)+ 0x80chr2::Word8 ->Word8 ->Charchr2 (W8# x1# )(W8# x2# )=C#(chr#(z1# +#z2# ))where!y1# =word2Int#x1# !y2# =word2Int#x2# !z1# =uncheckedIShiftL#(y1# -#0xC0#)6#!z2# =y2# -#0x80#{-# INLINE chr2 #-}chr3::Word8 ->Word8 ->Word8 ->Charchr3 (W8# x1# )(W8# x2# )(W8# x3# )=C#(chr#(z1# +#z2# +#z3# ))where!y1# =word2Int#x1# !y2# =word2Int#x2# !y3# =word2Int#x3# !z1# =uncheckedIShiftL#(y1# -#0xE0#)12#!z2# =uncheckedIShiftL#(y2# -#0x80#)6#!z3# =y3# -#0x80#{-# INLINE chr3 #-}chr4::Word8 ->Word8 ->Word8 ->Word8 ->Charchr4 (W8# x1# )(W8# x2# )(W8# x3# )(W8# x4# )=C#(chr#(z1# +#z2# +#z3# +#z4# ))where!y1# =word2Int#x1# !y2# =word2Int#x2# !y3# =word2Int#x3# !y4# =word2Int#x4# !z1# =uncheckedIShiftL#(y1# -#0xF0#)18#!z2# =uncheckedIShiftL#(y2# -#0x80#)12#!z3# =uncheckedIShiftL#(y3# -#0x80#)6#!z4# =y4# -#0x80#{-# INLINE chr4 #-}between::Word8 -- ^ byte to check->Word8 -- ^ lower bound->Word8 -- ^ upper bound->Boolbetween x y z =x >=y &&x <=z {-# INLINE between #-}validate3::Word8 ->Word8 ->Word8 ->Bool{-# INLINE validate3 #-}validate3 x1 x2 x3 =validate3_1 ||validate3_2 ||validate3_3 ||validate3_4 wherevalidate3_1 =(x1 ==0xE0)&&between x2 0xA00xBF&&between x3 0x800xBFvalidate3_2 =between x1 0xE10xEC&&between x2 0x800xBF&&between x3 0x800xBFvalidate3_3 =x1 ==0xED&&between x2 0x800x9F&&between x3 0x800xBFvalidate3_4 =between x1 0xEE0xEF&&between x2 0x800xBF&&between x3 0x800xBFvalidate4::Word8 ->Word8 ->Word8 ->Word8 ->Bool{-# INLINE validate4 #-}validate4 x1 x2 x3 x4 =validate4_1 ||validate4_2 ||validate4_3 wherevalidate4_1 =x1 ==0xF0&&between x2 0x900xBF&&between x3 0x800xBF&&between x4 0x800xBFvalidate4_2 =between x1 0xF10xF3&&between x2 0x800xBF&&between x3 0x800xBF&&between x4 0x800xBFvalidate4_3 =x1 ==0xF4&&between x2 0x800x8F&&between x3 0x800xBF&&between x4 0x800xBF

AltStyle によって変換されたページ (->オリジナル) /