{-# LANGUAGE Trustworthy #-}{-# LANGUAGE NoImplicitPrelude #-}{-# LANGUAGE UnboxedTuples #-}{-# LANGUAGE MagicHash #-}{-# LANGUAGE BangPatterns #-}------------------------------------------------------------------------------- |-- Module : GHC.IO.Encoding.Failure-- Copyright : (c) The University of Glasgow, 2008-2011-- License : see libraries/base/LICENSE---- Maintainer : libraries@haskell.org-- Stability : internal-- Portability : non-portable---- Types for specifying how text encoding/decoding fails-------------------------------------------------------------------------------moduleGHC.IO.Encoding.Failure(CodingFailureMode (..),codingFailureModeSuffix ,isSurrogate ,recoverDecode ,recoverEncode ,recoverDecode# ,recoverEncode# ,)whereimportGHC.IO importGHC.IO.Buffer importGHC.IO.Exception importGHC.Base importGHC.Char importGHC.Word importGHC.Show importGHC.Num importGHC.Real (fromIntegral )--import System.Posix.Internals-- | The 'CodingFailureMode' is used to construct 'System.IO.TextEncoding's,-- and specifies how they handle illegal sequences.dataCodingFailureMode =ErrorOnCodingFailure -- ^ Throw an error when an illegal sequence is encountered|IgnoreCodingFailure -- ^ Attempt to ignore and recover if an illegal sequence is-- encountered|TransliterateCodingFailure -- ^ Replace with the closest visual match upon an illegal-- sequence|RoundtripFailure -- ^ Use the private-use escape mechanism to attempt to allow-- illegal sequences to be roundtripped.deriving(Int -> CodingFailureMode -> ShowS
[CodingFailureMode] -> ShowS
CodingFailureMode -> String
(Int -> CodingFailureMode -> ShowS)
-> (CodingFailureMode -> String)
-> ([CodingFailureMode] -> ShowS)
-> Show CodingFailureMode
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CodingFailureMode -> ShowS
showsPrec :: Int -> CodingFailureMode -> ShowS
$cshow :: CodingFailureMode -> String
show :: CodingFailureMode -> String
$cshowList :: [CodingFailureMode] -> ShowS
showList :: [CodingFailureMode] -> ShowS
Show -- ^ @since 4.4.0.0)-- This will only work properly for those encodings which are-- strict supersets of ASCII in the sense that valid ASCII data-- is also valid in that encoding. This is not true for-- e.g. UTF-16, because ASCII characters must be padded to two-- bytes to retain their meaning.-- Note [Roundtripping]-- ~~~~~~~~~~~~~~~~~~~~-- Roundtripping is based on the ideas of PEP383.---- We used to use the range of private-use characters from 0xEF80 to-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registry-- to encode these characters.---- However, people didn't like this because it means we don't get-- guaranteed roundtripping for byte sequences that look like a UTF-8-- encoded codepoint 0xEFxx.---- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape-- undecodable bytes, even though that may confuse Unicode processing-- software written in Haskell. This guarantees roundtripping because-- unicode input that includes lone surrogate codepoints is invalid by-- definition.------ When we used private-use characters there was a technical problem when it-- came to encoding back to bytes using iconv. The iconv code will not fail when-- it tries to encode a private-use character (as it would if trying to encode-- a surrogate), which means that we wouldn't get a chance to replace it-- with the byte we originally escaped.---- To work around this, when filling the buffer to be encoded (in-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the-- private-use characters with lone surrogates again! Likewise, when-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we had-- to do the inverse process.---- The user of String would never see these lone surrogates, but it-- ensured that iconv will throw an error when encountering them. We-- used lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.codingFailureModeSuffix ::CodingFailureMode ->String codingFailureModeSuffix :: CodingFailureMode -> String
codingFailureModeSuffix CodingFailureMode
ErrorOnCodingFailure =String
""codingFailureModeSuffix CodingFailureMode
IgnoreCodingFailure =String
"//IGNORE"codingFailureModeSuffix CodingFailureMode
TransliterateCodingFailure =String
"//TRANSLIT"codingFailureModeSuffix CodingFailureMode
RoundtripFailure =String
"//ROUNDTRIP"-- | In transliterate mode, we use this character when decoding-- unknown bytes.---- This is the defined Unicode replacement character:-- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>unrepresentableChar ::Char unrepresentableChar :: Char
unrepresentableChar =Char
'\xFFFD'-- It is extraordinarily important that this series of-- predicates/transformers gets inlined, because they tend to be used-- in inner loops related to text encoding. In particular,-- surrogatifyRoundtripCharacter must be inlined (see #5536)-- | Some characters are actually "surrogate" codepoints defined for-- use in UTF-16. We need to signal an invalid character if we detect-- them when encoding a sequence of 'Char's into 'Word8's because they-- won't give valid Unicode.---- We may also need to signal an invalid character if we detect them-- when encoding a sequence of 'Char's into 'Word8's because the-- 'RoundtripFailure' mode creates these to round-trip bytes through-- our internal UTF-16 encoding.{-# INLINEisSurrogate #-}isSurrogate ::Char ->Bool isSurrogate :: Char -> Bool
isSurrogate Char
c =(Int
0xD800Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
x Bool -> Bool -> Bool
&& Int
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xDBFF)Bool -> Bool -> Bool
|| (Int
0xDC00Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
x Bool -> Bool -> Bool
&& Int
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xDFFF)wherex :: Int
x =Char -> Int
ord Char
c -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem){-# INLINEescapeToRoundtripCharacterSurrogate #-}escapeToRoundtripCharacterSurrogate ::Word8 ->Char escapeToRoundtripCharacterSurrogate :: Word8 -> Char
escapeToRoundtripCharacterSurrogate Word8
b |Word8
b Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
128=Int -> Char
chr (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
b )-- Disallow 'smuggling' of ASCII bytes. For roundtripping to-- work, this assumes encoding is ASCII-superset.|Bool
otherwise =Int -> Char
chr (Int
0xDC00Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
b )-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8){-# INLINEunescapeRoundtripCharacterSurrogate #-}unescapeRoundtripCharacterSurrogate ::Char ->Maybe Word8 unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
unescapeRoundtripCharacterSurrogate Char
c |Int
0xDC80Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
x Bool -> Bool -> Bool
&& Int
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xDD00=Word8 -> Maybe Word8
forall a. a -> Maybe a
Just (Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
x )-- Discard high byte|Bool
otherwise =Maybe Word8
forall a. Maybe a
Nothing wherex :: Int
x =Char -> Int
ord Char
c recoverDecode# ::CodingFailureMode ->Buffer Word8 ->Buffer Char ->State# RealWorld ->(#State# RealWorld ,Buffer Word8 ,Buffer Char #)recoverDecode# :: CodingFailureMode
-> Buffer Word8
-> Buffer Char
-> State# RealWorld
-> (# State# RealWorld, Buffer Word8, Buffer Char #)
recoverDecode# CodingFailureMode
cfm Buffer Word8
input Buffer Char
output State# RealWorld
st =let!(#State# RealWorld
st' ,(Buffer Word8
bIn ,Buffer Char
bOut )#)=IO (Buffer Word8, Buffer Char)
-> State# RealWorld
-> (# State# RealWorld, (Buffer Word8, Buffer Char) #)
forall a. IO a -> State# RealWorld -> (# State# RealWorld, a #)
unIO (CodingFailureMode
-> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
recoverDecode CodingFailureMode
cfm Buffer Word8
input Buffer Char
output )State# RealWorld
st in(#State# RealWorld
st' ,Buffer Word8
bIn ,Buffer Char
bOut #)recoverDecode ::CodingFailureMode ->Buffer Word8 ->Buffer Char ->IO (Buffer Word8 ,Buffer Char )recoverDecode :: CodingFailureMode
-> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
recoverDecode CodingFailureMode
cfm input :: Buffer Word8
input @Buffer {bufRaw :: forall e. Buffer e -> RawBuffer e
bufRaw =RawBuffer Word8
iraw ,bufL :: forall e. Buffer e -> Int
bufL =Int
ir ,bufR :: forall e. Buffer e -> Int
bufR =Int
_}output :: Buffer Char
output @Buffer {bufRaw :: forall e. Buffer e -> RawBuffer e
bufRaw =RawBuffer Char
oraw ,bufL :: forall e. Buffer e -> Int
bufL =Int
_,bufR :: forall e. Buffer e -> Int
bufR =Int
ow }=--puts $ "recoverDecode " ++ show ircaseCodingFailureMode
cfm ofCodingFailureMode
ErrorOnCodingFailure ->doWord8
b <-RawBuffer Word8 -> Int -> IO Word8
readWord8Buf RawBuffer Word8
iraw Int
ir Word8 -> IO (Buffer Word8, Buffer Char)
forall a. Word8 -> IO a
ioe_decodingError Word8
b CodingFailureMode
IgnoreCodingFailure ->(Buffer Word8, Buffer Char) -> IO (Buffer Word8, Buffer Char)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Word8
input {bufL =ir + 1},Buffer Char
output )CodingFailureMode
TransliterateCodingFailure ->doInt
ow' <-RawBuffer Char -> Int -> Char -> IO Int
writeCharBuf RawBuffer Char
oraw Int
ow Char
unrepresentableChar (Buffer Word8, Buffer Char) -> IO (Buffer Word8, Buffer Char)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Word8
input {bufL =ir + 1},Buffer Char
output {bufR =ow' })CodingFailureMode
RoundtripFailure ->doWord8
b <-RawBuffer Word8 -> Int -> IO Word8
readWord8Buf RawBuffer Word8
iraw Int
ir Int
ow' <-RawBuffer Char -> Int -> Char -> IO Int
writeCharBuf RawBuffer Char
oraw Int
ow (Word8 -> Char
escapeToRoundtripCharacterSurrogate Word8
b )(Buffer Word8, Buffer Char) -> IO (Buffer Word8, Buffer Char)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Word8
input {bufL =ir + 1},Buffer Char
output {bufR =ow' })recoverEncode# ::CodingFailureMode ->Buffer Char ->Buffer Word8 ->State# RealWorld ->(#State# RealWorld ,Buffer Char ,Buffer Word8 #)recoverEncode# :: CodingFailureMode
-> Buffer Char
-> Buffer Word8
-> State# RealWorld
-> (# State# RealWorld, Buffer Char, Buffer Word8 #)
recoverEncode# CodingFailureMode
cfm Buffer Char
input Buffer Word8
output State# RealWorld
st =let!(#State# RealWorld
st' ,(Buffer Char
bIn ,Buffer Word8
bOut )#)=IO (Buffer Char, Buffer Word8)
-> State# RealWorld
-> (# State# RealWorld, (Buffer Char, Buffer Word8) #)
forall a. IO a -> State# RealWorld -> (# State# RealWorld, a #)
unIO (CodingFailureMode
-> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
recoverEncode CodingFailureMode
cfm Buffer Char
input Buffer Word8
output )State# RealWorld
st in(#State# RealWorld
st' ,Buffer Char
bIn ,Buffer Word8
bOut #)recoverEncode ::CodingFailureMode ->Buffer Char ->Buffer Word8 ->IO (Buffer Char ,Buffer Word8 )recoverEncode :: CodingFailureMode
-> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
recoverEncode CodingFailureMode
cfm input :: Buffer Char
input @Buffer {bufRaw :: forall e. Buffer e -> RawBuffer e
bufRaw =RawBuffer Char
iraw ,bufL :: forall e. Buffer e -> Int
bufL =Int
ir ,bufR :: forall e. Buffer e -> Int
bufR =Int
_}output :: Buffer Word8
output @Buffer {bufRaw :: forall e. Buffer e -> RawBuffer e
bufRaw =RawBuffer Word8
oraw ,bufL :: forall e. Buffer e -> Int
bufL =Int
_,bufR :: forall e. Buffer e -> Int
bufR =Int
ow }=do(Char
c ,Int
ir' )<-RawBuffer Char -> Int -> IO (Char, Int)
readCharBuf RawBuffer Char
iraw Int
ir --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'caseCodingFailureMode
cfm ofCodingFailureMode
IgnoreCodingFailure ->(Buffer Char, Buffer Word8) -> IO (Buffer Char, Buffer Word8)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Char
input {bufL =ir' },Buffer Word8
output )CodingFailureMode
TransliterateCodingFailure ->doifChar
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'?'then(Buffer Char, Buffer Word8) -> IO (Buffer Char, Buffer Word8)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Char
input {bufL =ir' },Buffer Word8
output )elsedo-- XXX: evil hack! To implement transliteration, we just-- poke an ASCII ? into the input buffer and tell the caller-- to try and decode again. This is *probably* safe given-- current uses of TextEncoding.---- The "if" test above ensures we skip if the encoding fails-- to deal with the ?, though this should never happen in-- practice as all encodings are in fact capable of-- reperesenting all ASCII characters.Int
_ir' <-RawBuffer Char -> Int -> Char -> IO Int
writeCharBuf RawBuffer Char
iraw Int
ir Char
'?'(Buffer Char, Buffer Word8) -> IO (Buffer Char, Buffer Word8)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Char
input ,Buffer Word8
output )-- This implementation does not work because e.g. UTF-16-- requires 2 bytes to encode a simple ASCII value--writeWord8Buf oraw ow unrepresentableByte--return (input { bufL=ir' }, output { bufR=ow+1 })CodingFailureMode
RoundtripFailure |Just Word8
x <-Char -> Maybe Word8
unescapeRoundtripCharacterSurrogate Char
c ->doRawBuffer Word8 -> Int -> Word8 -> IO ()
writeWord8Buf RawBuffer Word8
oraw Int
ow Word8
x (Buffer Char, Buffer Word8) -> IO (Buffer Char, Buffer Word8)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Char
input {bufL =ir' },Buffer Word8
output {bufR =ow + 1})CodingFailureMode
_->Char -> IO (Buffer Char, Buffer Word8)
forall a. Char -> IO a
ioe_encodingError Char
c ioe_decodingError ::Word8 ->IO a ioe_decodingError :: forall a. Word8 -> IO a
ioe_decodingError Word8
b =IOException -> IO a
forall a. IOException -> IO a
ioException (Maybe Handle
-> IOErrorType
-> String
-> String
-> Maybe CInt
-> Maybe String
-> IOException
IOError Maybe Handle
forall a. Maybe a
Nothing IOErrorType
InvalidArgument String
"recoverDecode"(String
"cannot decode byte sequence starting from "String -> ShowS
forall a. [a] -> [a] -> [a]
++ Word8 -> String
forall a. Show a => a -> String
show Word8
b )Maybe CInt
forall a. Maybe a
Nothing Maybe String
forall a. Maybe a
Nothing )ioe_encodingError ::Char ->IO a ioe_encodingError :: forall a. Char -> IO a
ioe_encodingError Char
ch =IOException -> IO a
forall a. IOException -> IO a
ioException (Maybe Handle
-> IOErrorType
-> String
-> String
-> Maybe CInt
-> Maybe String
-> IOException
IOError Maybe Handle
forall a. Maybe a
Nothing IOErrorType
InvalidArgument String
"recoverEncode"-- This assumes that @show ch@ escapes non-ASCII symbols-- and thus does not cause recursive encoding failures.(String
"cannot encode character "String -> ShowS
forall a. [a] -> [a] -> [a]
++ Char -> String
forall a. Show a => a -> String
show Char
ch )Maybe CInt
forall a. Maybe a
Nothing Maybe String
forall a. Maybe a
Nothing )

AltStyle によって変換されたページ (->オリジナル) /