Skip to content

Commit d199a64

Browse files
authored
Make get @Char fail on invalid encodings (#236)
1 parent d7a2ce2 commit d199a64

2 files changed

Lines changed: 44 additions & 25 deletions

File tree

src/Data/Binary/Class.hs

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
{-# LANGUAGE DefaultSignatures #-}
33
{-# LANGUAGE GADTs #-}
44
{-# LANGUAGE InstanceSigs #-}
5+
{-# LANGUAGE MultiWayIf #-}
56
{-# LANGUAGE ScopedTypeVariables #-}
67
{-# LANGUAGE PatternGuards #-}
78
{-# LANGUAGE PolyKinds #-}
@@ -57,6 +58,7 @@ module Data.Binary.Class (
5758
import Prelude hiding (Foldable(..))
5859
import Data.Foldable (Foldable(..))
5960

61+
import Data.Char (chr)
6062
import Data.Word
6163
import Data.Bits
6264
import Data.Int
@@ -499,35 +501,44 @@ instance Binary a => Binary (Complex a) where
499501

500502
------------------------------------------------------------------------
501503

502-
-- Char is serialised as UTF-8
504+
-- | Uses WTF-8 (like UTF-8, but surrogates are allowed).
503505
instance Binary Char where
504506
put = putCharUtf8
505507
putList str = put (length str) <> putStringUtf8 str
506508
get = do
507-
let getByte = liftM (fromIntegral :: Word8 -> Int) get
508-
shiftL6 = flip shiftL 6 :: Int -> Int
509-
w <- getByte
510-
r <- case () of
511-
_ | w < 0x80 -> return w
512-
| w < 0xe0 -> do
513-
x <- liftM (xor 0x80) getByte
514-
return (x .|. shiftL6 (xor 0xc0 w))
515-
| w < 0xf0 -> do
516-
x <- liftM (xor 0x80) getByte
517-
y <- liftM (xor 0x80) getByte
518-
return (y .|. shiftL6 (x .|. shiftL6
519-
(xor 0xe0 w)))
520-
| otherwise -> do
521-
x <- liftM (xor 0x80) getByte
522-
y <- liftM (xor 0x80) getByte
523-
z <- liftM (xor 0x80) getByte
524-
return (z .|. shiftL6 (y .|. shiftL6
525-
(x .|. shiftL6 (xor 0xf0 w))))
526-
getChr r
509+
w <- fmap fromIntegral getWord8
510+
if
511+
| w < 0x80 -> getChr w
512+
| w < 0xc2 -> invalid -- continuation byte or overlong encoding
513+
| w < 0xe0 -> do
514+
x <- getWord8 >>= continuationByte
515+
getChr ((x .&. 0x3f) .|. ((w .&. 0x1f) `shiftL` 6))
516+
| w < 0xf0 -> do
517+
x <- getWord8 >>= continuationByte
518+
when (w == 0xe0 && x < 0xa0) invalid -- overlong encoding
519+
y <- getWord8 >>= continuationByte
520+
getChr ((y .&. 0x3f) .|. ((x .&. 0x3f) `shiftL` 6) .|. ((w .&. 0x0f) `shiftL` 12))
521+
| w < 0xf5 -> do
522+
x <- getWord8 >>= continuationByte
523+
when (w == 0xf0 && x < 0x90) invalid -- overlong encoding
524+
when (w == 0xf4 && x > 0x8f) invalid -- outside of Unicode range
525+
y <- getWord8 >>= continuationByte
526+
z <- getWord8 >>= continuationByte
527+
getChr ((z .&. 0x3f) .|. ((y .&. 0x3f) `shiftL` 6) .|. ((x .&. 0x3f) `shiftL` 12) .|. ((w .&. 0x07) `shiftL` 18))
528+
| otherwise -> invalid
527529
where
528-
getChr w
529-
| w <= 0x10ffff = return $! toEnum $ fromEnum w
530-
| otherwise = fail "Not a valid Unicode code point!"
530+
invalid :: Get a
531+
invalid = fail "invalid Char encoding"
532+
533+
continuationByte :: Word8 -> Get Int
534+
continuationByte x =
535+
let x' = x .&. 0xc0
536+
in if x' == 0x80
537+
then pure (fromIntegral x)
538+
else invalid -- no continuation byte
539+
540+
getChr :: Int -> Get Char
541+
getChr i = pure $! chr i
531542

532543
------------------------------------------------------------------------
533544
-- Instances for the first few tuples

tests/QC.hs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,14 @@ tests =
742742
, testTypeable
743743

744744
, testGroup "Generic"
745-
[ testProperty "Generic256" $ prop_Generic256
745+
[ testProperty "Generic256" prop_Generic256
746+
]
747+
748+
, testGroup "Char"
749+
[ testProperty "encodings are unique" $
750+
withMaxSize 4 $ \bs ->
751+
case runGetOrFail (get :: Get Char) bs of
752+
Left _ -> discard
753+
Right (_, n, x) -> runPut (put x) === L.take n bs
746754
]
747755
]

0 commit comments

Comments
 (0)