Fix #7522 by checking for empty byte buffers a little more

Quite a few lines have changed but that is mostly comments.
author: Max Bolingbroke <batterseapower@hotmail.com> 2013-04-10 21:13:28 +0100
committer: Max Bolingbroke <batterseapower@hotmail.com> 2013-04-10 21:13:28 +0100
commit: 69a8c2d23a75f9f1bda59f14c61fbd057e200126 (patch)
tree: 6280f1c1eb9d24cc9de24c960011028d0e1732f4 /libraries
parent: 90d1d92ce6ac7d34111f4b6c583a1dbc77fb90a7 (diff)
download: haskell-69a8c2d23a75f9f1bda59f14c61fbd057e200126.tar.gz
2 files changed, 34 insertions, 11 deletions
diff --git a/libraries/base/GHC/IO/Encoding/Types.hs b/libraries/base/GHC/IO/Encoding/Types.hs
index 6147d01fa1..e31c067bad 100644
--- a/libraries/base/GHC/IO/Encoding/Types.hs
+++ b/libraries/base/GHC/IO/Encoding/Types.hs
@@ -41,17 +41,14 @@ data BufferCodec from to state = BufferCodec {
    -- if there is either not enough room in @to@, or @from@ does not
    -- contain a complete multibyte sequence.
    --
+   -- If multiple CodingProgress returns are possible, OutputUnderflow must be
+   -- preferred to InvalidSequence. This allows GHC's IO library to assume that
+   -- if we observe InvalidSequence there is at least a single element available
+   -- in the output buffer.
+   --
    -- The fact that as many elements as possible are translated is used by the IO
    -- library in order to report translation errors at the point they
    -- actually occur, rather than when the buffer is translated.
-   --
-   -- To allow us to use iconv as a BufferCode efficiently, character buffers are
-   -- defined to contain lone surrogates instead of those private use characters that
-   -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer
-   -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter
-   -- respectively.
-   --
-   -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure.
   
   recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
    -- ^ The @recover@ function is used to continue decoding
diff --git a/libraries/base/GHC/IO/Handle/Internals.hs b/libraries/base/GHC/IO/Handle/Internals.hs
index 79228d3500..855e31cb44 100644
--- a/libraries/base/GHC/IO/Handle/Internals.hs
+++ b/libraries/base/GHC/IO/Handle/Internals.hs
@@ -378,7 +378,15 @@ streamEncode codec from to = go (from, to)
       -- underflow particularly, and we want to delay errors about invalid
       -- sequences as far as possible.
       case why of
-        Encoding.InvalidSequence | bufL from == bufL from' -> recover codec from' to' >>= go
+        Encoding.InvalidSequence | bufL from == bufL from' -> do
+          -- NB: it is OK to call recover here. Because we saw InvalidSequence, by the invariants
+          -- on "encode" it must be the case that there is at least one elements available in the output
+          -- buffer. Furthermore, clearly there is at least one element in the input buffer since we found
+          -- something invalid there!
+          --debugIO $ "Before streamEncode recovery: from=" ++ summaryBuffer from' ++ ", to=" ++ summaryBuffer to'
+          (from', to') <- recover codec from' to'
+          --debugIO $ "After streamEncode recovery: from=" ++ summaryBuffer from' ++ ", to=" ++ summaryBuffer to'
+          go (from', to')
         _ -> return (from', to')
 
 -- -----------------------------------------------------------------------------
@@ -802,6 +810,16 @@ debugIO s
 -- Read characters into the provided buffer.  Return when any
 -- characters are available; raise an exception if the end of 
 -- file is reached.
+--
+-- In uses of readTextDevice within base, the input buffer is either:
+--   * empty
+--   * or contains a single \r (when doing newline translation)
+--
+-- The input character buffer must have a capacity at least 1 greater
+-- than the number of elements it currently contains.
+--
+-- Users of this function expect that the buffer returned contains
+-- at least 1 more character than the input buffer.
 readTextDevice :: Handle__ -> CharBuffer -> IO CharBuffer
 readTextDevice h_@Handle__{..} cbuf = do
   --
@@ -832,9 +850,13 @@ readTextDevice h_@Handle__{..} cbuf = do
   debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++ 
         " bbuf=" ++ summaryBuffer bbuf2)
 
+  -- We can't return from readTextDevice without reading at least a single extra character,
+  -- so check that we have managed to achieve that
   writeIORef haByteBuffer bbuf2
-  if bufR cbuf' == bufR cbuf -- no new characters
-     then readTextDevice' h_ bbuf2 cbuf -- we need more bytes to make a Char
+  if bufR cbuf' == bufR cbuf
+     -- we need more bytes to make a Char. NB: bbuf2 may be empty (even though bbuf1 wasn't) when we
+     -- are using an encoding that can skip bytes without outputting characters, such as UTF8//IGNORE
+     then readTextDevice' h_ bbuf2 cbuf
      else return cbuf'
 
 -- we have an incomplete byte sequence at the end of the buffer: try to
@@ -853,7 +875,11 @@ readTextDevice' h_@Handle__{..} bbuf0 cbuf0 = do
   (r,bbuf2) <- Buffered.fillReadBuffer haDevice bbuf1
   if r == 0
    then do
+     -- bbuf2 can be empty here when we encounter an invalid byte sequence at the end of the input
+     -- with a //IGNORE codec which consumes bytes without outputting characters
+     if isEmptyBuffer bbuf2 then ioe_EOF else do
      (bbuf3, cbuf1) <- recover decoder bbuf2 cbuf0
+     debugIO ("readTextDevice' after recovery: bbuf=" ++ summaryBuffer bbuf3 ++ ", cbuf=" ++ summaryBuffer cbuf1)
      writeIORef haByteBuffer bbuf3
      -- We should recursively invoke readTextDevice after recovery,
      -- if recovery did not add at least one new character to the buffer:
author	Max Bolingbroke <batterseapower@hotmail.com>	2013-04-10 21:13:28 +0100
committer	Max Bolingbroke <batterseapower@hotmail.com>	2013-04-10 21:13:28 +0100
commit	69a8c2d23a75f9f1bda59f14c61fbd057e200126 (patch)
tree	6280f1c1eb9d24cc9de24c960011028d0e1732f4 /libraries
parent	90d1d92ce6ac7d34111f4b6c583a1dbc77fb90a7 (diff)
download	haskell-69a8c2d23a75f9f1bda59f14c61fbd057e200126.tar.gz