diff --git a/Data/ByteString/Builder/Internal.hs b/Data/ByteString/Builder/Internal.hs index fbcb3b299..69fb2ae0e 100644 --- a/Data/ByteString/Builder/Internal.hs +++ b/Data/ByteString/Builder/Internal.hs @@ -138,6 +138,7 @@ import Data.Semigroup (Semigroup(..)) import Data.List.NonEmpty (NonEmpty(..)) import qualified Data.ByteString as S +import qualified Data.ByteString.Unsafe as S import qualified Data.ByteString.Internal.Type as S import qualified Data.ByteString.Lazy.Internal as L import qualified Data.ByteString.Short.Internal as Sh @@ -819,24 +820,24 @@ ensureFree minFree = | ope `minusPtr` op < minFree = return $ bufferFull minFree op k | otherwise = k br --- | Copy the bytes from a 'BufferRange' into the output stream. -wrappedBytesCopyStep :: BufferRange -- ^ Input 'BufferRange'. +-- | Copy the bytes from a 'S.StrictByteString' into the output stream. +wrappedBytesCopyStep :: S.StrictByteString -- ^ Input 'S.StrictByteString'. -> BuildStep a -> BuildStep a -wrappedBytesCopyStep (BufferRange ip0 ipe) k = - go ip0 +-- See Note [byteStringCopyStep and wrappedBytesCopyStep] +wrappedBytesCopyStep bs0 k = + go bs0 where - go !ip (BufferRange op ope) + go !bs@(S.BS ifp inpRemaining) (BufferRange op ope) | inpRemaining <= outRemaining = do - copyBytes op ip inpRemaining + S.unsafeWithForeignPtr ifp $ \ip -> copyBytes op ip inpRemaining let !br' = BufferRange (op `plusPtr` inpRemaining) ope k br' | otherwise = do - copyBytes op ip outRemaining - let !ip' = ip `plusPtr` outRemaining - return $ bufferFull 1 ope (go ip') + S.unsafeWithForeignPtr ifp $ \ip -> copyBytes op ip outRemaining + let !bs' = S.unsafeDrop outRemaining bs + return $ bufferFull 1 ope (go bs') where outRemaining = ope `minusPtr` op - inpRemaining = ipe `minusPtr` ip -- Strict ByteStrings @@ -857,7 +858,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder byteStringThreshold maxCopySize = \bs -> builder $ step bs where - step bs@(S.BS _ len) !k br@(BufferRange !op _) + step bs@(S.BS _ len) k br@(BufferRange !op _) | len <= maxCopySize = byteStringCopyStep bs k br | otherwise = return $ insertChunk op bs k @@ -871,21 +872,69 @@ byteStringThreshold maxCopySize = byteStringCopy :: S.StrictByteString -> Builder byteStringCopy = \bs -> builder $ byteStringCopyStep bs +{- +Note [byteStringCopyStep and wrappedBytesCopyStep] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A Builder that copies the contents of an arbitrary ByteString needs a +recursive loop, since the bytes to be copied might not fit into the +first few chunk buffers provided by the driver. That loop is +implemented in 'wrappedBytesCopyStep'. But we also have a +non-recursive wrapper, 'byteStringCopyStep', which performs exactly +the first iteration of that loop, falling back to 'wrappedBytesCopyStep' +if a chunk boundary is reached before the entire ByteString is copied. + +This is very strange! Why do we do this? Perhaps mostly for +historical reasons. But sadly, changing this to use a single +recursive loop regresses the benchmark 'foldMap byteStringCopy' by +about 30% as of 2024, in one of two ways: + + 1. If the continuation 'k' is taken as an argument of the + inner copying loop, it remains an unknown function call. + So for each bytestring copied, that continuation must be + entered later via a gen-apply function, which incurs dozens + of cycles of extra overhead. + 2. If the continuation 'k' is lifted out of the inner copying + loop, it becomes a free variable. And after a bit of + inlining, there will be no unknown function call. But, if + the continuation function has any free variables, these + become free variables of the inner copying loop, which + prevent the loop from floating out. (In the actual + benchmark, the tail of the list of bytestrings to copy is + such a free variable of the continuation.) As a result, + the inner copying loop becomes a function closure object + rather than a top-level function. And that means a new + inner-copying-loop function-closure-object must be + allocated on the heap for every bytestring copied, which + is expensive. + + In theory, GHC's late-lambda-lifting pass can clean this up by + abstracting over the problematic free variables. But for some + unknown reason (perhaps a bug in ghc-9.10.1) this optimization + does not fire on the relevant benchmark code, even with a + sufficiently high value of -fstg-lift-lams-rec-args. + + + +Alternatively, it is possible to avoid recursion altogether by +requesting that the next chunk be large enough to accommodate the +entire remainder of the input when a chunk boundary is reached. +But: + * For very large ByteStrings, this may incur unwanted latency. + * Large next-chunk-size requests have caused breakage downstream + in the past. See also https://github.com/yesodweb/wai/issues/894 +-} + {-# INLINE byteStringCopyStep #-} byteStringCopyStep :: S.StrictByteString -> BuildStep a -> BuildStep a -byteStringCopyStep (S.BS ifp isize) !k0 br0@(BufferRange op ope) - -- Ensure that the common case is not recursive and therefore yields - -- better code. - | op' <= ope = do copyBytes op ip isize - touchForeignPtr ifp - k0 (BufferRange op' ope) - | otherwise = wrappedBytesCopyStep (BufferRange ip ipe) k br0 +-- See Note [byteStringCopyStep and wrappedBytesCopyStep] +byteStringCopyStep bs@(S.BS ifp isize) k br@(BufferRange op ope) + | isize <= osize = do + S.unsafeWithForeignPtr ifp $ \ip -> copyBytes op ip isize + k (BufferRange op' ope) + | otherwise = wrappedBytesCopyStep bs k br where + osize = ope `minusPtr` op op' = op `plusPtr` isize - ip = unsafeForeignPtrToPtr ifp - ipe = ip `plusPtr` isize - k br = do touchForeignPtr ifp -- input consumed: OK to release here - k0 br -- | Construct a 'Builder' that always inserts the 'S.StrictByteString' -- directly as a chunk. diff --git a/tests/Properties/ByteString.hs b/tests/Properties/ByteString.hs index 37c1f1def..b3f8dea5d 100644 --- a/tests/Properties/ByteString.hs +++ b/tests/Properties/ByteString.hs @@ -367,7 +367,7 @@ tests = , testProperty "toChunks . fromChunks" $ \xs -> B.toChunks (B.fromChunks xs) === filter (/= mempty) xs , testProperty "append lazy" $ - \(toElem -> c) -> B.head (B.singleton c <> undefined) === c + \(toElem -> c) -> B.head (B.singleton c <> tooStrictErr) === c , testProperty "compareLength 1" $ \x -> B.compareLength x (B.length x) === EQ , testProperty "compareLength 2" $ @@ -379,13 +379,13 @@ tests = , testProperty "compareLength 5" $ \x (intToIndexTy -> n) -> B.compareLength x n === compare (B.length x) n , testProperty "dropEnd lazy" $ - \(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> undefined)) === B.singleton c + \(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> tooStrictErr)) === B.singleton c , testProperty "dropWhileEnd lazy" $ - \(toElem -> c) -> B.take 1 (B.dropWhileEnd (const False) (B.singleton c <> undefined)) === B.singleton c + \(toElem -> c) -> B.take 1 (B.dropWhileEnd (const False) (B.singleton c <> tooStrictErr)) === B.singleton c , testProperty "breakEnd lazy" $ - \(toElem -> c) -> B.take 1 (fst $ B.breakEnd (const True) (B.singleton c <> undefined)) === B.singleton c + \(toElem -> c) -> B.take 1 (fst $ B.breakEnd (const True) (B.singleton c <> tooStrictErr)) === B.singleton c , testProperty "spanEnd lazy" $ - \(toElem -> c) -> B.take 1 (fst $ B.spanEnd (const False) (B.singleton c <> undefined)) === B.singleton c + \(toElem -> c) -> B.take 1 (fst $ B.spanEnd (const False) (B.singleton c <> tooStrictErr)) === B.singleton c #endif , testProperty "length" $ @@ -604,12 +604,21 @@ tests = # ifdef BYTESTRING_LAZY -- Don't use (===) in these laziness tests: -- We don't want printing the test case to fail! - , testProperty "zip is lazy" $ lazyZipTest $ - \x y -> B.zip x y == zip (B.unpack x) (B.unpack y) - , testProperty "zipWith is lazy" $ \f -> lazyZipTest $ - \x y -> (B.zipWith f x y :: [Int]) == zipWith f (B.unpack x) (B.unpack y) - , testProperty "packZipWith is lazy" $ \f -> lazyZipTest $ - \x y -> B.unpack (B.packZipWith ((toElem .) . f) x y) == zipWith ((toElem .) . f) (B.unpack x) (B.unpack y) + , testProperty "zip is lazy in the longer input" $ zipLazyInLongerInputTest $ + \x y -> B.zip x y == zip (B.unpack x) (B.unpack y) + , testProperty "zipWith is lazy in the longer input" $ + \f -> zipLazyInLongerInputTest $ + \x y -> (B.zipWith f x y :: [Int]) == zipWith f (B.unpack x) (B.unpack y) + , testProperty "packZipWith is lazy in the longer input" $ + \f -> zipLazyInLongerInputTest $ + \x y -> B.unpack (B.packZipWith ((toElem .) . f) x y) == zipWith ((toElem .) . f) (B.unpack x) (B.unpack y) + , testProperty "zip is maximally lazy" $ \x y -> + zip (B.unpack x) (B.unpack y) `List.isPrefixOf` + B.zip (x <> tooStrictErr) (y <> tooStrictErr) + , testProperty "zipWith is maximally lazy" $ \f x y -> + zipWith f (B.unpack x) (B.unpack y) `List.isPrefixOf` + B.zipWith @Int f (x <> tooStrictErr) (y <> tooStrictErr) + -- (It's not clear if packZipWith is required to be maximally lazy.) # endif , testProperty "unzip" $ \(fmap (toElem *** toElem) -> xs) -> (B.unpack *** B.unpack) (B.unzip xs) === unzip xs @@ -807,15 +816,15 @@ readIntegerUnsigned xs = case readMaybe ys of #endif #ifdef BYTESTRING_LAZY -lazyZipTest +zipLazyInLongerInputTest :: Testable prop => (BYTESTRING_TYPE -> BYTESTRING_TYPE -> prop) -> BYTESTRING_TYPE -> BYTESTRING_TYPE -> Property -lazyZipTest fun = \x0 y0 -> let +zipLazyInLongerInputTest fun = \x0 y0 -> let msg = "Input chunks are: " ++ show (B.toChunks x0, B.toChunks y0) (x, y) | B.length x0 <= B.length y0 - = (x0, y0 <> error "too strict") + = (x0, y0 <> tooStrictErr) | otherwise - = (x0 <> error "too strict", y0) + = (x0 <> tooStrictErr, y0) in counterexample msg (fun x y) #endif diff --git a/tests/QuickCheckUtils.hs b/tests/QuickCheckUtils.hs index 7328aed5d..13e3ae0e2 100644 --- a/tests/QuickCheckUtils.hs +++ b/tests/QuickCheckUtils.hs @@ -7,6 +7,7 @@ module QuickCheckUtils , CByteString(..) , Sqrt(..) , int64OK + , tooStrictErr ) where import Test.Tasty.QuickCheck @@ -19,6 +20,7 @@ import Data.Int import System.IO import Foreign.C (CChar) import GHC.TypeLits (TypeError, ErrorMessage(..)) +import GHC.Stack (withFrozenCallStack, HasCallStack) import qualified Data.ByteString.Short as SB import qualified Data.ByteString as P @@ -134,3 +136,7 @@ instance {-# OVERLAPPING #-} -- defined in "QuickCheckUtils". int64OK :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property int64OK f = propertyForAllShrinkShow arbitrary shrink (\v -> [show v]) f + +tooStrictErr :: forall a. HasCallStack => a +tooStrictErr = withFrozenCallStack $ + error "A lazy sub-expression was unexpectedly evaluated" diff --git a/tests/builder/Data/ByteString/Builder/Tests.hs b/tests/builder/Data/ByteString/Builder/Tests.hs index a40c3fd2f..224f27531 100644 --- a/tests/builder/Data/ByteString/Builder/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Tests.hs @@ -18,7 +18,7 @@ import Control.Monad.Trans.State (StateT, evalStateT, evalState, put, import Control.Monad.Trans.Class (lift) import Control.Monad.Trans.Writer (WriterT, execWriterT, tell) -import Foreign (minusPtr) +import Foreign (minusPtr, castPtr, ForeignPtr, withForeignPtr, Int64) import Data.Char (chr) import Data.Bits ((.|.), shiftL) @@ -40,7 +40,6 @@ import Data.ByteString.Builder.Prim.TestUtils import Control.Exception (evaluate) import System.IO (openTempFile, hPutStr, hClose, hSetBinaryMode, hSetEncoding, utf8, hSetNewlineMode, noNewlineTranslation) -import Foreign (ForeignPtr, withForeignPtr, castPtr) import Foreign.C.String (withCString) import Numeric (showFFloat) import System.Posix.Internals (c_unlink) @@ -50,7 +49,8 @@ import Test.Tasty.QuickCheck ( Arbitrary(..), oneof, choose, listOf, elements , counterexample, ioProperty, Property, testProperty , (===), (.&&.), conjoin, forAll, forAllShrink - , UnicodeString(..), NonNegative(..) + , UnicodeString(..), NonNegative(..), Positive(..) + , mapSize, (==>) ) import QuickCheckUtils @@ -70,7 +70,8 @@ tests = testsASCII ++ testsFloating ++ testsChar8 ++ - testsUtf8 + testsUtf8 ++ + [testLaziness] ------------------------------------------------------------------------------ @@ -981,3 +982,44 @@ testsUtf8 = [ testBuilderConstr "charUtf8" charUtf8_list charUtf8 , testBuilderConstr "stringUtf8" (foldMap charUtf8_list) stringUtf8 ] + +testLaziness :: TestTree +testLaziness = testGroup "Builder laziness" + [ testProperty "byteString" $ mapSize (+ 10) $ + \bs (Positive chunkSize) -> + let strategy = safeStrategy chunkSize chunkSize + lbs = toLazyByteStringWith strategy L.empty + (byteString bs <> tooStrictErr) + in (S.length bs > max chunkSize 8) ==> L.head lbs == S.head bs + , testProperty "byteStringCopy" $ mapSize (+ 10) $ + \bs (Positive chunkSize) -> + let strategy = safeStrategy chunkSize chunkSize + lbs = toLazyByteStringWith strategy L.empty + (byteStringCopy bs <> tooStrictErr) + in (S.length bs > max chunkSize 8) ==> L.head lbs == S.head bs + , testProperty "byteStringInsert" $ mapSize (+ 10) $ + \bs (Positive chunkSize) -> + let strategy = safeStrategy chunkSize chunkSize + lbs = toLazyByteStringWith strategy L.empty + (byteStringInsert bs <> tooStrictErr) + in L.take (fromIntegral @Int @Int64 (S.length bs)) lbs + == L.fromStrict bs + , testProperty "lazyByteString" $ mapSize (+ 10) $ + \bs (Positive chunkSize) -> + let strategy = safeStrategy chunkSize chunkSize + lbs = toLazyByteStringWith strategy L.empty + (lazyByteString bs <> tooStrictErr) + in (L.length bs > fromIntegral @Int @Int64 (max chunkSize 8)) + ==> L.head lbs == L.head bs + , testProperty "shortByteString" $ mapSize (+ 10) $ + \bs (Positive chunkSize) -> + let strategy = safeStrategy chunkSize chunkSize + lbs = toLazyByteStringWith strategy L.empty + (shortByteString bs <> tooStrictErr) + in (Sh.length bs > max chunkSize 8) ==> L.head lbs == Sh.head bs + , testProperty "flush" $ \recipe -> let + !(b, toLBS) = recipeComponents recipe + !lbs1 = toLazyByteString b + !lbs2 = L.take (L.length lbs1) (toLBS $ b <> flush <> tooStrictErr) + in lbs1 == lbs2 + ]