diff options
author | Sylvain Henry <sylvain@haskus.fr> | 2019-02-11 17:40:00 +0100 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2019-02-14 02:29:54 -0500 |
commit | 1d9a1d9fb8fe0a1fea2c44c4246f102ff3e1f3a3 (patch) | |
tree | 4abf3da5c8a8d5fdd88903613d2ce42346e4943f | |
parent | 0f1eb88c93143359fa671bb72aceebc299c87a95 (diff) | |
download | haskell-1d9a1d9fb8fe0a1fea2c44c4246f102ff3e1f3a3.tar.gz |
NCG: fast compilation of very large strings (#16190)
This patch adds an optimization to the NCG: for large strings
(threshold configurable via the -fbinary-blob-threshold=NNN flag), instead
of printing `.asciz "..."` in the generated ASM source, we print
`.incbin "tmpXXX.dat"` and we dump the contents of the string into a
temporary "tmpXXX.dat" file.
See Note [Embedding large binary blobs] in compiler/nativeGen/PprBase.hs for more details.
-rw-r--r-- | compiler/main/DynFlags.hs | 6 | ||||
-rw-r--r-- | compiler/nativeGen/PPC/Ppr.hs | 3 | ||||
-rw-r--r-- | compiler/nativeGen/PprBase.hs | 48 | ||||
-rw-r--r-- | compiler/nativeGen/SPARC/Ppr.hs | 8 | ||||
-rw-r--r-- | compiler/nativeGen/X86/Ppr.hs | 4 | ||||
-rw-r--r-- | docs/users_guide/using-optimisation.rst | 16 |
6 files changed, 73 insertions, 12 deletions
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs index a9b4a03962..858d174c17 100644 --- a/compiler/main/DynFlags.hs +++ b/compiler/main/DynFlags.hs @@ -911,6 +911,9 @@ data DynFlags = DynFlags { specConstrCount :: Maybe Int, -- ^ Max number of specialisations for any one function specConstrRecursive :: Int, -- ^ Max number of specialisations for recursive types -- Not optional; otherwise ForceSpecConstr can diverge. + binBlobThreshold :: Word, -- ^ Binary literals (e.g. strings) whose size is above + -- this threshold will be dumped in a binary file + -- by the assembler code generator (0 to disable) liberateCaseThreshold :: Maybe Int, -- ^ Threshold for LiberateCase floatLamArgs :: Maybe Int, -- ^ Arg count for lambda floating -- See CoreMonad.FloatOutSwitches @@ -1884,6 +1887,7 @@ defaultDynFlags mySettings (myLlvmTargets, myLlvmPasses) = maxPmCheckIterations = 2000000, ruleCheck = Nothing, inlineCheck = Nothing, + binBlobThreshold = 500000, -- 500K is a good default (see #16190) maxRelevantBinds = Just 6, maxValidHoleFits = Just 6, maxRefHoleFits = Just 6, @@ -3526,6 +3530,8 @@ dynamic_flags_deps = [ setOptLevel (mb_n `orElse` 1))) -- If the number is missing, use 1 + , make_ord_flag defFlag "fbinary-blob-threshold" + (intSuffix (\n d -> d { binBlobThreshold = fromIntegral n })) , make_ord_flag defFlag "fmax-relevant-binds" (intSuffix (\n d -> d { maxRelevantBinds = Just n })) diff --git a/compiler/nativeGen/PPC/Ppr.hs b/compiler/nativeGen/PPC/Ppr.hs index 47ab07b633..c54d4430eb 100644 --- a/compiler/nativeGen/PPC/Ppr.hs +++ b/compiler/nativeGen/PPC/Ppr.hs @@ -125,8 +125,7 @@ pprDatas :: CmmStatics -> SDoc pprDatas (Statics lbl dats) = vcat (pprLabel lbl : map pprData dats) pprData :: CmmStatic -> SDoc -pprData (CmmString str) - = text "\t.string" <+> doubleQuotes (pprASCII str) +pprData (CmmString str) = pprBytes str pprData (CmmUninitialised bytes) = text ".space " <> int bytes pprData (CmmStaticLit lit) = pprDataItem lit diff --git 
a/compiler/nativeGen/PprBase.hs b/compiler/nativeGen/PprBase.hs index afd16f8178..1f068c261b 100644 --- a/compiler/nativeGen/PprBase.hs +++ b/compiler/nativeGen/PprBase.hs @@ -14,6 +14,7 @@ module PprBase ( floatToBytes, doubleToBytes, pprASCII, + pprBytes, pprSectionHeader ) @@ -28,6 +29,7 @@ import DynFlags import FastString import Outputable import Platform +import FileCleanup import qualified Data.Array.Unsafe as U ( castSTUArray ) import Data.Array.ST @@ -40,6 +42,7 @@ import Data.ByteString (ByteString) import qualified Data.ByteString as BS import GHC.Exts import GHC.Word +import System.IO.Unsafe @@ -125,6 +128,51 @@ pprASCII str ] ord0 = 0x30 -- = ord '0' +-- | Pretty print binary data. +-- +-- Use either the ".string" directive or a ".incbin" directive. +-- See Note [Embedding large binary blobs] +-- +-- A NULL byte is added after the binary data. +-- +pprBytes :: ByteString -> SDoc +pprBytes bs = sdocWithDynFlags $ \dflags -> + if binBlobThreshold dflags == 0 + || fromIntegral (BS.length bs) <= binBlobThreshold dflags + then text "\t.string " <> doubleQuotes (pprASCII bs) + else unsafePerformIO $ do + bFile <- newTempName dflags TFL_CurrentModule ".dat" + BS.writeFile bFile bs + return $ text "\t.incbin \"" <> text bFile <> text "\"\n\t.byte 0" + +{- +Note [Embedding large binary blobs] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To embed a blob of binary data (e.g. an UTF-8 encoded string) into the generated +code object, we have several options: + + 1. Generate a ".byte" directive for each byte. This is what was done in the past + (see Note [Pretty print ASCII when AsmCodeGen]). + + 2. Generate a single ".string"/".asciz" directive for the whole sequence of + bytes. Bytes in the ASCII printable range are rendered as characters and + other values are escaped (e.g., "\t", "\077", etc.). + + 3. Create a temporary file into which we dump the binary data and generate a + single ".incbin" directive. 
The assembler will include the binary file for + us in the generated output object. + +Now the code generator uses either (2) or (3), depending on the binary blob +size. Using (3) for small blobs adds too much overhead (see benchmark results +in #16190), so we only do it when the size is above a threshold (500K at the +time of writing). + +The threshold is configurable via the `-fbinary-blob-threshold` flag. + +-} + + {- Note [Pretty print ASCII when AsmCodeGen] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/compiler/nativeGen/SPARC/Ppr.hs b/compiler/nativeGen/SPARC/Ppr.hs index 705fc31153..42ba13def4 100644 --- a/compiler/nativeGen/SPARC/Ppr.hs +++ b/compiler/nativeGen/SPARC/Ppr.hs @@ -49,8 +49,6 @@ import Unique ( pprUniqueAlways ) import Outputable import Platform import FastString -import Data.Word -import qualified Data.ByteString as BS -- ----------------------------------------------------------------------------- -- Printing this stuff out @@ -110,11 +108,7 @@ pprDatas :: CmmStatics -> SDoc pprDatas (Statics lbl dats) = vcat (pprLabel lbl : map pprData dats) pprData :: CmmStatic -> SDoc -pprData (CmmString str) - = vcat (map do1 (BS.unpack str)) $$ do1 0 - where - do1 :: Word8 -> SDoc - do1 w = text "\t.byte\t" <> int (fromIntegral w) +pprData (CmmString str) = pprBytes str pprData (CmmUninitialised bytes) = text ".skip " <> int bytes pprData (CmmStaticLit lit) = pprDataItem lit diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs index 075bb26337..83356758af 100644 --- a/compiler/nativeGen/X86/Ppr.hs +++ b/compiler/nativeGen/X86/Ppr.hs @@ -47,7 +47,6 @@ import FastString import Outputable import Data.Word - import Data.Bits -- ----------------------------------------------------------------------------- @@ -154,8 +153,7 @@ pprDatas (align, (Statics lbl dats)) = vcat (pprAlign align : pprLabel lbl : map pprData dats) pprData :: CmmStatic -> SDoc -pprData (CmmString str) - = ptext (sLit "\t.asciz ") <> doubleQuotes (pprASCII 
str) +pprData (CmmString str) = pprBytes str pprData (CmmUninitialised bytes) = sdocWithPlatform $ \platform -> diff --git a/docs/users_guide/using-optimisation.rst b/docs/users_guide/using-optimisation.rst index cacc55325e..d6240bc5cb 100644 --- a/docs/users_guide/using-optimisation.rst +++ b/docs/users_guide/using-optimisation.rst @@ -1238,3 +1238,19 @@ by saying ``-fno-wombat``. if a function definition will be inlined *at a call site*. The other option determines if a function definition will be kept around at all for potential inlining. + +.. ghc-flag:: -fbinary-blob-threshold=⟨n⟩ + :shortdesc: *default: 500K.* Tweak assembly generator for binary blobs. + :type: dynamic + :category: optimization + + :default: 500000 + + The native code-generator can either dump binary blobs (e.g. string + literals) into the assembly file (by using ".asciz" or ".string" assembler + directives) or it can dump them as binary data into a temporary file which + is then included by the assembler (using the ".incbin" assembler directive). + + This flag sets the size (in bytes) threshold above which the second approach + is used. You can disable the second approach entirely by setting the + threshold to 0. |