summaryrefslogtreecommitdiff
path: root/libraries/base/tests/unicode003.hs
blob: db0bdf532305fe4a9c5235cd1e34c9458862b9d0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
{-# LANGUAGE BlockArguments, LambdaCase #-}

-------------------------------------------------------------------------------
-- |
-- Description: Test general caterory and simple case mappings for all characters.
--
-- Checking the exact value of these properties for each character would result
-- in a CSV-like file of several MiB. Thus we perform only a /sanity check/
-- using the following method:
--
-- * Group characters in chunks of 100 elements;
-- * For each chunk, compute the hash of each property and create an entry;
-- * An entry is in the format:
--   @<codepoint of first item in chunk>,<hash of prop 1>,<hash of prop 2>,…@
--   (see 'header' for more details).
--
-- Reference test output (@unicode003.stdout@) was initially generated with:
--
-- > runghc-9.2.2 unicode003.hs > unicode003.stdout
--
-- and compared to Python 3.11.0 using @ucd2haskell/tests/check_test_data.py@
--
-------------------------------------------------------------------------------

module Main where

import Data.Bits
import Data.Char
import Data.Foldable
import Data.Int
import Data.List (unfoldr)
import Numeric

-- | File header
header :: String
header = "First Char,GeneralCategory Digest,Case Mappings Digest"

type Chunk a = [a]

-- | Break a list into chunks of given maximum size.
chunksOf :: Int -> [a] -> [Chunk a]
chunksOf n = unfoldr \case
  [] -> Nothing
  l  -> Just (splitAt n l)

type Salt   = Int32
type Digest = Int32

-- [NOTE] adapted from “hashable” package (see: Data.Hashable.LowLevel).
-- Use salt 16777619 for 32 bits and 1099511628211 for 64 bits.
-- | Compute the hash of an integer given a salt.
hashInt :: Salt -> Digest -> Salt
hashInt s x = (s * 16777619) `xor` x

-- | Compute the hash of 'generalCategory'.
generalCategoryHash :: Char -> Digest
generalCategoryHash = fromIntegral . fromEnum . generalCategory

-- | Compute the hash of a character’s case mappings
-- ('toLower', 'toUpper', 'toTitle').
caseMappingsHash :: Char -> Digest
caseMappingsHash c = fromIntegral (ord (toLower c))
           `hashInt` fromIntegral (ord (toUpper c))
           `hashInt` fromIntegral (ord (toTitle c))

-- | Compute the hash of a character with a given conversion function and salt.
hashUsing :: (Char -> Digest) -> Salt -> Char -> Digest
hashUsing f s = hashInt s . f

-- | Make a CSV entry for a chunk.
mkEntry :: Chunk Char -> String
mkEntry []       = mempty
mkEntry cs@(c:_) =
  -- Compute chunk’s hash by chaining hash computations
  let gcDigest = foldl' (hashUsing generalCategoryHash) (generalCategoryHash c) cs
      cmDigest = foldl' (hashUsing caseMappingsHash)    (caseMappingsHash    c) cs
  -- Create CSV line
  in showHex (ord c) (',' : shows gcDigest (',' : show cmDigest))

main :: IO ()
main = do
  putStrLn header
  traverse_ (putStrLn . mkEntry) (chunksOf 100 [minBound..maxBound])