tests/multibyte-white-space


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

#! /bin/sh
# Test whether \s matches SP and UTF-8 multi-byte white space characters.
#
# Copyright (C) 2013-2016 Free Software Foundation, Inc.
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.

# Load the shared test helpers; srcdir falls back to the current directory
# when the caller has not set it.  The helpers this script calls
# (path_prepend_, require_en_utf8_locale_, hex_printf_, warn_, compare, Exit)
# are not defined in this file, so they are expected to come from init.sh.
. "${srcdir=.}/init.sh"
path_prepend_ ../src

# NOTE(review): presumably skips the test when no en_US.UTF-8 locale is
# installed -- confirm against init.sh.
require_en_utf8_locale_

# Export the UTF-8 locale so that every command started below inherits it.
LC_ALL=en_US.UTF-8; export LC_ALL

# It would have been nice to be able to use all UTF-8 characters
# with the Unicode WSpace=Y character property
# (https://en.wikipedia.org/wiki/Whitespace_character), but that
# would currently cause distracting failures everywhere I've tried.

# FIXME: including any of the following in the list below would
# make this test fail on Fedora 19/glibc-2.17-18.fc19.
# Restore them to the list once it is fixed.
# This variable is not referenced anywhere else in this script; it only
# records the excluded characters, in the same "U+XXXX NAME: hex bytes"
# layout as the list that is actually tested.
these_fail_with_glibc='
U+00A0 NO-BREAK SPACE:            c2 a0
U+2007 FIGURE SPACE:              e2 80 87
U+200B ZERO WIDTH SPACE:          e2 80 8b
U+202F NARROW NO-BREAK SPACE:     e2 80 af
'
# Also a record only (not referenced anywhere else in this script).  Judging
# by its name, it lists characters excluded for reasons other than the glibc
# problem above.
# NOTE(review): a lone 85 byte is not valid UTF-8; U+0085 encodes as c2 85.
# Confirm which byte sequence was actually tried before restoring this entry.
fail_with_other='
U+000A Line feed:                 0a
U+0085 Next line:                 85
'

# Build the list of characters to test: one word per character, with each
# byte spelled as \xHH.  The first sed expression drops the "U+XXXX NAME:"
# label; the second turns every run of spaces (including the padding after
# the colon) into a \x prefix, so "e1 9a 80" becomes \xe1\x9a\x80.
utf8_space_characters=$(
  sed -e 's/.*://' -e 's/ \{1,\}/\\x/g' <<'EOF'
U+0009 Horizontal Tab:            09
U+000B Vertical Tab:              0b
U+000C Form feed:                 0c
U+000D Carriage return:           0d
U+0020 SPACE:                     20
U+1680 OGHAM SPACE MARK:          e1 9a 80
U+2000 EN QUAD:                   e2 80 80
U+2001 EM QUAD:                   e2 80 81
U+2002 EN SPACE:                  e2 80 82
U+2003 EM SPACE:                  e2 80 83
U+2004 THREE-PER-EM SPACE:        e2 80 84
U+2005 FOUR-PER-EM SPACE:         e2 80 85
U+2006 SIX-PER-EM SPACE:          e2 80 86
U+2008 PUNCTUATION SPACE:         e2 80 88
U+2009 THIN SPACE:                e2 80 89
U+200A HAIR SPACE:                e2 80 8a
U+205F MEDIUM MATHEMATICAL SPACE: e2 81 9f
U+3000 IDEOGRAPHIC SPACE:         e3 80 80
EOF
)

# Becomes 1 as soon as any check fails; handed to Exit at the end of the script.
fail=0

# $utf8_space_characters is deliberately left unquoted so that it splits into
# one \xHH... word per character.
for hex in $utf8_space_characters; do
  # Whatever hex_printf_ emits for this word must match '^\s$'.
  # NOTE(review): hex_printf_ is assumed to print the bytes named by its
  # argument -- confirm against init.sh.
  if hex_printf_ "$hex" | grep -q '^\s$'; then
    :
  else
    warn_ "$hex FAILED to match \\s"
    fail=1
  fi

  # The same output must not match \S.  Only grep's "no match" status (1) is
  # accepted, so a grep error is reported as a failure as well.
  hex_printf_ "$hex" | grep -q '\S'
  case $? in
    1) ;;
    *) warn_ "$hex vs. \\S FAILED"; fail=1 ;;
  esac
done


# This is a separate test, only nominally related to \s.
# It is solely to get coverage of a code path (exercising dfa.c's
# match_mb_charset function) that would have otherwise been untouched.
# However, as of the change-set adding this new test, match_mb_charset
# is unreachable via grep.
#
# The input is a single NUL byte, which is neither an empty line nor a lone
# white-space character, so grep must select nothing: exit status 1 and an
# empty "out".  stderr is redirected into "out" as well, so any diagnostic
# also counts as unexpected output.
printf '\0' | grep -aE '^\s?$' > out 2>&1
test $? = 1 || fail=1
# A non-empty "out" must fail the test, so compare's status is checked rather
# than discarded.  NOTE(review): compare is not defined in this file; it is
# assumed to return non-zero when the two files differ -- confirm in init.sh.
compare /dev/null out || fail=1

Exit $fail