tests: revamp multibyte-white-space test to be more permissive

This test elicits too many failures. Whether a system has accurate Unicode "whitespace" attributes should not influence whether grep's test suite passes. In many cases you will now see a warning that some multibyte characters do not pass whitespace-related tests, but this test no longer fails. However, if you run this test on a modern enough system, it does require that \s and \S work properly with most of the listed characters. * tests/multibyte-white-space: Confirm that Fedora 24's locale tables still declare those four Unicode code points *not* whitespace. Honor a new column telling how to handle failure. Provide more information in each diagnostic. Reported by Nelson H. F. Beebe. https://bugs.gnu.org/24530
author: Jim Meyering <meyering@fb.com> 2016-09-24 15:56:08 -0700
committer: Jim Meyering <meyering@fb.com> 2016-09-24 17:23:38 -0700
commit: 7c4c69400c6ab456f1fdea4d0960c5a49cb040eb (patch)
tree: 0d00a8fe393d00b9fab310ab6aeca5c531dbaba8 /tests/multibyte-white-space
parent: 0b7fae5850db49a4238511d10a7f0ebb7067abaa (diff)
download: grep-7c4c69400c6ab456f1fdea4d0960c5a49cb040eb.tar.gz
1 files changed, 57 insertions, 34 deletions
diff --git a/tests/multibyte-white-space b/tests/multibyte-white-space
index c2a493b1..24b14cbf 100755
--- a/tests/multibyte-white-space
+++ b/tests/multibyte-white-space
@@ -18,51 +18,74 @@ export LC_ALL
 # with the Unicode WSpace=Y character property,
 # https://en.wikipedia.org/wiki/Whitespace_character, but that
 # would currently cause distracting failures everywhere I've tried.
+# Instead, I've listed each with an indicator column, telling what
+# this test should do if the system's locale/tools produce the
+# wrong answer.
 
-# FIXME: including any the following in the list below would
-# make this test fail on Fedora 19/glibc-2.17-18.fc19.
-# Restore them to the list once it is fixed.
-these_fail_with_glibc='
-U+00A0 NO-BREAK SPACE:            c2 a0
-U+2007 FIGURE SPACE:              e2 80 87
-U+200B ZERO WIDTH SPACE:          e2 80 8b
-U+202F NARROW NO-BREAK SPACE:     e2 80 af
-'
-fail_with_other='
-U+000A Line feed:                 0a
-U+0085 Next line:                 85
-'
+# The values in that column:
+# X required on all systems (fail if \s or \S fail to work as expected)
+# x required on "modern enough" systems
+# O optional: \s or \S misbehavior elicits a warning, but never failure
 
-utf8_space_characters=$(sed 's/.*://;s/  */\\x/g' <<\EOF
-U+0009 Horizontal Tab:            09
-U+000B Vertical Tab:              0b
-U+000C Form feed:                 0c
-U+000D Carriage return:           0d
-U+0020 SPACE:                     20
-U+1680 OGHAM SPACE MARK:          e1 9a 80
-U+2000 EN QUAD:                   e2 80 80
-U+2001 EM QUAD:                   e2 80 81
-U+2002 EN SPACE:                  e2 80 82
-U+2003 EM SPACE:                  e2 80 83
-U+2004 THREE-PER-EM SPACE:        e2 80 84
-U+2005 FOUR-PER-EM SPACE:         e2 80 85
-U+2006 SIX-PER-EM SPACE:          e2 80 86
-U+2008 PUNCTUATION SPACE:         e2 80 88
-U+2009 THIN SPACE:                e2 80 89
-U+200A HAIR SPACE:                e2 80 8a
-U+205F MEDIUM MATHEMATICAL SPACE: e2 81 9f
-U+3000 IDEOGRAPHIC SPACE:         e3 80 80
+utf8_space_characters=$(sed 's/.*: *//;s/  */\\x/g' <<\EOF
+U+0009 Horizontal Tab:            X 09
+U+000A Line feed:                 O 0a
+U+000B Vertical Tab:              X 0b
+U+000C Form feed:                 X 0c
+U+000D Carriage return:           X 0d
+U+0020 SPACE:                     X 20
+U+0085 Next line:                 O 85
+U+00A0 NO-BREAK SPACE:            O c2 a0
+U+1680 OGHAM SPACE MARK:          x e1 9a 80
+U+2000 EN QUAD:                   x e2 80 80
+U+2001 EM QUAD:                   x e2 80 81
+U+2002 EN SPACE:                  x e2 80 82
+U+2003 EM SPACE:                  x e2 80 83
+U+2004 THREE-PER-EM SPACE:        x e2 80 84
+U+2005 FOUR-PER-EM SPACE:         x e2 80 85
+U+2006 SIX-PER-EM SPACE:          x e2 80 86
+U+2007 FIGURE SPACE:              O e2 80 87
+U+2008 PUNCTUATION SPACE:         x e2 80 88
+U+2009 THIN SPACE:                x e2 80 89
+U+200A HAIR SPACE:                x e2 80 8a
+U+200B ZERO WIDTH SPACE:          O e2 80 8b
+U+202F NARROW NO-BREAK SPACE:     O e2 80 af
+U+205F MEDIUM MATHEMATICAL SPACE: x e2 81 9f
+U+3000 IDEOGRAPHIC SPACE:         x e3 80 80
 EOF
 )
 
 fail=0
 
+# On systems that are not "modern enough," simply warn when an "x"-marked
+# character is not classified as white space.  Too many systems
+# have inadequate UTF-8 tables in this respect, and that lack should not
+# discourage/confuse those who consider whether to install grep.
+
+# As for what constitutes "modern enough", I've arbitrarily started
+# with "Fedora 20 or newer".  Tested additions welcome.
+modern_enough=0
+grep -iE 'fedora release [2-9][0-9]+\b' /etc/redhat-release >/dev/null 2>&1 \
+  && modern_enough=1
+
 for i in $utf8_space_characters; do
+  eval 'fail() { fail=1; }'
+  m=ERROR
+  case $i in
+      X*) ;;
+      x*) test $modern_enough = 1 || { eval 'fail() { :; }'; m=warning; } ;;
+      O*) m=warning; eval 'fail() { :; }' ;;
+      *) warn_ "unexpected prefix: $i"; exit 1 ;;
+  esac
+
+  # Strip the prefix byte.
+  i=${i#?}
+
   hex_printf_ "$i" | grep -q '^\s$' \
-      || { warn_ "$i FAILED to match \\s"; fail=1; }
+      || { warn_ " $m: \\s failed to match $i in the $LC_ALL locale"; fail; }
   hex_printf_ "$i" | grep -q '\S'
   test $? = 1 \
-      || { warn_ "$i vs. \\S FAILED"; fail=1; }
+      || { warn_ " $m: \\S mistakenly matched $i in the $LC_ALL locale"; fail; }
 done
author	Jim Meyering <meyering@fb.com>	2016-09-24 15:56:08 -0700
committer	Jim Meyering <meyering@fb.com>	2016-09-24 17:23:38 -0700
commit	7c4c69400c6ab456f1fdea4d0960c5a49cb040eb (patch)
tree	0d00a8fe393d00b9fab310ab6aeca5c531dbaba8 /tests/multibyte-white-space
parent	0b7fae5850db49a4238511d10a7f0ebb7067abaa (diff)
download	grep-7c4c69400c6ab456f1fdea4d0960c5a49cb040eb.tar.gz