summaryrefslogtreecommitdiff
path: root/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
blob: c90e5c2fc1c4945bb25cbba9d384d22dd1b3ab2c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: c++03, c++11, c++14, c++17
// UNSUPPORTED: libcpp-has-no-incomplete-format

// This version runs the test when the platform has Unicode support.
// UNSUPPORTED: libcpp-has-no-unicode

// TODO FMT Investigate Windows issues.
// UNSUPPORTED: msvc, target={{.+}}-windows-gnu

// TODO FMT This test should not require std::to_chars(floating-point)
// XFAIL: availability-fp_to_chars-missing

// <format>

// Tests the Unicode width support of the standard format specifiers.
// It tests [format.string.std]/8 - 11:
// - Properly determining the estimated width of a Unicode string.
// - Properly truncating to the wanted maximum width.

// More specific extended grapheme cluster boundary rules are tested in
// test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp;
// that test is based on test data provided by the Unicode Consortium.

#include <format>
#include <cassert>
#include <vector>

#include "make_string.h"
#include "test_macros.h"
#include "string_literal.h"
#include "test_format_string.h"

#ifndef TEST_HAS_NO_LOCALIZATION
#  include <iostream>
#  include <type_traits>
#endif

#define SV(S) MAKE_STRING_VIEW(CharT, S)

// Formats `args` according to `fmt` and asserts that the result equals
// `expected`.
//
// - expected: the exact string std::format is required to produce.
// - fmt:      the format string, wrapped in the test suite's
//             test_format_string type.
// - args:     the formatting arguments, perfectly forwarded to std::format.
//
// When localization support is available and CharT is char, a mismatch is
// written to std::cerr before the assertion fires so the failure shows the
// format string together with the expected and actual output.
template <class CharT, class... Args>
void check(std::basic_string_view<CharT> expected, test_format_string<CharT, Args...> fmt, Args&&... args) {
  std::basic_string<CharT> out = std::format(fmt, std::forward<Args>(args)...);
#ifndef TEST_HAS_NO_LOCALIZATION
  // std::cerr is a narrow stream, so the diagnostic is only emitted for char.
  if constexpr (std::is_same_v<CharT, char>) {
    if (out != expected)
      std::cerr << "\nFormat string   " << fmt.get() << "\nExpected output " << expected << "\nActual output   " << out
                << '\n';
  }
#endif
  assert(out == expected);
}

// Tests centering a single code point with '*' as fill character.
//
// Every expected string has exactly one '*' on each side of the code point,
// so the field width in the format string is the code point's estimated
// column width plus two: {:*^3} for one-column code points and {:*^4} for
// two-column code points. The cases are grouped by the number of bytes the
// code point needs in UTF-8 and, within a group, probe the first and last
// code point of the ranges under test.
template <class CharT>
static void test_single_code_point_fill() {
  //*** 1-byte code points ***
  check(SV("* *"), SV("{:*^3}"), SV(" "));
  check(SV("*~*"), SV("{:*^3}"), SV("~"));

  //*** 2-byte code points ***
  check(SV("*\u00a1*"), SV("{:*^3}"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
  check(SV("*\u07ff*"), SV("{:*^3}"), SV("\u07ff")); // NKO TAMAN SIGN

  //*** 3-byte code points ***
  check(SV("*\u0800*"), SV("{:*^3}"), SV("\u0800")); // SAMARITAN LETTER ALAF
  check(SV("*\ufffd*"), SV("{:*^3}"), SV("\ufffd")); // REPLACEMENT CHARACTER

  // 2 column ranges
  check(SV("*\u1100*"), SV("{:*^4}"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
  check(SV("*\u115f*"), SV("{:*^4}"), SV("\u115f")); // HANGUL CHOSEONG FILLER

  check(SV("*\u2329*"), SV("{:*^4}"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
  check(SV("*\u232a*"), SV("{:*^4}"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET

  check(SV("*\u2e80*"), SV("{:*^4}"), SV("\u2e80")); // CJK RADICAL REPEAT
  check(SV("*\u303e*"), SV("{:*^4}"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR

  check(SV("*\u3041*"), SV("{:*^4}"), SV("\u3041")); // U+3041 HIRAGANA LETTER SMALL A
  check(SV("*\ua4d0*"), SV("{:*^3}"), SV("\ua4d0")); // U+A4D0 LISU LETTER BA (one column)

  check(SV("*\uac00*"), SV("{:*^4}"), SV("\uac00")); // <Hangul Syllable, First>
  check(SV("*\ud7a3*"), SV("{:*^4}"), SV("\ud7a3")); // Hangul Syllable Hih

  check(SV("*\uf900*"), SV("{:*^4}"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
  check(SV("*\ufaff*"), SV("{:*^4}"), SV("\ufaff")); // U+FAFF, the code point before U+FB00 LATIN SMALL LIGATURE FF

  check(SV("*\ufe10*"), SV("{:*^4}"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
  check(SV("*\ufe19*"), SV("{:*^4}"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS

  check(SV("*\ufe30*"), SV("{:*^4}"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
  check(SV("*\ufe70*"), SV("{:*^3}"), SV("\ufe70")); // U+FE70 ARABIC FATHATAN ISOLATED FORM (one column)

  check(SV("*\uff01*"), SV("{:*^4}"), SV("\uff01")); // U+FF01 FULLWIDTH EXCLAMATION MARK
  check(SV("*\uff60*"), SV("{:*^4}"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS

  check(SV("*\uffe0*"), SV("{:*^4}"), SV("\uffe0")); // FULLWIDTH CENT SIGN
  check(SV("*\uffe6*"), SV("{:*^4}"), SV("\uffe6")); // FULLWIDTH WON SIGN

  //*** 4-byte code points ***
  check(SV("*\U00010000*"), SV("{:*^3}"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
  check(SV("*\U0010FFFF*"), SV("{:*^3}"), SV("\U0010FFFF")); // Undefined Character

  // 2 column ranges
  check(SV("*\U0001f300*"), SV("{:*^4}"), SV("\U0001f300")); // CYCLONE
  check(SV("*\U0001f64f*"), SV("{:*^4}"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
  check(SV("*\U0001f900*"), SV("{:*^4}"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
  check(SV("*\U0001f9ff*"), SV("{:*^4}"), SV("\U0001f9ff")); // NAZAR AMULET
  check(SV("*\U00020000*"), SV("{:*^4}"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
  check(SV("*\U0002fffd*"), SV("{:*^4}"), SV("\U0002fffd")); // Undefined Character
  check(SV("*\U00030000*"), SV("{:*^4}"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
  check(SV("*\U0003fffd*"), SV("{:*^4}"), SV("\U0003fffd")); // Undefined Character
}

// Tests truncating a single code point to a precision of one column while
// centering it in a three-column field filled with '*'.
//
// One column output is unaffected: the code point is kept and surrounded by
// one fill character on each side.
// Two column output does not fit in the precision and is removed entirely,
// thus the result consists of three fill characters only.
template <class CharT>
static void test_single_code_point_truncate() {
  //*** 1-byte code points ***
  check(SV("* *"), SV("{:*^3.1}"), SV(" "));
  check(SV("*~*"), SV("{:*^3.1}"), SV("~"));

  //*** 2-byte code points ***
  check(SV("*\u00a1*"), SV("{:*^3.1}"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
  check(SV("*\u07ff*"), SV("{:*^3.1}"), SV("\u07ff")); // NKO TAMAN SIGN

  //*** 3-byte code points ***
  check(SV("*\u0800*"), SV("{:*^3.1}"), SV("\u0800")); // SAMARITAN LETTER ALAF
  check(SV("*\ufffd*"), SV("{:*^3.1}"), SV("\ufffd")); // REPLACEMENT CHARACTER

  // 2 column ranges
  check(SV("***"), SV("{:*^3.1}"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
  check(SV("***"), SV("{:*^3.1}"), SV("\u115f")); // HANGUL CHOSEONG FILLER

  check(SV("***"), SV("{:*^3.1}"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
  check(SV("***"), SV("{:*^3.1}"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET

  check(SV("***"), SV("{:*^3.1}"), SV("\u2e80")); // CJK RADICAL REPEAT
  check(SV("***"), SV("{:*^3.1}"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR

  check(SV("***"), SV("{:*^3.1}"), SV("\u3041")); // U+3041 HIRAGANA LETTER SMALL A
  check(SV("*\ua4d0*"), SV("{:*^3.1}"), SV("\ua4d0")); // U+A4D0 LISU LETTER BA (one column, kept)

  check(SV("***"), SV("{:*^3.1}"), SV("\uac00")); // <Hangul Syllable, First>
  check(SV("***"), SV("{:*^3.1}"), SV("\ud7a3")); // Hangul Syllable Hih

  check(SV("***"), SV("{:*^3.1}"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
  check(SV("***"), SV("{:*^3.1}"), SV("\ufaff")); // U+FAFF, the code point before U+FB00 LATIN SMALL LIGATURE FF

  check(SV("***"), SV("{:*^3.1}"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
  check(SV("***"), SV("{:*^3.1}"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS

  check(SV("***"), SV("{:*^3.1}"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
  check(SV("*\ufe70*"), SV("{:*^3.1}"), SV("\ufe70")); // U+FE70 ARABIC FATHATAN ISOLATED FORM (one column, kept)

  check(SV("***"), SV("{:*^3.1}"), SV("\uff01"));      // U+FF01 FULLWIDTH EXCLAMATION MARK
  check(SV("***"), SV("{:*^3.1}"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS

  check(SV("***"), SV("{:*^3.1}"), SV("\uffe0")); // FULLWIDTH CENT SIGN
  check(SV("***"), SV("{:*^3.1}"), SV("\uffe6")); // FULLWIDTH WON SIGN

  //*** 4-byte code points ***
  check(SV("*\U00010000*"), SV("{:*^3.1}"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
  check(SV("*\U0010FFFF*"), SV("{:*^3.1}"), SV("\U0010FFFF")); // Undefined Character

  // 2 column ranges
  check(SV("***"), SV("{:*^3.1}"), SV("\U0001f300")); // CYCLONE
  check(SV("***"), SV("{:*^3.1}"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
  check(SV("***"), SV("{:*^3.1}"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
  check(SV("***"), SV("{:*^3.1}"), SV("\U0001f9ff")); // NAZAR AMULET
  check(SV("***"), SV("{:*^3.1}"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
  check(SV("***"), SV("{:*^3.1}"), SV("\U0002fffd")); // Undefined Character
  check(SV("***"), SV("{:*^3.1}"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
  check(SV("***"), SV("{:*^3.1}"), SV("\U0003fffd")); // Undefined Character
}

// Tests the examples used in the paper P1868.
//
// The same set of strings is formatted three times, always centered with '*'
// as fill character:
// - without a precision,
// - truncated to a precision of 1 column, and
// - truncated to a precision of 2 columns.
// Multi code point strings (a letter followed by a combining accent, and the
// family emoji joined with U+200D) are expected to be kept or dropped as a
// whole, never split.
template <class CharT>
static void test_P1868() {
  // Fill
  check(SV("*\u0041*"), SV("{:*^3}"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
  check(SV("*\u00c1*"), SV("{:*^3}"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
  check(SV("*\u0041\u0301*"),
        SV("{:*^3}"),
        SV("\u0041\u0301"));                         // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
  check(SV("*\u0132*"), SV("{:*^3}"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
  check(SV("*\u0394*"), SV("{:*^3}"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }

  check(SV("*\u0429*"), SV("{:*^3}"), SV("\u0429"));         // { CYRILLIC CAPITAL LETTER SHCHA }
  check(SV("*\u05d0*"), SV("{:*^3}"), SV("\u05d0"));         // { HEBREW LETTER ALEF }
  check(SV("*\u0634*"), SV("{:*^3}"), SV("\u0634"));         // { ARABIC LETTER SHEEN }
  check(SV("*\u3009*"), SV("{:*^4}"), SV("\u3009"));         // { RIGHT ANGLE BRACKET }
  check(SV("*\u754c*"), SV("{:*^4}"), SV("\u754c"));         // { CJK Unified Ideograph-754C }
  check(SV("*\U0001f921*"), SV("{:*^4}"), SV("\U0001f921")); // { CLOWN FACE }
  check(SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
        SV("{:*^4}"),
        SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }

  // Truncate to 1 column: 1 column grapheme clusters are kept together.
  // 2 column grapheme clusters do not fit and are removed, leaving only fill.
  check(SV("*\u0041*"), SV("{:*^3.1}"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
  check(SV("*\u00c1*"), SV("{:*^3.1}"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
  check(SV("*\u0041\u0301*"),
        SV("{:*^3.1}"),
        SV("\u0041\u0301"));                           // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
  check(SV("*\u0132*"), SV("{:*^3.1}"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
  check(SV("*\u0394*"), SV("{:*^3.1}"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }

  check(SV("*\u0429*"), SV("{:*^3.1}"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
  check(SV("*\u05d0*"), SV("{:*^3.1}"), SV("\u05d0")); // { HEBREW LETTER ALEF }
  check(SV("*\u0634*"), SV("{:*^3.1}"), SV("\u0634")); // { ARABIC LETTER SHEEN }
  check(SV("***"), SV("{:*^3.1}"), SV("\u3009"));      // { RIGHT ANGLE BRACKET }
  check(SV("***"), SV("{:*^3.1}"), SV("\u754c"));      // { CJK Unified Ideograph-754C }
  check(SV("***"), SV("{:*^3.1}"), SV("\U0001f921"));  // { CLOWN FACE }
  check(SV("***"),
        SV("{:*^3.1}"),
        SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }

  // Truncate to 2 column: 2 column grapheme clusters are kept together.
  check(SV("*\u0041*"), SV("{:*^3.2}"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
  check(SV("*\u00c1*"), SV("{:*^3.2}"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
  check(SV("*\u0041\u0301*"),
        SV("{:*^3.2}"),
        SV("\u0041\u0301"));                           // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
  check(SV("*\u0132*"), SV("{:*^3.2}"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
  check(SV("*\u0394*"), SV("{:*^3.2}"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }

  check(SV("*\u0429*"), SV("{:*^3.2}"), SV("\u0429"));         // { CYRILLIC CAPITAL LETTER SHCHA }
  check(SV("*\u05d0*"), SV("{:*^3.2}"), SV("\u05d0"));         // { HEBREW LETTER ALEF }
  check(SV("*\u0634*"), SV("{:*^3.2}"), SV("\u0634"));         // { ARABIC LETTER SHEEN }
  check(SV("*\u3009*"), SV("{:*^4.2}"), SV("\u3009"));         // { RIGHT ANGLE BRACKET }
  check(SV("*\u754c*"), SV("{:*^4.2}"), SV("\u754c"));         // { CJK Unified Ideograph-754C }
  check(SV("*\U0001f921*"), SV("{:*^4.2}"), SV("\U0001f921")); // { CLOWN FACE }
  check(SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
        SV("{:*^4.2}"),
        SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
}

#ifdef _LIBCPP_VERSION
// Tests the libc++ specific behaviour for malformed UTF-sequences. The
// Standard doesn't specify how to handle this.
//
// Only the 1-byte CharT (UTF-8) case has tests. In every check the field is
// exactly two columns wider than the number of bytes in the argument and the
// expected output has one '*' on each side, i.e. every byte of the input,
// including each malformed byte, is expected to occupy one column and to be
// copied to the output unchanged.
//
// The bytes used to corrupt the input are:
// - 0x8f: a UTF-8 continuation byte without a preceding lead byte,
// - 0xcf: the lead byte of a 2-byte sequence,
// - 0xef: the lead byte of a 3-byte sequence,
// - 0xff: a byte that is never valid in UTF-8,
// - 0xf5: used after a lead byte where a continuation byte is required.
template <class CharT>
static void test_malformed_code_point() {
  if constexpr (sizeof(CharT) == 1) {
    // Malformed at end.
    check(SV("*ZZZZ\x8f*"), SV("{:*^7}"), SV("ZZZZ\x8f"));
    check(SV("*ZZZZ\xcf*"), SV("{:*^7}"), SV("ZZZZ\xcf"));
    check(SV("*ZZZZ\xef*"), SV("{:*^7}"), SV("ZZZZ\xef"));
    check(SV("*ZZZZ\xff*"), SV("{:*^7}"), SV("ZZZZ\xff"));

    // Malformed in middle, no continuation
    // The malformed byte is followed by one to four valid 1-byte code points.
    check(SV("*ZZZZ\x8fZ*"), SV("{:*^8}"), SV("ZZZZ\x8fZ"));
    check(SV("*ZZZZ\xcfZ*"), SV("{:*^8}"), SV("ZZZZ\xcfZ"));
    check(SV("*ZZZZ\xefZ*"), SV("{:*^8}"), SV("ZZZZ\xefZ"));
    check(SV("*ZZZZ\xffZ*"), SV("{:*^8}"), SV("ZZZZ\xffZ"));

    check(SV("*ZZZZ\x8fZZ*"), SV("{:*^9}"), SV("ZZZZ\x8fZZ"));
    check(SV("*ZZZZ\xcfZZ*"), SV("{:*^9}"), SV("ZZZZ\xcfZZ"));
    check(SV("*ZZZZ\xefZZ*"), SV("{:*^9}"), SV("ZZZZ\xefZZ"));
    check(SV("*ZZZZ\xffZZ*"), SV("{:*^9}"), SV("ZZZZ\xffZZ"));

    check(SV("*ZZZZ\x8fZZZ*"), SV("{:*^10}"), SV("ZZZZ\x8fZZZ"));
    check(SV("*ZZZZ\xcfZZZ*"), SV("{:*^10}"), SV("ZZZZ\xcfZZZ"));
    check(SV("*ZZZZ\xefZZZ*"), SV("{:*^10}"), SV("ZZZZ\xefZZZ"));
    check(SV("*ZZZZ\xffZZZ*"), SV("{:*^10}"), SV("ZZZZ\xffZZZ"));

    check(SV("*ZZZZ\x8fZZZZ*"), SV("{:*^11}"), SV("ZZZZ\x8fZZZZ"));
    check(SV("*ZZZZ\xcfZZZZ*"), SV("{:*^11}"), SV("ZZZZ\xcfZZZZ"));
    check(SV("*ZZZZ\xefZZZZ*"), SV("{:*^11}"), SV("ZZZZ\xefZZZZ"));
    check(SV("*ZZZZ\xffZZZZ*"), SV("{:*^11}"), SV("ZZZZ\xffZZZZ"));

    // Premature end.
    // The sequence is started but broken off by a byte that is not a
    // continuation byte, both at the end of the input and in the middle.
    check(SV("*ZZZZ\xef\xf5*"), SV("{:*^8}"), SV("ZZZZ\xef\xf5"));
    check(SV("*ZZZZ\xef\xf5ZZZZ*"), SV("{:*^12}"), SV("ZZZZ\xef\xf5ZZZZ"));
    check(SV("*ZZZZ\xff\xf5\xf5*"), SV("{:*^9}"), SV("ZZZZ\xff\xf5\xf5"));
    check(SV("*ZZZZ\xff\xf5\xf5ZZZZ*"), SV("{:*^13}"), SV("ZZZZ\xff\xf5\xf5ZZZZ"));

  } else if constexpr (sizeof(CharT) == 2) {
    // TODO FMT Add these tests.
  }
  // UTF-32 doesn't combine characters, thus no corruption tests.
}
#endif

// Runs all test groups in this file for the character type CharT.
template <class CharT>
static void test() {
  test_single_code_point_fill<CharT>();
  test_single_code_point_truncate<CharT>();
  test_P1868<CharT>();

  // The handling of malformed input is only checked when the library under
  // test is libc++.
#ifdef _LIBCPP_VERSION
  test_malformed_code_point<CharT>();
#endif
}

// Test entry point: runs the tests for char and, unless the configuration
// has no wide character support, for wchar_t.
int main(int, char**) {
  test<char>();

#ifndef TEST_HAS_NO_WIDE_CHARACTERS
  test<wchar_t>();
#endif

  return 0;
}