As the standard requires, take exactly 4 hex digits after the \u opener of a Universal Character Name, or exactly 8 hex digits after \U; reject smaller counts and don't consume more (https://port70.net/~nsz/c/c11/n1570.html#6.4.3, https://port70.net/~nsz/c/c99/n1256.html#6.4.3). The Unicode code point used to get truncated to 1 byte. Now it gets expanded into UTF-8, matching gcc & clang behavior on Linux. TODO: Universal character names should also be supported in identifiers, e.g., char \u010dau_sv\u011bte[]="čau_světe";
19 lines
536 B
C
19 lines
536 B
C
// this file contains BMP chars encoded in UTF-8
|
|
#include <stdio.h>
|
|
#include <wchar.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
/*
 * Self-check for universal character names in string literals.
 *
 * First verifies that a narrow literal spelled with \uXXXX escapes yields an
 * array with the same size and the same contents as the literal that has the
 * characters written directly in the source; any mismatch calls abort().
 * Then prints every element of a wide string literal, up to its terminating
 * zero, as an upper-case hexadecimal number of at least four digits followed
 * by a space.
 *
 * Returns 0 when the comparison succeeds.
 */
int main()
{
    char literal_direct[] = "čau, světe";
    char literal_ucn[] = "\u010dau, sv\u011bte";
    wchar_t wide[] = L"hello$$你好¢¢世界€€world";
    wchar_t *cursor = wide;

    /* The array sizes include the terminator, so this compares encoded lengths. */
    if (sizeof(literal_direct) != sizeof(literal_ucn)) {
        abort();
    }

    /* Same length is not enough: the bytes themselves have to agree as well. */
    if (strcmp(literal_direct, literal_ucn) != 0) {
        abort();
    }

    /* Dump the wide literal one element at a time; no trailing newline is written. */
    while (*cursor) {
        printf("%04X ", (unsigned) *cursor);
        cursor++;
    }

    return 0;
}
|