From a8ba1ddd78544b4bda749051d44f7b2a8a0ec5ff Mon Sep 17 00:00:00 2001
From: Jeremy Evans
Date: Fri, 24 Mar 2023 11:53:53 -0700
Subject: Use UTF-8 encoding for literal extended regexps with UTF-8 characters in comments

Fixes [Bug #19455]
---
 re.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 're.c')

diff --git a/re.c b/re.c
index d7490bbbbf..f6abf46131 100644
--- a/re.c
+++ b/re.c
@@ -2948,7 +2948,11 @@ escape_asis:
   case '#':
     if (extended_mode && !in_char_class) {
         /* consume and ignore comment in extended regexp */
-        while ((p < end) && ((c = *p++) != '\n'));
+        while ((p < end) && ((c = *p++) != '\n')) {
+            if ((c & 0x80) && !*encp && enc == rb_utf8_encoding()) {
+                *encp = enc;
+            }
+        }
         break;
     }
     rb_str_buf_cat(buf, (char *)&c, 1);
@@ -2983,6 +2987,9 @@ escape_asis:
 switch (c = *p++) {
   default:
     if (!(c & 0x80)) break;
+    if (!*encp && enc == rb_utf8_encoding()) {
+        *encp = enc;
+    }
     --p;
     /* fallthrough */
   case '\\':
-- 
cgit v1.2.1