parse: Don't allow creation of invalid UTF8 strings or identifiers.

.. even when compiling non-UTF-8 files or byte strings.

Closes: #17855
Signed-off-by: Jeff Epler <jepler@gmail.com>
2025-08-07 09:02:57 -05:00 · 2025-08-07 09:02:57 -05:00 · 90e366eefa
commit 90e366eefa
parent a614243deb
3 changed files with 10 additions and 1 deletions
--- a/py/objstr.c
+++ b/py/objstr.c
@ -2461,7 +2461,7 @@ mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
 }

 #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
-static bool mp_utf8_check(const byte *p, size_t len) {
+bool mp_utf8_check(const byte *p, size_t len) {
    uint8_t need = 0;
    const byte *end = p + len;
    for (; p < end; p++) {
--- a/py/objstr.h
+++ b/py/objstr.h
@ -122,10 +122,14 @@ extern const mp_obj_dict_t mp_obj_array_locals_dict;
 #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
 // Throws an exception if string content is not UTF-8
 void mp_utf8_require(const byte *p, size_t len);
+bool mp_utf8_check(const byte *p, size_t len);
 #else
 // If unicode strings are not enabled, or the check is explicitly disabled, it's a no-op
 static inline void mp_utf8_require(const byte *p, size_t len) {
 }
+static inline bool mp_utf8_check(const byte *p, size_t len) {
+    return true;
+}
 #endif

 #endif // MICROPY_INCLUDED_PY_OBJSTR_H
--- a/py/parse.c
+++ b/py/parse.c
@ -598,6 +598,11 @@ static mp_parse_node_t make_node_const_object_optimised(parser_t *parser, size_t
 static void push_result_token(parser_t *parser, uint8_t rule_id) {
    mp_parse_node_t pn;
    mp_lexer_t *lex = parser->lexer;
+    if (lex->tok_kind == MP_TOKEN_NAME || lex->tok_kind == MP_TOKEN_STRING) {
+        if (!mp_utf8_check((byte *)lex->vstr.buf, lex->vstr.len)) {
+            mp_raise_msg(&mp_type_SyntaxError, NULL);
+        }
+    }
    if (lex->tok_kind == MP_TOKEN_NAME) {
        qstr id = qstr_from_strn(lex->vstr.buf, lex->vstr.len);
        #if MICROPY_COMP_CONST