parse: Don't allow creation of invalid UTF-8 strings or identifiers.

... even when compiling non-UTF-8 files or byte strings.

Closes: #17855
Signed-off-by: Jeff Epler <jepler@gmail.com>
This commit is contained in:
Jeff Epler 2025-08-07 09:02:57 -05:00
parent a614243deb
commit 90e366eefa
3 changed files with 10 additions and 1 deletions

View file

@ -2461,7 +2461,7 @@ mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
}
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
static bool mp_utf8_check(const byte *p, size_t len) {
bool mp_utf8_check(const byte *p, size_t len) {
uint8_t need = 0;
const byte *end = p + len;
for (; p < end; p++) {

View file

@ -122,10 +122,14 @@ extern const mp_obj_dict_t mp_obj_array_locals_dict;
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
// Throws an exception if string content is not UTF-8
void mp_utf8_require(const byte *p, size_t len);
// Non-throwing counterpart: reports the result as a bool instead of raising.
// NOTE(review): presumably returns true when the len bytes at p are valid
// UTF-8 and false otherwise -- confirm against the definition.
bool mp_utf8_check(const byte *p, size_t len);
#else
// Stand-in used when unicode strings are not enabled, or when the UTF-8
// check is explicitly disabled: it inspects nothing and never raises.
static inline void mp_utf8_require(const byte *p, size_t len) {
    // Both arguments are deliberately ignored.
    (void)p;
    (void)len;
}
// Stand-in for builds where the UTF-8 check is compiled out: the input is
// never examined and the result is always true, so callers never see a
// failed check.
static inline bool mp_utf8_check(const byte *p, size_t len) {
    // Both arguments are deliberately ignored.
    (void)p;
    (void)len;
    return true;
}
#endif
#endif // MICROPY_INCLUDED_PY_OBJSTR_H

View file

@ -598,6 +598,11 @@ static mp_parse_node_t make_node_const_object_optimised(parser_t *parser, size_t
static void push_result_token(parser_t *parser, uint8_t rule_id) {
mp_parse_node_t pn;
mp_lexer_t *lex = parser->lexer;
if (lex->tok_kind == MP_TOKEN_NAME || lex->tok_kind == MP_TOKEN_STRING) {
if (!mp_utf8_check((byte *)lex->vstr.buf, lex->vstr.len)) {
mp_raise_msg(&mp_type_SyntaxError, NULL);
}
}
if (lex->tok_kind == MP_TOKEN_NAME) {
qstr id = qstr_from_strn(lex->vstr.buf, lex->vstr.len);
#if MICROPY_COMP_CONST