From 07907984bf8c5de667ed514269c90ca47441557d Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Wed, 3 Jan 2024 19:31:35 -0600 Subject: [PATCH] core: Throw an exception for invalid int literals like "01". This includes making int("01") parse in base 10 like standard Python. The new error message is different from CPython's. It says e.g., `SyntaxError: invalid syntax for integer with base 0: '09'` Additional test cases were added to cover the changed & added code. Co-authored-by: Damien George Signed-off-by: Jeff Epler --- py/objint.c | 2 +- py/parsenum.c | 4 ++-- py/parsenumbase.c | 29 +++++++++++------------------ tests/basics/int1.py | 15 +++++++++++++++ tests/basics/lexer.py | 8 ++++++++ 5 files changed, 37 insertions(+), 21 deletions(-) diff --git a/py/objint.c b/py/objint.c index 773e180343..4be6009a44 100644 --- a/py/objint.c +++ b/py/objint.c @@ -55,7 +55,7 @@ static mp_obj_t mp_obj_int_make_new(const mp_obj_type_t *type_in, size_t n_args, return o; } else if (mp_get_buffer(args[0], &bufinfo, MP_BUFFER_READ)) { // a textual representation, parse it - return mp_parse_num_integer(bufinfo.buf, bufinfo.len, 0, NULL); + return mp_parse_num_integer(bufinfo.buf, bufinfo.len, 10, NULL); #if MICROPY_PY_BUILTINS_FLOAT } else if (mp_obj_is_float(args[0])) { return mp_obj_new_int_from_float(mp_obj_float_get(args[0])); diff --git a/py/parsenum.c b/py/parsenum.c index b33ffb6ff2..27d6641198 100644 --- a/py/parsenum.c +++ b/py/parsenum.c @@ -151,13 +151,13 @@ value_error: raise_exc(exc, lex); #elif MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_NORMAL mp_obj_t exc = mp_obj_new_exception_msg_varg(&mp_type_ValueError, - MP_ERROR_TEXT("invalid syntax for integer with base %d"), base); + MP_ERROR_TEXT("invalid syntax for integer with base %d"), base == 1 ? 
0 : base); raise_exc(exc, lex); #else vstr_t vstr; mp_print_t print; vstr_init_print(&vstr, 50, &print); - mp_printf(&print, "invalid syntax for integer with base %d: ", base); + mp_printf(&print, "invalid syntax for integer with base %d: ", base == 1 ? 0 : base); mp_str_print_quoted(&print, str_val_start, top - str_val_start, true); mp_obj_t exc = mp_obj_new_exception_arg1(&mp_type_ValueError, mp_obj_new_str_from_utf8_vstr(&vstr)); diff --git a/py/parsenumbase.c b/py/parsenumbase.c index 94523a666d..cc3275c456 100644 --- a/py/parsenumbase.c +++ b/py/parsenumbase.c @@ -30,35 +30,28 @@ // find real radix base, and strip preceding '0x', '0o' and '0b' // puts base in *base, and returns number of bytes to skip the prefix +// in base-0, puts 1 in *base to indicate a number that starts with 0, to provoke a +// ValueError if it's not all-digits-zero. size_t mp_parse_num_base(const char *str, size_t len, int *base) { const byte *p = (const byte *)str; if (len <= 1) { goto no_prefix; } unichar c = *(p++); - if ((*base == 0 || *base == 16) && c == '0') { - c = *(p++); - if ((c | 32) == 'x') { + if (c == '0') { + c = *(p++) | 32; + int b = *base; + if (c == 'x' && !(b & ~16)) { *base = 16; - } else if (*base == 0 && (c | 32) == 'o') { + } else if (c == 'o' && !(b & ~8)) { *base = 8; - } else if (*base == 0 && (c | 32) == 'b') { + } else if (c == 'b' && !(b & ~2)) { *base = 2; } else { - if (*base == 0) { - *base = 10; + p -= 2; + if (b == 0) { + *base = 1; } - p -= 2; - } - } else if (*base == 8 && c == '0') { - c = *(p++); - if ((c | 32) != 'o') { - p -= 2; - } - } else if (*base == 2 && c == '0') { - c = *(p++); - if ((c | 32) != 'b') { - p -= 2; } } else { p--; diff --git a/tests/basics/int1.py b/tests/basics/int1.py index 2d92105c73..94723af4d0 100644 --- a/tests/basics/int1.py +++ b/tests/basics/int1.py @@ -13,6 +13,7 @@ print(int('1')) print(int('+1')) print(int('-1')) print(int('01')) +print(int('00')) print(int('9')) print(int('10')) print(int('+10')) @@ -31,6 +32,7 
@@ print(int(' -3 ')) print(int('0', 10)) print(int('1', 10)) print(int(' \t 1 \t ', 10)) +print(int(' \t 00 \t ', 10)) print(int('11', 10)) print(int('11', 16)) print(int('11', 8)) @@ -52,6 +54,17 @@ print(int(' \t 0o12', 8)) print(int('0o12 \t ', 8)) print(int(b"12", 10)) print(int(b"12")) +print(int('000 ', 0)) +print(int('000 ', 2)) +print(int('000 ', 8)) +print(int('000 ', 10)) +print(int('000 ', 16)) +print(int('000 ', 36)) +print(int('010 ', 2)) +print(int('010 ', 8)) +print(int('010 ', 10)) +print(int('010 ', 16)) +print(int('010 ', 36)) def test(value, base): @@ -79,6 +92,8 @@ test('0o8', 8) test('0xg', 16) test('1 1', 16) test('123', 37) +test('01', 0) +test('01 ', 0) # check that we don't parse this as a floating point number print(0x1e+1) diff --git a/tests/basics/lexer.py b/tests/basics/lexer.py index 181d62db1a..addb8a13df 100644 --- a/tests/basics/lexer.py +++ b/tests/basics/lexer.py @@ -83,3 +83,11 @@ try: exec(r"'\U0000000'") except SyntaxError: print("SyntaxError") + +# Properly formed integer literals +print(eval("00")) +# badly formed integer literals +try: + eval("01") +except SyntaxError: + print("SyntaxError")