py: Reduce code size from utf8_check.
All call sites raised a UnicodeError immediately when utf8_check failed, so roll the check and the raise into a single new function, mp_utf8_require. unicode.c was designed not to require runtime.h, so move the checking function into objstr.c. Reduce the number of #if sites by providing a do-nothing variant that is used instead when MICROPY_PY_BUILTINS_STR_UNICODE or MICROPY_PY_BUILTINS_STR_UNICODE_CHECK is disabled. Signed-off-by: Jeff Epler <jepler@gmail.com>
This commit is contained in:
parent
90012e7d6a
commit
a614243deb
4 changed files with 49 additions and 49 deletions
56
py/objstr.c
56
py/objstr.c
|
|
@ -208,11 +208,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
|
||||||
if (str_hash == 0) {
|
if (str_hash == 0) {
|
||||||
str_hash = qstr_compute_hash(str_data, str_len);
|
str_hash = qstr_compute_hash(str_data, str_len);
|
||||||
}
|
}
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
mp_utf8_require(str_data, str_len);
|
||||||
if (!utf8_check(str_data, str_len)) {
|
|
||||||
mp_raise_msg(&mp_type_UnicodeError, NULL);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Check if a qstr with this data already exists
|
// Check if a qstr with this data already exists
|
||||||
qstr q = qstr_find_strn((const char *)str_data, str_len);
|
qstr q = qstr_find_strn((const char *)str_data, str_len);
|
||||||
|
|
@ -2285,17 +2281,13 @@ static mp_obj_t mp_obj_new_str_type_from_vstr(const mp_obj_type_t *type, vstr_t
|
||||||
}
|
}
|
||||||
|
|
||||||
mp_obj_t mp_obj_new_str_from_vstr(vstr_t *vstr) {
|
mp_obj_t mp_obj_new_str_from_vstr(vstr_t *vstr) {
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
mp_utf8_require((byte *)vstr->buf, vstr->len);
|
||||||
if (!utf8_check((byte *)vstr->buf, vstr->len)) {
|
|
||||||
mp_raise_msg(&mp_type_UnicodeError, NULL);
|
|
||||||
}
|
|
||||||
#endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
|
||||||
return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr);
|
return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
||||||
mp_obj_t mp_obj_new_str_from_utf8_vstr(vstr_t *vstr) {
|
mp_obj_t mp_obj_new_str_from_utf8_vstr(vstr_t *vstr) {
|
||||||
// bypasses utf8_check.
|
// bypasses mp_utf8_require.
|
||||||
return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr);
|
return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr);
|
||||||
}
|
}
|
||||||
#endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
#endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
||||||
|
|
@ -2305,11 +2297,7 @@ mp_obj_t mp_obj_new_bytes_from_vstr(vstr_t *vstr) {
|
||||||
}
|
}
|
||||||
|
|
||||||
mp_obj_t mp_obj_new_str(const char *data, size_t len) {
|
mp_obj_t mp_obj_new_str(const char *data, size_t len) {
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
mp_utf8_require((byte *)data, len);
|
||||||
if (!utf8_check((byte *)data, len)) {
|
|
||||||
mp_raise_msg(&mp_type_UnicodeError, NULL);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
qstr q = qstr_find_strn(data, len);
|
qstr q = qstr_find_strn(data, len);
|
||||||
if (q != MP_QSTRnull) {
|
if (q != MP_QSTRnull) {
|
||||||
// qstr with this data already exists
|
// qstr with this data already exists
|
||||||
|
|
@ -2471,3 +2459,39 @@ mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
|
||||||
o->cur = 0;
|
o->cur = 0;
|
||||||
return MP_OBJ_FROM_PTR(o);
|
return MP_OBJ_FROM_PTR(o);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
||||||
|
static bool mp_utf8_check(const byte *p, size_t len) {
|
||||||
|
uint8_t need = 0;
|
||||||
|
const byte *end = p + len;
|
||||||
|
for (; p < end; p++) {
|
||||||
|
byte c = *p;
|
||||||
|
if (need) {
|
||||||
|
if (UTF8_IS_CONT(c)) {
|
||||||
|
need--;
|
||||||
|
} else {
|
||||||
|
// mismatch
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (c >= 0xc0) {
|
||||||
|
if (c >= 0xf8) {
|
||||||
|
// mismatch
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
|
||||||
|
} else if (c >= 0x80) {
|
||||||
|
// mismatch
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return need == 0; // no pending fragments allowed
|
||||||
|
}
|
||||||
|
|
||||||
|
void mp_utf8_require(const byte *p, size_t len) {
|
||||||
|
if (!mp_utf8_check(p, len)) {
|
||||||
|
mp_raise_msg(&mp_type_UnicodeError, NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -119,4 +119,13 @@ extern const mp_obj_dict_t mp_obj_bytearray_locals_dict;
|
||||||
extern const mp_obj_dict_t mp_obj_array_locals_dict;
|
extern const mp_obj_dict_t mp_obj_array_locals_dict;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
||||||
|
// Raises a UnicodeError if the string content is not valid UTF-8
|
||||||
|
void mp_utf8_require(const byte *p, size_t len);
|
||||||
|
#else
|
||||||
|
// If unicode strings are not enabled, or the check is explicitly disabled, it's a no-op
|
||||||
|
static inline void mp_utf8_require(const byte *p, size_t len) {
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif // MICROPY_INCLUDED_PY_OBJSTR_H
|
#endif // MICROPY_INCLUDED_PY_OBJSTR_H
|
||||||
|
|
|
||||||
32
py/unicode.c
32
py/unicode.c
|
|
@ -177,35 +177,3 @@ mp_uint_t unichar_xdigit_value(unichar c) {
|
||||||
}
|
}
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
|
||||||
|
|
||||||
bool utf8_check(const byte *p, size_t len) {
|
|
||||||
uint8_t need = 0;
|
|
||||||
const byte *end = p + len;
|
|
||||||
for (; p < end; p++) {
|
|
||||||
byte c = *p;
|
|
||||||
if (need) {
|
|
||||||
if (UTF8_IS_CONT(c)) {
|
|
||||||
need--;
|
|
||||||
} else {
|
|
||||||
// mismatch
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (c >= 0xc0) {
|
|
||||||
if (c >= 0xf8) {
|
|
||||||
// mismatch
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
|
|
||||||
} else if (c >= 0x80) {
|
|
||||||
// mismatch
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return need == 0; // no pending fragments allowed
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,5 @@
|
||||||
#include "py/misc.h"
|
#include "py/misc.h"
|
||||||
|
|
||||||
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
|
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
|
||||||
bool utf8_check(const byte *p, size_t len);
|
|
||||||
|
|
||||||
#endif // MICROPY_INCLUDED_PY_UNICODE_H
|
#endif // MICROPY_INCLUDED_PY_UNICODE_H
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue