core: Rework MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC.
(note: this should probably end up squashed) Most MCUs apart from Cortex-M0 with Thumb 1 have an instruction for computing the "high part" of a multiplication (e.g., the upper 32 bits of a 32x32 multiply). When they do, gcc uses this to implement a small and fast overflow check using the __builtin_mul_overflow intrinsic, which is preferable to the guard division method used in smallint.c. However, in contrast to the previous mp_small_int_mul_overflow routine, which checks that the result fits not only within mp_int_t but is SMALL_INT_FITS(), __builtin_mul_overflow only checks for overflow of the C type. As a result, a slight change in the code flow is needed for MP_BINARY_OP_MULTIPLY. Other sites using mp_small_int_mul_overflow already had the result value flow through to a SMALL_INT_FITS check so they didn't need any additional changes. Signed-off-by: Jeff Epler <jepler@gmail.com>
This commit is contained in:
parent
bf5312b8da
commit
19000d6e49
6 changed files with 49 additions and 66 deletions
30
py/misc.h
30
py/misc.h
|
|
@ -428,7 +428,7 @@ static inline uint32_t mp_clz_mpi(mp_int_t x) {
|
|||
#endif
|
||||
}
|
||||
|
||||
// Overflow-checked operations for long long
|
||||
// Overflow-checked operations
|
||||
|
||||
// Integer overflow builtins were added to GCC 5, but __has_builtin only in GCC 10
|
||||
//
|
||||
|
|
@ -436,8 +436,28 @@ static inline uint32_t mp_clz_mpi(mp_int_t x) {
|
|||
// functions below don't update the result if an overflow would occur (to avoid UB).
|
||||
#define MP_GCC_HAS_BUILTIN_OVERFLOW (__GNUC__ >= 5)
|
||||
|
||||
#if __has_builtin(__builtin_umulll_overflow) || MP_GCC_HAS_BUILTIN_OVERFLOW
|
||||
// <limits.h> may not define these macros when gcc is in C++ mode.
|
||||
#ifndef ULLONG_MAX
|
||||
#define ULLONG_MAX (~0ULL)
|
||||
#endif
|
||||
|
||||
#ifndef LLONG_MAX
|
||||
#define LLONG_MAX ((long long)(ULLONG_MAX >> 1))
|
||||
#endif
|
||||
|
||||
#ifndef LLONG_MIN
|
||||
#define LLONG_MIN (-LLONG_MAX - 1)
|
||||
#endif
|
||||
|
||||
|
||||
#if MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC
|
||||
#define mp_mul_ull_overflow __builtin_umulll_overflow
|
||||
#define mp_mul_ll_overflow __builtin_smulll_overflow
|
||||
// Overflow-checked multiplication of two mp_int_t values via the compiler intrinsic.
// Returns the value of __builtin_mul_overflow(x, y, res): true when the product
// does not fit in mp_int_t. Only the range of the C type is checked, so callers
// that need a small int must still test the stored result with MP_SMALL_INT_FITS.
inline static bool mp_mul_mp_int_t_overflow(mp_int_t x, mp_int_t y, mp_int_t *res) {
|
||||
// __builtin_mul_overflow is a type-generic function; this inline wrapper ensures the
|
||||
// argument types are checked to match mp_int_t.
|
||||
return __builtin_mul_overflow(x, y, res);
|
||||
}
|
||||
#else
|
||||
inline static bool mp_mul_ull_overflow(unsigned long long int x, unsigned long long int y, unsigned long long int *res) {
|
||||
if (y > 0 && x > (ULLONG_MAX / y)) {
|
||||
|
|
@ -446,11 +466,7 @@ inline static bool mp_mul_ull_overflow(unsigned long long int x, unsigned long l
|
|||
*res = x * y;
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if __has_builtin(__builtin_smulll_overflow) || MP_GCC_HAS_BUILTIN_OVERFLOW
|
||||
#define mp_mul_ll_overflow __builtin_smulll_overflow
|
||||
#else
|
||||
inline static bool mp_mul_ll_overflow(long long int x, long long int y, long long int *res) {
|
||||
bool overflow;
|
||||
|
||||
|
|
@ -475,6 +491,8 @@ inline static bool mp_mul_ll_overflow(long long int x, long long int y, long lon
|
|||
|
||||
return overflow;
|
||||
}
|
||||
|
||||
extern bool mp_mul_mp_int_t_overflow(mp_int_t x, mp_int_t y, mp_int_t *res);
|
||||
#endif
|
||||
|
||||
#if __has_builtin(__builtin_saddll_overflow) || MP_GCC_HAS_BUILTIN_OVERFLOW
|
||||
|
|
|
|||
|
|
@ -2279,19 +2279,19 @@ typedef time_t mp_timestamp_t;
|
|||
#endif
|
||||
|
||||
// If true, use __builtin_mul_overflow (a gcc intrinsic supported by clang) for
|
||||
// overflow checking when multiplying two small ints. Otherwise, use the
|
||||
// routine mp_small_int_mul_overflow.
|
||||
// overflow checking when multiplying two small ints. Otherwise, use a portable
|
||||
// algorithm.
|
||||
//
|
||||
// On MCUs with a 32x32->64 bit multiply instruction (such as Cortex M4, Cortex M33)
|
||||
// this is likely to be faster and generate smaller code.
|
||||
// Most MCUs have a 32x32->64 bit multiply instruction, in which case the
|
||||
// intrinsic is likely to be faster and generate smaller code. The main exception is
|
||||
// cortex-m0 with __ARM_ARCH_ISA_THUMB == 1.
|
||||
//
|
||||
// The semantics of mp_small_int_mul_overflow and __builtin_mul_overflow are not quite the
|
||||
// same: mp_small_int_mul_overflow additionally checks that the result fits within a
|
||||
// small integer, not just within mp_int_t.
|
||||
// The intrinsic is in GCC from version 5. In principle it can be detected instead with
|
||||
// __has_builtin except this is only in GCC from version 10.
|
||||
#ifndef MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC
|
||||
#if defined(__ARM_ARCH_ISA_THUMB) && defined(__GNUC__)
|
||||
#if defined(__ARM_ARCH_ISA_THUMB) && (__GNUC__ >= 5)
|
||||
#define MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC (__ARM_ARCH_ISA_THUMB >= 2)
|
||||
#elif (defined(__riscv_m) || defined(__x86_64__) || defined(__i686__)) && defined(__GNUC__)
|
||||
#elif (__GNUC__ >= 5)
|
||||
#define MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC (1)
|
||||
#else
|
||||
#define MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC (0)
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@
|
|||
#include <stdlib.h>
|
||||
|
||||
#include "py/runtime.h"
|
||||
#include "py/misc.h"
|
||||
#include "py/parsenumbase.h"
|
||||
#include "py/parsenum.h"
|
||||
#include "py/smallint.h"
|
||||
|
|
@ -55,7 +56,7 @@ typedef mp_int_t parsed_int_t;
|
|||
#if MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC
|
||||
#define PARSED_INT_MUL_OVERFLOW __builtin_mul_overflow
|
||||
#else
|
||||
#define PARSED_INT_MUL_OVERFLOW mp_small_int_mul_overflow
|
||||
#define PARSED_INT_MUL_OVERFLOW mp_mul_mp_int_t_overflow
|
||||
#endif
|
||||
#define PARSED_INT_FITS MP_SMALL_INT_FITS
|
||||
#else
|
||||
|
|
|
|||
44
py/runtime.c
44
py/runtime.c
|
|
@ -490,24 +490,8 @@ mp_obj_t MICROPY_WRAP_MP_BINARY_OP(mp_binary_op)(mp_binary_op_t op, mp_obj_t lhs
|
|||
case MP_BINARY_OP_MULTIPLY:
|
||||
case MP_BINARY_OP_INPLACE_MULTIPLY: {
|
||||
|
||||
// If long long type exists and is larger than mp_int_t, then
|
||||
// we can use the following code to perform overflow-checked multiplication.
|
||||
// Otherwise (eg in x64 case) we must use mp_small_int_mul_overflow.
|
||||
#if 0
|
||||
// compute result using long long precision
|
||||
long long res = (long long)lhs_val * (long long)rhs_val;
|
||||
if (res > MP_SMALL_INT_MAX || res < MP_SMALL_INT_MIN) {
|
||||
// result overflowed SMALL_INT, so return higher precision integer
|
||||
return mp_obj_new_int_from_ll(res);
|
||||
} else {
|
||||
// use standard precision
|
||||
lhs_val = (mp_int_t)res;
|
||||
}
|
||||
#endif
|
||||
|
||||
mp_int_t int_res;
|
||||
#if MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC
|
||||
if (__builtin_mul_overflow(lhs_val, rhs_val, &int_res)) {
|
||||
if (mp_mul_mp_int_t_overflow(lhs_val, rhs_val, &int_res)) {
|
||||
lhs = mp_obj_new_int_from_ll(lhs_val);
|
||||
goto generic_binary_op;
|
||||
} else {
|
||||
|
|
@ -515,16 +499,6 @@ mp_obj_t MICROPY_WRAP_MP_BINARY_OP(mp_binary_op)(mp_binary_op_t op, mp_obj_t lhs
|
|||
}
|
||||
|
||||
break; // result fits in mp_int_t but might not be MP_SMALL_INT_FITS
|
||||
#else
|
||||
if (mp_small_int_mul_overflow(lhs_val, rhs_val, &int_res)) {
|
||||
// use higher precision
|
||||
lhs = mp_obj_new_int_from_ll(lhs_val);
|
||||
goto generic_binary_op;
|
||||
} else {
|
||||
// use standard precision
|
||||
return MP_OBJ_NEW_SMALL_INT(int_res);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
case MP_BINARY_OP_FLOOR_DIVIDE:
|
||||
case MP_BINARY_OP_INPLACE_FLOOR_DIVIDE:
|
||||
|
|
@ -564,30 +538,18 @@ mp_obj_t MICROPY_WRAP_MP_BINARY_OP(mp_binary_op)(mp_binary_op_t op, mp_obj_t lhs
|
|||
mp_int_t ans = 1;
|
||||
while (rhs_val > 0) {
|
||||
if (rhs_val & 1) {
|
||||
#if MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC
|
||||
if (__builtin_mul_overflow(ans, lhs_val, &ans)) {
|
||||
if (mp_mul_mp_int_t_overflow(ans, lhs_val, &ans)) {
|
||||
goto power_overflow;
|
||||
}
|
||||
#else
|
||||
if (mp_small_int_mul_overflow(ans, lhs_val, &ans)) {
|
||||
goto power_overflow;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if (rhs_val == 1) {
|
||||
break;
|
||||
}
|
||||
rhs_val /= 2;
|
||||
mp_int_t int_res;
|
||||
#if MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC
|
||||
if (__builtin_mul_overflow(lhs_val, lhs_val, &int_res)) {
|
||||
if (mp_mul_mp_int_t_overflow(lhs_val, lhs_val, &int_res)) {
|
||||
goto power_overflow;
|
||||
}
|
||||
#else
|
||||
if (mp_small_int_mul_overflow(lhs_val, lhs_val, &int_res)) {
|
||||
goto power_overflow;
|
||||
}
|
||||
#endif
|
||||
lhs_val = int_res;
|
||||
}
|
||||
lhs_val = ans;
|
||||
|
|
|
|||
|
|
@ -26,25 +26,30 @@
|
|||
|
||||
#include "py/smallint.h"
|
||||
|
||||
bool mp_small_int_mul_overflow(mp_int_t x, mp_int_t y, mp_int_t *res) {
|
||||
#if !MICROPY_USE_GCC_MUL_OVERFLOW_INTRINSIC
|
||||
#define MP_UINT_MAX (~(mp_uint_t)0)
|
||||
#define MP_INT_MAX ((mp_int_t)(MP_UINT_MAX >> 1))
|
||||
#define MP_INT_MIN (-MP_INT_MAX - 1)
|
||||
|
||||
bool mp_mul_mp_int_t_overflow(mp_int_t x, mp_int_t y, mp_int_t *res) {
|
||||
// Check for multiply overflow; see CERT INT32-C
|
||||
if (x > 0) { // x is positive
|
||||
if (y > 0) { // x and y are positive
|
||||
if (x > (MP_SMALL_INT_MAX / y)) {
|
||||
if (x > (MP_INT_MAX / y)) {
|
||||
return true;
|
||||
}
|
||||
} else { // x positive, y nonpositive
|
||||
if (y < (MP_SMALL_INT_MIN / x)) {
|
||||
if (y < (MP_INT_MIN / x)) {
|
||||
return true;
|
||||
}
|
||||
} // x positive, y nonpositive
|
||||
} else { // x is nonpositive
|
||||
if (y > 0) { // x is nonpositive, y is positive
|
||||
if (x < (MP_SMALL_INT_MIN / y)) {
|
||||
if (x < (MP_INT_MIN / y)) {
|
||||
return true;
|
||||
}
|
||||
} else { // x and y are nonpositive
|
||||
if (x != 0 && y < (MP_SMALL_INT_MAX / x)) {
|
||||
if (x != 0 && y < (MP_INT_MAX / x)) {
|
||||
return true;
|
||||
}
|
||||
} // End if x and y are nonpositive
|
||||
|
|
@ -54,6 +59,7 @@ bool mp_small_int_mul_overflow(mp_int_t x, mp_int_t y, mp_int_t *res) {
|
|||
*res = x * y;
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
mp_int_t mp_small_int_modulo(mp_int_t dividend, mp_int_t divisor) {
|
||||
// Python specs require that mod has same sign as second operand
|
||||
|
|
|
|||
|
|
@ -68,10 +68,6 @@
|
|||
// The number of bits in a MP_SMALL_INT including the sign bit.
|
||||
#define MP_SMALL_INT_BITS (MP_IMAX_BITS(MP_SMALL_INT_MAX) + 1)
|
||||
|
||||
// Multiply two small ints.
|
||||
// If returns false, the correct result is stored in 'res'
|
||||
// If returns true, the multiplication would have overflowed. 'res' is unchanged.
|
||||
bool mp_small_int_mul_overflow(mp_int_t x, mp_int_t y, mp_int_t *res);
|
||||
mp_int_t mp_small_int_modulo(mp_int_t dividend, mp_int_t divisor);
|
||||
mp_int_t mp_small_int_floor_divide(mp_int_t num, mp_int_t denom);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue