Merge branch 'master' into print

2020-05-19 20:44:54 +02:00 · 2020-05-19 20:44:54 +02:00 · 78438727c4
commit 78438727c4
parent d991d9d248 17b76a60bb
12 changed files with 844 additions and 80 deletions
--- a/README.md
+++ b/README.md
@ -21,7 +21,7 @@ In addition to the `ndarray`'s operators and methods, seven modules define a gre

 ### vector

-The `vector` sub-module implements all functions of the `math` package (e.g., `acos`, `acosh`, ..., `tan`, `tanh`) of `micropython` for `ndarray`s and iterables.
+The `vector` sub-module implements all functions of the `math` package (e.g., `acos`, `acosh`, ..., `tan`, `tanh`) of `micropython` for `ndarray`s and iterables. In addition, it provides tools for vectorising generic, user-defined `python` functions.

 ### numerical

@ -45,7 +45,7 @@ The `filter` sub-module implements one-dimensional convolution.

 ### compare

-The `compare` sub-module contains the implementation of the `minimum`, `maximum`, and `clip` functions.
+The `compare` sub-module contains the implementation of the `equal`, `not_equal`, `minimum`, `maximum`, and `clip` functions.

 ### extras

--- a/code/compare.c
+++ b/code/compare.c
@ -34,6 +34,9 @@ static mp_obj_t compare_function(mp_obj_t x1, mp_obj_t x2, uint8_t comptype) {
            mp_raise_ValueError(translate("operands could not be broadcast together"));
 		}
 	}
+	if((comptype == MP_BINARY_OP_EQUAL) || (comptype == MP_BINARY_OP_NOT_EQUAL)) {
+		return ndarray_binary_op(comptype, x1, x2);
+	}
 	size_t m = MAX(ndarray1->m, ndarray2->m);
 	size_t n = MAX(ndarray1->n, ndarray2->n);
 	size_t len = MAX(ndarray1->array->len, ndarray2->array->len);
@ -114,6 +117,30 @@ static mp_obj_t compare_function(mp_obj_t x1, mp_obj_t x2, uint8_t comptype) {
    return mp_const_none; // we should never reach this point
 }

+static mp_obj_t compare_equal_helper(mp_obj_t x1, mp_obj_t x2, uint8_t comptype) { // shared body of compare_equal and compare_not_equal; comptype is forwarded to compare_function
+	// scalar comparisons should return a single object of mp_obj_t type, so the first item of the result is extracted below
+	mp_obj_t result = compare_function(x1, x2, comptype); // evaluated first, whatever the operand types are
+	if((MP_OBJ_IS_INT(x1) || mp_obj_is_float(x1)) && (MP_OBJ_IS_INT(x2) || mp_obj_is_float(x2))) { // both operands are ints or floats
+		mp_obj_iter_buf_t iter_buf;
+		mp_obj_t iterable = mp_getiter(result, &iter_buf); // iterate over the comparison result
+		mp_obj_t item = mp_iternext(iterable); // first item only; presumably the sole one for two scalars - TODO confirm
+		return item;
+	}
+	return result;	// otherwise the result of compare_function is returned as is
+
+}
+static mp_obj_t compare_equal(mp_obj_t x1, mp_obj_t x2) { // exposed through compare_equal_obj as `equal` in the compare module's globals table
+	return compare_equal_helper(x1, x2, MP_BINARY_OP_EQUAL); // the helper does the comparison and the scalar special-casing
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(compare_equal_obj, compare_equal);
+
+static mp_obj_t compare_not_equal(mp_obj_t x1, mp_obj_t x2) { // exposed through compare_not_equal_obj as `not_equal` in the compare module's globals table
+	return compare_equal_helper(x1, x2, MP_BINARY_OP_NOT_EQUAL); // the helper does the comparison and the scalar special-casing
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(compare_not_equal_obj, compare_not_equal);
+
 static mp_obj_t compare_minimum(mp_obj_t x1, mp_obj_t x2) {
 	// extra round, so that we can return minimum(3, 4) properly
 	mp_obj_t result = compare_function(x1, x2, COMPARE_MINIMUM);
@ -150,6 +177,8 @@ MP_DEFINE_CONST_FUN_OBJ_3(compare_clip_obj, compare_clip);

 STATIC const mp_rom_map_elem_t ulab_compare_globals_table[] = {
    { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_compare) },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_equal), (mp_obj_t)&compare_equal_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_not_equal), (mp_obj_t)&compare_not_equal_obj },
 	{ MP_OBJ_NEW_QSTR(MP_QSTR_maximum), (mp_obj_t)&compare_maximum_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_minimum), (mp_obj_t)&compare_minimum_obj },
 	{ MP_OBJ_NEW_QSTR(MP_QSTR_clip), (mp_obj_t)&compare_clip_obj },
--- a/code/compare.h
+++ b/code/compare.h
@ -20,11 +20,13 @@
 enum COMPARE_FUNCTION_TYPE {
    COMPARE_MINIMUM,
    COMPARE_MAXIMUM,
-    COMPARE_CLIP,    
+    COMPARE_CLIP,
 };

 extern mp_obj_module_t ulab_compare_module;

+MP_DECLARE_CONST_FUN_OBJ_2(compare_equal_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(compare_not_equal_obj);
 MP_DECLARE_CONST_FUN_OBJ_2(compare_minimum_obj);
 MP_DECLARE_CONST_FUN_OBJ_2(compare_maximum_obj);
 MP_DECLARE_CONST_FUN_OBJ_3(compare_clip_obj);
--- a/code/ndarray.c
+++ b/code/ndarray.c
@ -874,7 +874,7 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lhs, mp_obj_t rhs) {
 				} else if(or->array->typecode == NDARRAY_INT16) {
 					RUN_BINARY_LOOP(NDARRAY_FLOAT, mp_float_t, uint16_t, int16_t, ol, or, op, m, n, len, linc, rinc);
 				} else if(or->array->typecode == NDARRAY_FLOAT) {
-					RUN_BINARY_LOOP(NDARRAY_FLOAT, mp_float_t, uint8_t, mp_float_t, ol, or, op, m, n, len, linc, rinc);
+					RUN_BINARY_LOOP(NDARRAY_FLOAT, mp_float_t, uint16_t, mp_float_t, ol, or, op, m, n, len, linc, rinc);
 				}
 			} else if(ol->array->typecode == NDARRAY_INT16) {
 				if(or->array->typecode == NDARRAY_UINT8) {
--- a/code/ulab.c
+++ b/code/ulab.c
@ -31,7 +31,7 @@
 #include "compare.h"
 #include "extras.h"

-STATIC MP_DEFINE_STR_OBJ(ulab_version_obj, "0.42.0");
+STATIC MP_DEFINE_STR_OBJ(ulab_version_obj, "0.46.0");

 MP_DEFINE_CONST_FUN_OBJ_KW(ndarray_flatten_obj, 1, ndarray_flatten);

--- a/code/ulab.h
+++ b/code/ulab.h
@ -15,7 +15,7 @@
 // the create module is always included
 #define ULAB_CREATE_MODULE (1)

-// vectorise (all functions) takes approx. 4.5 kB of flash space
+// vectorise (all functions) takes approx. 6 kB of flash space
 #ifndef ULAB_VECTORISE_MODULE
 #define ULAB_VECTORISE_MODULE (1)
 #endif
--- a/code/vectorise.c
+++ b/code/vectorise.c
@ -213,6 +213,85 @@ static mp_obj_t vectorise_arctan2(mp_obj_t x, mp_obj_t y) {

 MP_DEFINE_CONST_FUN_OBJ_2(vectorise_arctan2_obj, vectorise_arctan2);

+static mp_obj_t vectorise_vectorized_function_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) { // call handler of vectorised function objects: applies self->fun to each element of args[0] and returns a new ndarray of typecode self->otypes
+    vectorized_function_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    mp_obj_t avalue[1]; // one-element argument vector handed to the wrapped callable
+    mp_obj_t fvalue; // value returned by the wrapped callable for a single element
+    if(MP_OBJ_IS_TYPE(args[0], &ulab_ndarray_type)) { // NOTE(review): n_args and n_kw are never inspected before args[0] is read - confirm that at least one positional argument is guaranteed
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0]);
+        ndarray_obj_t *target = create_new_ndarray(source->m, source->n, self->otypes); // output is created with the m and n of the input
+        for(size_t i=0; i < source->array->len; i++) {
+            avalue[0] = mp_binary_get_val_array(source->array->typecode, source->array->items, i); // element i of the source as an mp_obj_t
+            fvalue = self->type->call(self->fun, 1, 0, avalue); // one positional argument, no keywords
+            mp_binary_set_val_array(self->otypes, target->array->items, i, fvalue); // stored at the same index in the output
+        }
+        return MP_OBJ_FROM_PTR(target);
+    } else if(MP_OBJ_IS_TYPE(args[0], &mp_type_tuple) || MP_OBJ_IS_TYPE(args[0], &mp_type_list) ||
+        MP_OBJ_IS_TYPE(args[0], &mp_type_range)) { // i.e., the input is a generic iterable (only tuple, list, and range are accepted here)
+        size_t len = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[0])); // the length is needed up front to size the output
+        ndarray_obj_t *target = create_new_ndarray(1, len, self->otypes); // output is created as 1 x len
+        mp_obj_iter_buf_t iter_buf;
+        mp_obj_t iterable = mp_getiter(args[0], &iter_buf);
+        size_t i=0; // write position in the output
+        while ((avalue[0] = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { // fetch the next item until iteration stops
+            fvalue = self->type->call(self->fun, 1, 0, avalue);
+            mp_binary_set_val_array(self->otypes, target->array->items, i, fvalue);
+            i++;
+        }
+        return MP_OBJ_FROM_PTR(target);
+    } else if(mp_obj_is_int(args[0]) || mp_obj_is_float(args[0])) { // scalar input
+        ndarray_obj_t *target = create_new_ndarray(1, 1, self->otypes); // output is created as 1 x 1
+        fvalue = self->type->call(self->fun, 1, 0, args); // args itself serves as the one-element argument vector
+        mp_binary_set_val_array(self->otypes, target->array->items, 0, fvalue);
+        return MP_OBJ_FROM_PTR(target);
+    } else {
+        mp_raise_ValueError(translate("wrong input type")); // any other argument type is rejected
+    }
+    return mp_const_none; // presumably never reached, since every branch above returns or raises
+}
+
+const mp_obj_type_t vectorise_function_type = { // type assigned to the objects that vectorise_vectorize returns
+    { &mp_type_type },
+    .name = MP_QSTR_, // presumably the empty qstr, i.e., the type carries no name - TODO confirm
+    .call = vectorise_vectorized_function_call, // call slot: the handler that applies the wrapped function element-wise
+};
+
+static mp_obj_t vectorise_vectorize(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { // vectorize(f, otypes=None): wraps the callable f in a vectorized_function_obj_t whose output typecode is otypes
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }, // the function to be vectorised (required)
+        { MP_QSTR_otypes, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} } // keyword-only output type, None by default
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    const mp_obj_type_t *type = mp_obj_get_type(args[0].u_obj);
+    if(type->call == NULL) { // the type of the first argument has no call slot
+        mp_raise_TypeError(translate("first argument must be a callable"));
+    }
+    mp_obj_t _otypes = args[1].u_obj;
+    uint8_t otypes = NDARRAY_FLOAT; // default output typecode
+    if(_otypes == mp_const_none) {
+        // TODO: is this what numpy does?
+        otypes = NDARRAY_FLOAT; // repeats the initial value: None means float output
+    } else if(mp_obj_is_int(_otypes)) {
+        otypes = mp_obj_get_int(_otypes); // NOTE(review): the int is narrowed to uint8_t before the check below - confirm that out-of-range values cannot alias a valid typecode
+        if(otypes != NDARRAY_FLOAT && otypes != NDARRAY_UINT8 && otypes != NDARRAY_INT8 &&
+            otypes != NDARRAY_UINT16 && otypes != NDARRAY_INT16) { // only these five typecodes are accepted
+                mp_raise_ValueError(translate("wrong output type"));
+        }
+    }
+    else {
+        mp_raise_ValueError(translate("wrong output type")); // otypes is neither None nor an int
+    }
+    vectorized_function_obj_t *function = m_new_obj(vectorized_function_obj_t);
+    function->base.type = &vectorise_function_type;
+    function->otypes = otypes;
+    function->fun = args[0].u_obj;
+    function->type = type; // kept so that the call handler can invoke type->call on fun directly
+    return MP_OBJ_FROM_PTR(function);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(vectorise_vectorize_obj, 1, vectorise_vectorize);
+
 STATIC const mp_rom_map_elem_t ulab_vectorise_globals_table[] = {
    { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_vector) },
    { MP_OBJ_NEW_QSTR(MP_QSTR_acos), (mp_obj_t)&vectorise_acos_obj },
@ -240,6 +319,7 @@ STATIC const mp_rom_map_elem_t ulab_vectorise_globals_table[] = {
    { MP_OBJ_NEW_QSTR(MP_QSTR_sqrt), (mp_obj_t)&vectorise_sqrt_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_tan), (mp_obj_t)&vectorise_tan_obj },
    { MP_OBJ_NEW_QSTR(MP_QSTR_tanh), (mp_obj_t)&vectorise_tanh_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_vectorize), (mp_obj_t)&vectorise_vectorize_obj },
 };

 STATIC MP_DEFINE_CONST_DICT(mp_module_ulab_vectorise_globals, ulab_vectorise_globals_table);
--- a/code/vectorise.h
+++ b/code/vectorise.h
@ -17,6 +17,13 @@

 #if ULAB_VECTORISE_MODULE

+typedef struct _vectorized_function_obj_t { // instance data of a vectorised python function
+    mp_obj_base_t base;
+    uint8_t otypes; // typecode used for the output ndarray
+    mp_obj_t fun; // the wrapped callable
+    const mp_obj_type_t *type; // type of fun; its call member is used to invoke fun
+} vectorized_function_obj_t;
+
 mp_obj_module_t ulab_vectorise_module;

 #define ITERATE_VECTOR(type, source, out) do {\
--- a/docs/manual/source/conf.py
+++ b/docs/manual/source/conf.py
@ -22,8 +22,7 @@ copyright = '2019-2020, Zoltán Vörös'
 author = 'Zoltán Vörös'

 # The full version, including alpha/beta/rc tags
-release = '0.43.0'
-
+release = '0.46.1'

 # -- General configuration ---------------------------------------------------

--- a/docs/manual/source/ulab.rst
+++ b/docs/manual/source/ulab.rst
@ -55,7 +55,8 @@ are implemented in a way that

 1. conforms to ``numpy`` as much as possible
 2. is so frugal with RAM as possible,
-3. and yet, fast. Much faster than pure python.
+3. and yet, fast. Much faster than pure python. Think of a number
+   between 30 and 50!

 The main points of ``ulab`` are

@ -69,7 +70,7 @@ The main points of ``ulab`` are
 -  polynomial fits to numerical data
 -  fast Fourier transforms

-At the time of writing this manual (for version 0.42.0), the library
+At the time of writing this manual (for version 0.46.0), the library
 adds approximately 40 kB of extra compiled code to the micropython
 (pyboard.v.11) firmware. However, if you are tight with flash space, you
 can easily shave off a couple of kB. See the section on `customising
@ -162,7 +163,7 @@ The first couple of lines of the file look like this

 .. code:: c

-   // vectorise (all functions) takes approx. 4.5 kB of flash space
+   // vectorise (all functions) takes approx. 6 kB of flash space
   #define ULAB_VECTORISE_MODULE (1)

   // linalg adds around 6 kB
@ -244,7 +245,8 @@ Basic ndarray operations
 `Comparison operators\* <#Comparison-operators>`__

 `Universal functions <#Universal-functions>`__ (also support function
-calls on general iterables)
+calls on general iterables, and vectorisation of user-defined ``python``
+functions.)

 Methods of ndarrays
 -------------------
@ -349,6 +351,10 @@ Filter functions
 Comparison of arrays
 --------------------

+`equal <#equal,-not_equal>`__
+
+`not_equal <#equal,-not_equal>`__
+
 `minimum <#minimum>`__

 `maximum <#maximum>`__
@ -405,7 +411,7 @@ Initialising by passing iterables
 If the iterable is one-dimensional, i.e., one whose elements are
 numbers, then a row vector will be created and returned. If the iterable
 is two-dimensional, i.e., one whose elements are again iterables, a
-matrix will be created. If the lengths of the iterables is not
+matrix will be created. If the lengths of the iterables are not
 consistent, a ``ValueError`` will be raised. Iterables of different
 types can be mixed in the initialisation function.

@ -1418,6 +1424,10 @@ columns the matrix has. This feature will be added in future versions of



+**WARNING:** ``circuitpython`` users should use the ``equal`` and
+``not_equal`` functions instead of the ``==`` and ``!=`` operators. See the
+section on `array comparison <#Comparison-of-arrays>`__ for details.
+
 Upcasting
 ~~~~~~~~~

@ -1511,6 +1521,8 @@ take the following snippet from the micropython manual:
        
    # code to be run in micropython
    
+    import utime
+    
    def timeit(f, *args, **kwargs):
        func_name = str(f).split(' ')[1]
        def new_func(*args, **kwargs):
@ -1940,14 +1952,14 @@ column. A couple of examples should make these statements clearer:
 Universal functions
 ===================

-Standard mathematical functions defined in the ``vector`` sub-module,
-and can be calculated on any scalar-valued iterable (ranges, lists,
-tuples containing numbers), and on ``ndarray``\ s without having to
-change the call signature. In all cases the functions return a new
-``ndarray`` of typecode ``float`` (since these functions usually
-generate float values, anyway). The functions execute faster with
-``ndarray`` arguments than with iterables, because the values of the
-input vector can be extracted faster.
+Standard mathematical functions are defined in the ``vector``
+sub-module, and can be calculated on any scalar, scalar-valued iterable
+(ranges, lists, tuples containing numbers), and on ``ndarray``\ s
+without having to change the call signature. In all cases the functions
+return a new ``ndarray`` of typecode ``float`` (since these functions
+usually generate float values, anyway). The functions execute faster
+with ``ndarray`` arguments than with iterables, because the values of
+the input vector can be extracted faster.

 At present, the following functions are supported:

@ -2010,6 +2022,73 @@ microseconds, because internally the function has to create the
 type, and then convert them to floats. All these steps are skipped for
 ``ndarray``\ s, because these pieces of information are already known.

+Doing the same with a ``list`` comprehension takes more than 25 times as
+long as with the ``ndarray`` (see the timings below), and the difference
+would grow further if we also converted the resulting list to an ``ndarray``.
+
+.. code::
+        
+    # code to be run in micropython
+    
+    import ulab as np
+    from ulab import vector
+    import math
+    
+    a = [0]*1000
+    b = np.array(a)
+    
+    @timeit
+    def timed_vector(iterable):
+        return vector.exp(iterable)
+    
+    @timeit
+    def timed_list(iterable):
+        return [math.exp(i) for i in iterable]
+    
+    print('iterating over ndarray in ulab')
+    timed_vector(b)
+    
+    print('\niterating over list in ulab')
+    timed_vector(a)
+    
+    print('\niterating over list in python')
+    timed_list(a)
+
+.. parsed-literal::
+
+    iterating over ndarray in ulab
+    execution time:  441  us
+    
+    iterating over list in ulab
+    execution time:  1266  us
+    
+    iterating over list in python
+    execution time:  11379  us
+    
+
+
+Vectorising generic python functions
+------------------------------------
+
+``numpy``:
+https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html
+
+The examples above use factory functions. In fact, they are nothing but
+the vectorised versions of the standard mathematical functions.
+User-defined ``python`` functions can also be vectorised with the help of
+``vectorize``. This function takes a positional argument, namely, the
+``python`` function that you want to vectorise, and a non-mandatory
+keyword argument, ``otypes``, which determines the ``dtype`` of the
+output array. The ``otypes`` must be ``None`` (default), or any of the
+``dtypes`` defined in ``ulab``. With ``None``, the output is
+automatically turned into a float array.
+
+The return value of ``vectorize`` is a ``micropython`` object that can
+be called as a standard function, but which now accepts either a scalar,
+an ``ndarray``, or a generic ``micropython`` iterable as its sole
+argument. Note that the function that is to be vectorised must have a
+single argument.
+
 .. code::
        
    # code to be run in micropython
@ -2017,28 +2096,196 @@ type, and then convert them to floats. All these steps are skipped for
    import ulab as np
    from ulab import vector
    
-    a = [0]*1000
-    b = np.array(a)
+    def f(x):
+        return x*x
    
-    @timeit
-    def measure_run_time(x):
-        return vector.exp(x)
+    vf = vector.vectorize(f)
    
-    measure_run_time(a)
+    # calling with a scalar
+    print('{:20}'.format('f on a scalar: '), vf(44.0))
    
-    measure_run_time(b)
+    # calling with an ndarray
+    a = np.array([1, 2, 3, 4])
+    print('{:20}'.format('f on an ndarray: '), vf(a))
+    
+    # calling with a list
+    print('{:20}'.format('f on a list: '), vf([2, 3, 4]))

 .. parsed-literal::

-    execution time:  1259  us
-    execution time:  408  us
+    f on a scalar:       array([1936.0], dtype=float)
+    f on an ndarray:     array([1.0, 4.0, 9.0, 16.0], dtype=float)
+    f on a list:         array([4.0, 9.0, 16.0], dtype=float)
+    
    


-Of course, such a time saving is reasonable only, if the data are
-already available as an ``ndarray``. If one has to initialise the
-``ndarray`` from the list, then there is no gain, because the iterator
-was simply pushed into the initialisation function.
+As mentioned, the ``dtype`` of the resulting ``ndarray`` can be
+specified via the ``otypes`` keyword. The value is bound to the function
+object that ``vectorize`` returns, therefore, if the same function is to
+be vectorised with different output types, then for each type a new
+function object must be created.
+
+.. code::
+        
+    # code to be run in micropython
+    
+    import ulab as np
+    from ulab import vector
+    
+    l = [1, 2, 3, 4]
+    def f(x):
+        return x*x
+    
+    vf1 = vector.vectorize(f, otypes=np.uint8)
+    vf2 = vector.vectorize(f, otypes=np.float)
+    
+    print('{:20}'.format('output is uint8: '), vf1(l))
+    print('{:20}'.format('output is float: '), vf2(l))
+
+.. parsed-literal::
+
+    output is uint8:     array([1, 4, 9, 16], dtype=uint8)
+    output is float:     array([1.0, 4.0, 9.0, 16.0], dtype=float)
+    
+    
+
+
+The ``otypes`` keyword argument cannot be used for type coercion: if the
+function evaluates to a float, but ``otypes`` would dictate an integer
+type, an exception will be raised:
+
+.. code::
+        
+    # code to be run in micropython
+    
+    import ulab as np
+    from ulab import vector
+    
+    int_list = [1, 2, 3, 4]
+    float_list = [1.0, 2.0, 3.0, 4.0]
+    def f(x):
+        return x*x
+    
+    vf = vector.vectorize(f, otypes=np.uint8)
+    
+    print('{:20}'.format('integer list: '), vf(int_list))
+    # this will raise a TypeError exception
+    print(vf(float_list))
+
+.. parsed-literal::
+
+    integer list:        array([1, 4, 9, 16], dtype=uint8)
+    
+    Traceback (most recent call last):
+      File "/dev/shm/micropython.py", line 14, in <module>
+    TypeError: can't convert float to int
+    
+
+
+Benchmarks
+~~~~~~~~~~
+
+It should be pointed out that the ``vectorize`` function produces the
+pseudo-vectorised version of the ``python`` function that is fed into
+it, i.e., on the C level, the same ``python`` function is called, with
+the all-encompassing ``mp_obj_t`` type arguments, and all that happens
+is that the ``for`` loop in ``[f(i) for i in iterable]`` runs purely in
+C. Since type checking and type conversion in ``f()`` are expensive, the
+speed-up is not as spectacular as when iterating over an ``ndarray``
+with a factory function: a gain of approximately 30% can be expected
+when a native ``python`` type (e.g., ``list``) is returned by the
+function, and this becomes around 50% (a factor of 2) if conversion to
+an ``ndarray`` is also counted.
+
+The following code snippet calculates the squares of 1000 numbers with
+the vectorised function (which returns an ``ndarray``), with ``list``
+comprehension, and with ``list`` comprehension followed by conversion to
+an ``ndarray``. For comparison, the execution time is also measured for
+the case when the squares are calculated entirely in ``ulab``.
+
+.. code::
+        
+    # code to be run in micropython
+    
+    import ulab as np
+    from ulab import vector
+    
+    def f(x):
+        return x*x
+    
+    vf = vector.vectorize(f)
+    
+    @timeit
+    def timed_vectorised_square(iterable):
+        return vf(iterable)
+    
+    @timeit
+    def timed_python_square(iterable):
+        return [f(i) for i in iterable]
+    
+    @timeit
+    def timed_ndarray_square(iterable):
+        return np.array([f(i) for i in iterable])
+    
+    @timeit
+    def timed_ulab_square(ndarray):
+        return ndarray**2
+    
+    print('vectorised function')
+    squares = timed_vectorised_square(range(1000))
+    
+    print('\nlist comprehension')
+    squares = timed_python_square(range(1000))
+    
+    print('\nlist comprehension + ndarray conversion')
+    squares = timed_ndarray_square(range(1000))
+    
+    print('\nsquaring an ndarray entirely in ulab')
+    a = np.array(range(1000))
+    squares = timed_ulab_square(a)
+
+.. parsed-literal::
+
+    vectorised function
+    execution time:  7237  us
+    
+    list comprehension
+    execution time:  10248  us
+    
+    list comprehension + ndarray conversion
+    execution time:  12562  us
+    
+    squaring an ndarray entirely in ulab
+    execution time:  560  us
+    
+
+
+From the comparisons above, it is obvious that ``python`` functions
+should be vectorised only when the same effect cannot be achieved in
+``ulab`` alone. However, although the time savings are not significant,
+there is still a good reason for caring about vectorised functions.
+Namely, user-defined ``python`` functions become universal, i.e., they
+can accept generic iterables as well as ``ndarray``\ s as their
+arguments. A vectorised function is still a one-liner, resulting in
+transparent and elegant code.
+
+A final comment on this subject: the ``f(x)`` that we defined is a
+*generic* ``python`` function. This means that it is not required that
+it just crunches some numbers. It has to return a number object, but it
+can still access the hardware in the meantime. So, e.g.,
+
+.. code:: python
+
+
+   led = pyb.LED(2)
+
+   def f(x):
+       if x < 100:
+           led.toggle()
+       return x*x
+
+is perfectly valid code.

 around
 ------
@ -3552,6 +3799,55 @@ Comparison of arrays
 Functions in the ``compare`` module can be called by importing the
 sub-module first.

+equal, not_equal
+----------------
+
+``numpy``:
+https://numpy.org/doc/stable/reference/generated/numpy.equal.html
+
+``numpy``:
+https://numpy.org/doc/stable/reference/generated/numpy.not_equal.html
+
+In ``micropython``, arrays or scalars can be compared by utilising the
+``==``, ``!=``, ``<``, ``>``, ``<=``, or ``>=`` binary
+operators. In ``circuitpython``, ``==`` and ``!=`` will produce
+unexpected results. In order to avoid this discrepancy, and to maintain
+compatibility with ``numpy``, ``ulab`` implements the ``equal`` and
+``not_equal`` operators that return the same results, irrespective of
+the ``python`` implementation.
+
+These two functions take two ``ndarray``\ s, or scalars as their
+arguments. No keyword arguments are implemented.
+
+.. code::
+        
+    # code to be run in micropython
+    
+    import ulab as np
+    
+    a = np.array(range(9))
+    b = np.zeros(9)
+    
+    print('a: ', a)
+    print('b: ', b)
+    print('\na == b: ', np.compare.equal(a, b))
+    print('a != b: ', np.compare.not_equal(a, b))
+    
+    # comparison with scalars
+    print('a == 2: ', np.compare.equal(a, 2))
+
+.. parsed-literal::
+
+    a:  array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], dtype=float)
+    b:  array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], dtype=float)
+    
+    a == b:  [True, False, False, False, False, False, False, False, False]
+    a != b:  [False, True, True, True, True, True, True, True, True]
+    a == 2:  [False, False, True, False, False, False, False, False, False]
+    
+    
+
+
 minimum
 -------

--- a/docs/ulab-change-log.md
+++ b/docs/ulab-change-log.md
@ -1,3 +1,21 @@
+Tue, 19 May 2020
+
+version 0.46.1
+
+    fixed bad error in binary_op
+
+Wed, 6 May 2020
+
+version 0.46.0
+
+    added vectorisation of python functions
+
+Sat, 2 May 2020
+
+version 0.45.0
+
+	add equal/not_equal to the compare module
+	
 Tue, 21 Apr 2020

 version 0.42.0
--- a/docs/ulab-manual.ipynb
+++ b/docs/ulab-manual.ipynb
@ -27,8 +27,8 @@
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2020-05-01T09:27:19.767404Z",
-     "start_time": "2020-05-01T09:27:19.748072Z"
+     "end_time": "2020-05-06T21:53:56.928052Z",
+     "start_time": "2020-05-06T21:53:56.920487Z"
    }
   },
   "outputs": [
@ -66,7 +66,7 @@
    "author = 'Zoltán Vörös'\n",
    "\n",
    "# The full version, including alpha/beta/rc tags\n",
-    "release = '0.43.0'\n",
+    "release = '0.46.1'\n",
    "\n",
    "\n",
    "# -- General configuration ---------------------------------------------------\n",
@ -120,11 +120,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 64,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2020-05-01T16:46:00.764988Z",
-     "start_time": "2020-05-01T16:45:57.343808Z"
+     "end_time": "2020-05-07T07:40:08.312112Z",
+     "start_time": "2020-05-07T07:40:04.392576Z"
    }
   },
   "outputs": [],
@ -300,11 +300,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2020-05-01T09:33:51.223397Z",
-     "start_time": "2020-05-01T09:33:51.216685Z"
+     "end_time": "2020-05-06T21:54:07.062285Z",
+     "start_time": "2020-05-06T21:54:07.057709Z"
    }
   },
   "outputs": [],
@ -318,11 +318,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2020-05-01T09:33:52.139197Z",
-     "start_time": "2020-05-01T09:33:52.104168Z"
+     "end_time": "2020-05-06T21:54:10.696369Z",
+     "start_time": "2020-05-06T21:54:10.661413Z"
    }
   },
   "outputs": [],
@ -400,11 +400,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 57,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2020-02-16T18:36:59.172039Z",
-     "start_time": "2020-02-16T18:36:59.144651Z"
+     "end_time": "2020-05-07T07:35:35.126401Z",
+     "start_time": "2020-05-07T07:35:35.105824Z"
    }
   },
   "outputs": [],
@ -416,11 +416,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": 60,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2020-02-16T18:50:42.907664Z",
-     "start_time": "2020-02-16T18:50:42.903709Z"
+     "end_time": "2020-05-07T07:36:13.751040Z",
+     "start_time": "2020-05-07T07:36:13.745081Z"
    }
   },
   "outputs": [],
@ -431,11 +431,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 521,
+   "execution_count": 58,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2019-10-20T06:48:05.984879Z",
-     "start_time": "2019-10-20T06:48:05.619747Z"
+     "end_time": "2020-05-07T07:35:38.725924Z",
+     "start_time": "2020-05-07T07:35:38.645488Z"
    }
   },
   "outputs": [
@ -510,7 +510,7 @@
    "\n",
    "1. conforms to `numpy` as much as possible\n",
    "2. is so frugal with RAM as possible,\n",
-    "3. and yet, fast. Much faster than pure python.\n",
+    "3. and yet, fast. Much faster than pure python. Think of a number between 30 and 50!\n",
    "\n",
    "The main points of `ulab` are \n",
    "\n",
@ -520,7 +520,7 @@
    "- polynomial fits to numerical data\n",
    "- fast Fourier transforms\n",
    "\n",
-    "At the time of writing this manual (for version 0.42.0), the library adds approximately 40 kB of extra compiled code to the micropython (pyboard.v.11) firmware. However, if you are tight with flash space, you can easily shave off a couple of kB. See the section on [customising ulab](#Custom_builds).\n",
+    "At the time of writing this manual (for version 0.46.0), the library adds approximately 40 kB of extra compiled code to the micropython (pyboard.v.11) firmware. However, if you are tight with flash space, you can easily shave off a couple of kB. See the section on [customising ulab](#Custom_builds).\n",
    "\n",
    "## Resources and legal matters\n",
    "\n",
@ -562,7 +562,7 @@
    "The first couple of lines of the file look like this\n",
    "\n",
    "```c\n",
-    "// vectorise (all functions) takes approx. 4.5 kB of flash space\n",
+    "// vectorise (all functions) takes approx. 6 kB of flash space\n",
    "#define ULAB_VECTORISE_MODULE (1)\n",
    "\n",
    "// linalg adds around 6 kB\n",
@ -636,7 +636,7 @@
    "\n",
    "[Comparison operators<sup>*</sup>](#Comparison-operators)\n",
    "\n",
-    "[Universal functions](#Universal-functions) (also support function calls on general iterables)\n",
+    "[Universal functions](#Universal-functions) (also support function calls on general iterables, and vectorisation of user-defined `python` functions.)\n",
    "\n",
    "\n",
    "## Methods of ndarrays\n",
@ -733,6 +733,10 @@
    "\n",
    "## Comparison of arrays\n",
    "\n",
+    "[equal](#equal,-not_equal)\n",
+    "\n",
+    "[not_equal](#equal,-not_equal)\n",
+    "\n",
    "[minimum](#minimum)\n",
    "\n",
    "[maximum](#maximum)\n",
@ -772,7 +776,7 @@
   "source": [
    "### Initialising by passing iterables\n",
    "\n",
-    "If the iterable is one-dimensional, i.e., one whose elements are numbers, then a row vector will be created and returned. If the iterable is two-dimensional, i.e., one whose elements are again iterables, a matrix will be created. If the lengths of the iterables is not consistent, a `ValueError` will be raised. Iterables of different types can be mixed in the initialisation function. \n",
+    "If the iterable is one-dimensional, i.e., one whose elements are numbers, then a row vector will be created and returned. If the iterable is two-dimensional, i.e., one whose elements are again iterables, a matrix will be created. If the lengths of the iterables are not consistent, a `ValueError` will be raised. Iterables of different types can be mixed in the initialisation function. \n",
    "\n",
    "If the `dtype` keyword with the possible `uint8/int8/uint16/int16/float` values is supplied, the new `ndarray` will have that type, otherwise, it assumes `float` as default. "
   ]
@ -2164,11 +2168,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2020-04-04T19:56:31.111796Z",
-     "start_time": "2020-04-04T19:56:31.091142Z"
+     "end_time": "2020-05-03T08:56:42.903058Z",
+     "start_time": "2020-05-03T08:56:42.890546Z"
    }
   },
   "outputs": [
@ -2228,6 +2232,13 @@
    "a+b"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**WARNING:** `circuitpython` users should use the `equal` and `not_equal` functions instead of the `==` and `!=` operators. See the section on [array comparison](#Comparison-of-arrays) for details."
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -2318,11 +2329,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 522,
+   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2019-10-20T06:49:27.584150Z",
-     "start_time": "2019-10-20T06:49:27.551381Z"
+     "end_time": "2020-05-07T06:39:52.225256Z",
+     "start_time": "2020-05-07T06:39:52.194691Z"
    }
   },
   "outputs": [
@ -2337,6 +2348,8 @@
   "source": [
    "%%micropython -pyboard 1\n",
    "\n",
+    "import utime\n",
+    "\n",
    "def timeit(f, *args, **kwargs):\n",
    "    func_name = str(f).split(' ')[1]\n",
    "    def new_func(*args, **kwargs):\n",
@ -2920,7 +2933,7 @@
   "source": [
    "# Universal functions\n",
    "\n",
-    "Standard mathematical functions defined in the `vector` sub-module, and can be calculated on any scalar-valued iterable (ranges, lists, tuples containing numbers), and on `ndarray`s without having to change the call signature. In all cases the functions return a new `ndarray` of typecode `float` (since these functions usually generate float values, anyway). The functions execute faster with `ndarray` arguments than with iterables, because the values of the input vector can be extracted faster. \n",
+    "Standard mathematical functions are defined in the `vector` sub-module, and can be calculated on any scalar, scalar-valued iterable (ranges, lists, tuples containing numbers), and on `ndarray`s without having to change the call signature. In all cases the functions return a new `ndarray` of typecode `float` (since these functions usually generate float values, anyway). The functions execute faster with `ndarray` arguments than with iterables, because the values of the input vector can be extracted faster. \n",
    "\n",
    "At present, the following functions are supported:\n",
    "\n",
@ -2988,16 +3001,18 @@
   "source": [
    "## Computation expenses\n",
    "\n",
-    "The overhead for calculating with micropython iterables is quite significant: for the 1000 samples below, the difference is more than 800 microseconds, because internally the function has to create the `ndarray` for the output, has to fetch the iterable's items of unknown type, and then convert them to floats. All these steps are skipped for `ndarray`s, because these pieces of information are already known. "
+    "The overhead for calculating with micropython iterables is quite significant: for the 1000 samples below, the difference is more than 800 microseconds, because internally the function has to create the `ndarray` for the output, has to fetch the iterable's items of unknown type, and then convert them to floats. All these steps are skipped for `ndarray`s, because these pieces of information are already known. \n",
+    "\n",
+    "Doing the same with a `list` comprehension takes more than 25 times as long as with the `ndarray`, and the difference would be even larger, if we converted the resulting list to an `ndarray`. "
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 526,
+   "execution_count": 59,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2019-10-20T06:51:19.483011Z",
-     "start_time": "2019-10-20T06:51:19.444764Z"
+     "end_time": "2020-05-07T07:35:45.696282Z",
+     "start_time": "2020-05-07T07:35:45.629909Z"
    }
   },
   "outputs": [
@ -3005,8 +3020,232 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "execution time:  1259  us\n",
-      "execution time:  408  us\n",
+      "iterating over ndarray in ulab\r\n",
+      "execution time:  441  us\r\n",
+      "\r\n",
+      "iterating over list in ulab\r\n",
+      "execution time:  1266  us\r\n",
+      "\r\n",
+      "iterating over list in python\r\n",
+      "execution time:  11379  us\r\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%micropython -pyboard 1\n",
+    "\n",
+    "import ulab as np\n",
+    "from ulab import vector\n",
+    "import math\n",
+    "\n",
+    "a = [0]*1000\n",
+    "b = np.array(a)\n",
+    "\n",
+    "@timeit\n",
+    "def timed_vector(iterable):\n",
+    "    return vector.exp(iterable)\n",
+    "\n",
+    "@timeit\n",
+    "def timed_list(iterable):\n",
+    "    return [math.exp(i) for i in iterable]\n",
+    "\n",
+    "print('iterating over ndarray in ulab')\n",
+    "timed_vector(b)\n",
+    "\n",
+    "print('\\niterating over list in ulab')\n",
+    "timed_vector(a)\n",
+    "\n",
+    "print('\\niterating over list in python')\n",
+    "timed_list(a)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Vectorising generic python functions\n",
+    "\n",
+    "`numpy`: https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html\n",
+    "\n",
+    "The examples above use factory functions. In fact, they are nothing but the vectorised versions of the standard mathematical functions. User-defined `python` functions can also be vectorised with the help of `vectorize`. This function takes a positional argument, namely, the `python` function that you want to vectorise, and a non-mandatory keyword argument, `otypes`, which determines the `dtype` of the output array. The `otypes` must be `None` (default), or any of the `dtypes` defined in `ulab`. With `None`, the output is automatically turned into a float array. \n",
+    "\n",
+    "The return value of `vectorize` is a `micropython` object that can be called as a standard function, but which now accepts either a scalar, an `ndarray`, or a generic `micropython` iterable as its sole argument. Note that the function that is to be vectorised must have a single argument."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-05-06T22:13:35.735953Z",
+     "start_time": "2020-05-06T22:13:35.720709Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "f on a scalar:       array([1936.0], dtype=float)\n",
+      "f on an ndarray:     array([1.0, 4.0, 9.0, 16.0], dtype=float)\n",
+      "f on a list:         array([4.0, 9.0, 16.0], dtype=float)\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%micropython -unix 1\n",
+    "\n",
+    "import ulab as np\n",
+    "from ulab import vector\n",
+    "\n",
+    "def f(x):\n",
+    "    return x*x\n",
+    "\n",
+    "vf = vector.vectorize(f)\n",
+    "\n",
+    "# calling with a scalar\n",
+    "print('{:20}'.format('f on a scalar: '), vf(44.0))\n",
+    "\n",
+    "# calling with an ndarray\n",
+    "a = np.array([1, 2, 3, 4])\n",
+    "print('{:20}'.format('f on an ndarray: '), vf(a))\n",
+    "\n",
+    "# calling with a list\n",
+    "print('{:20}'.format('f on a list: '), vf([2, 3, 4]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As mentioned, the `dtype` of the resulting `ndarray` can be specified via the `otypes` keyword. The value is bound to the function object that `vectorize` returns, therefore, if the same function is to be vectorised with different output types, then for each type a new function object must be created."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-05-06T22:17:56.649769Z",
+     "start_time": "2020-05-06T22:17:56.639524Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "output is uint8:     array([1, 4, 9, 16], dtype=uint8)\n",
+      "output is float:     array([1.0, 4.0, 9.0, 16.0], dtype=float)\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%micropython -unix 1\n",
+    "\n",
+    "import ulab as np\n",
+    "from ulab import vector\n",
+    "\n",
+    "l = [1, 2, 3, 4]\n",
+    "def f(x):\n",
+    "    return x*x\n",
+    "\n",
+    "vf1 = vector.vectorize(f, otypes=np.uint8)\n",
+    "vf2 = vector.vectorize(f, otypes=np.float)\n",
+    "\n",
+    "print('{:20}'.format('output is uint8: '), vf1(l))\n",
+    "print('{:20}'.format('output is float: '), vf2(l))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `otypes` keyword argument cannot be used for type coercion: if the function evaluates to a float, but `otypes` would dictate an integer type, an exception will be raised:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-05-06T22:21:43.616220Z",
+     "start_time": "2020-05-06T22:21:43.601280Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "integer list:        array([1, 4, 9, 16], dtype=uint8)\n",
+      "\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/dev/shm/micropython.py\", line 14, in <module>\n",
+      "TypeError: can't convert float to int\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%micropython -unix 1\n",
+    "\n",
+    "import ulab as np\n",
+    "from ulab import vector\n",
+    "\n",
+    "int_list = [1, 2, 3, 4]\n",
+    "float_list = [1.0, 2.0, 3.0, 4.0]\n",
+    "def f(x):\n",
+    "    return x*x\n",
+    "\n",
+    "vf = vector.vectorize(f, otypes=np.uint8)\n",
+    "\n",
+    "print('{:20}'.format('integer list: '), vf(int_list))\n",
+    "# this will raise a TypeError exception\n",
+    "print(vf(float_list))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Benchmarks\n",
+    "\n",
+    "It should be pointed out that the `vectorize` function produces the pseudo-vectorised version of the `python` function that is fed into it, i.e., on the C level, the same `python` function is called, with the all-encompassing `mp_obj_t` type arguments, and all that happens is that the `for` loop in `[f(i) for i in iterable]` runs purely in C. Since type checking and type conversion in `f()` are expensive, the speed-up is not as spectacular as when iterating over an `ndarray` with a factory function: a gain of approximately 30% can be expected when a native `python` type (e.g., `list`) is returned by the function, and this becomes around 50% (a factor of 2) if conversion to an `ndarray` is also counted.\n",
+    "\n",
+    "The following code snippet calculates the square of 1000 numbers with the vectorised function (which returns an `ndarray`), with `list` comprehension, and with `list` comprehension followed by conversion to an `ndarray`. For comparison, the execution time is also measured for the case when the square is calculated entirely in `ulab`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-05-07T07:32:20.048553Z",
+     "start_time": "2020-05-07T07:32:19.951851Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "vectorised function\r\n",
+      "execution time:  7237  us\r\n",
+      "\r\n",
+      "list comprehension\r\n",
+      "execution time:  10248  us\r\n",
+      "\r\n",
+      "list comprehension + ndarray conversion\r\n",
+      "execution time:  12562  us\r\n",
+      "\r\n",
+      "squaring an ndarray entirely in ulab\r\n",
+      "execution time:  560  us\r\n",
      "\n"
     ]
    }
@ -3017,23 +3256,60 @@
    "import ulab as np\n",
    "from ulab import vector\n",
    "\n",
-    "a = [0]*1000\n",
-    "b = np.array(a)\n",
+    "def f(x):\n",
+    "    return x*x\n",
+    "\n",
+    "vf = vector.vectorize(f)\n",
    "\n",
    "@timeit\n",
-    "def measure_run_time(x):\n",
-    "    return vector.exp(x)\n",
+    "def timed_vectorised_square(iterable):\n",
+    "    return vf(iterable)\n",
    "\n",
-    "measure_run_time(a)\n",
+    "@timeit\n",
+    "def timed_python_square(iterable):\n",
+    "    return [f(i) for i in iterable]\n",
    "\n",
-    "measure_run_time(b)"
+    "@timeit\n",
+    "def timed_ndarray_square(iterable):\n",
+    "    return np.array([f(i) for i in iterable])\n",
+    "\n",
+    "@timeit\n",
+    "def timed_ulab_square(ndarray):\n",
+    "    return ndarray**2\n",
+    "\n",
+    "print('vectorised function')\n",
+    "squares = timed_vectorised_square(range(1000))\n",
+    "\n",
+    "print('\\nlist comprehension')\n",
+    "squares = timed_python_square(range(1000))\n",
+    "\n",
+    "print('\\nlist comprehension + ndarray conversion')\n",
+    "squares = timed_ndarray_square(range(1000))\n",
+    "\n",
+    "print('\\nsquaring an ndarray entirely in ulab')\n",
+    "a = np.array(range(1000))\n",
+    "squares = timed_ulab_square(a)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Of course, such a time saving is reasonable only, if the data are already available as an `ndarray`. If one has to initialise the `ndarray` from the list, then there is no gain, because the iterator was simply pushed into the initialisation function."
+    "From the comparisons above, it is obvious that `python` functions should only be vectorised when the same effect cannot be achieved in `ulab` alone. However, although the time savings are not significant, there is still a good reason for caring about vectorised functions. Namely, user-defined `python` functions become universal, i.e., they can accept generic iterables as well as `ndarray`s as their arguments. A vectorised function is still a one-liner, resulting in transparent and elegant code.\n",
+    "\n",
+    "A final comment on this subject: the `f(x)` that we defined is a *generic* `python` function. This means that it is not required that it just crunches some numbers. It has to return a number object, but it can still access the hardware in the meantime. So, e.g., \n",
+    "\n",
+    "```python\n",
+    "\n",
+    "led = pyb.LED(2)\n",
+    "\n",
+    "def f(x):\n",
+    "    if x < 100:\n",
+    "        led.toggle()\n",
+    "    return x*x\n",
+    "```\n",
+    "\n",
+    "is perfectly valid code."
   ]
  },
  {
@ -4979,6 +5255,63 @@
    "Functions in the `compare` module can be called by importing the sub-module first."
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## equal, not_equal\n",
+    "\n",
+    "`numpy`: https://numpy.org/doc/stable/reference/generated/numpy.equal.html\n",
+    "\n",
+    "`numpy`: https://numpy.org/doc/stable/reference/generated/numpy.not_equal.html\n",
+    "\n",
+    "In `micropython`, arrays or scalars can be compared by utilising the `==`, `!=`, `<`, `>`, `<=`, or `>=` binary operators. In `circuitpython`, `==` and `!=` will produce unexpected results. In order to avoid this discrepancy, and to maintain compatibility with `numpy`, `ulab` implements the `equal` and `not_equal` functions that return the same results, irrespective of the `python` implementation.\n",
+    "\n",
+    "These two functions take two `ndarray`s, or scalars as their arguments. No keyword arguments are implemented."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-05-03T08:53:02.668348Z",
+     "start_time": "2020-05-03T08:53:02.656130Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "a:  array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], dtype=float)\n",
+      "b:  array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], dtype=float)\n",
+      "\n",
+      "a == b:  [True, False, False, False, False, False, False, False, False]\n",
+      "a != b:  [False, True, True, True, True, True, True, True, True]\n",
+      "a == 2:  [False, False, True, False, False, False, False, False, False]\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%micropython -unix 1\n",
+    "\n",
+    "import ulab as np\n",
+    "\n",
+    "a = np.array(range(9))\n",
+    "b = np.zeros(9)\n",
+    "\n",
+    "print('a: ', a)\n",
+    "print('b: ', b)\n",
+    "print('\\na == b: ', np.compare.equal(a, b))\n",
+    "print('a != b: ', np.compare.not_equal(a, b))\n",
+    "\n",
+    "# comparison with scalars\n",
+    "print('a == 2: ', np.compare.equal(a, 2))"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},