Enable gprof onboard profiling (#2669)

Adds a menu item to enable onboard profiling.  This requires significant
RAM and really only makes sense on devices with PSRAM to store the state.

When the menu item is selected, allocates RAM and tracks function calls and
periodically samples the PC to generate a histogram of application usage.
The onboard gmon.out file can be written over Semihosting or
some other way to transfer to a PC for analysis.

Adds a profiling example with command lines.
This commit is contained in:
Earle F. Philhower, III 2024-12-05 17:30:45 -08:00 committed by GitHub
parent 48bc91af36
commit 0061d3f97f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 1259 additions and 28 deletions

View file

@ -20,7 +20,7 @@ jobs:
uses: codespell-project/actions-codespell@v2 uses: codespell-project/actions-codespell@v2
with: with:
skip: ./ArduinoCore-API,./libraries/ESP8266SdFat,./libraries/Adafruit_TinyUSB_Arduino,./libraries/LittleFS/lib,./tools/pyserial,./pico-sdk,./.github,./docs/i2s.rst,./cores/rp2040/api,./libraries/FreeRTOS,./tools/libbearssl/bearssl,./include,./libraries/WiFi/examples/BearSSL_Server,./ota/uzlib,./libraries/http-parser/lib,./libraries/WebServer/examples/HelloServerBearSSL/HelloServerBearSSL.ino,./libraries/HTTPUpdateServer/examples/SecureBearSSLUpdater/SecureBearSSLUpdater.ino,./.git,./libraries/FatFS/lib/fatfs,./libraries/FatFS/src/diskio.h,./libraries/FatFS/src/ff.cpp,./libraries/FatFS/src/ffconf.h,./libraries/FatFS/src/ffsystem.cpp,./libraries/FatFS/src/ff.h,./libraries/lwIP_WINC1500/src/driver,./libraries/lwIP_WINC1500/src/common,./libraries/lwIP_WINC1500/src/bus_wrapper,./libraries/lwIP_WINC1500/src/spi_flash skip: ./ArduinoCore-API,./libraries/ESP8266SdFat,./libraries/Adafruit_TinyUSB_Arduino,./libraries/LittleFS/lib,./tools/pyserial,./pico-sdk,./.github,./docs/i2s.rst,./cores/rp2040/api,./libraries/FreeRTOS,./tools/libbearssl/bearssl,./include,./libraries/WiFi/examples/BearSSL_Server,./ota/uzlib,./libraries/http-parser/lib,./libraries/WebServer/examples/HelloServerBearSSL/HelloServerBearSSL.ino,./libraries/HTTPUpdateServer/examples/SecureBearSSLUpdater/SecureBearSSLUpdater.ino,./.git,./libraries/FatFS/lib/fatfs,./libraries/FatFS/src/diskio.h,./libraries/FatFS/src/ff.cpp,./libraries/FatFS/src/ffconf.h,./libraries/FatFS/src/ffsystem.cpp,./libraries/FatFS/src/ff.h,./libraries/lwIP_WINC1500/src/driver,./libraries/lwIP_WINC1500/src/common,./libraries/lwIP_WINC1500/src/bus_wrapper,./libraries/lwIP_WINC1500/src/spi_flash
ignore_words_list: ser,dout,shiftIn,acount ignore_words_list: ser,dout,shiftIn,acount,froms
- name: Get submodules for following tests - name: Get submodules for following tests
run: git submodule update --init run: git submodule update --init
- name: Check package references - name: Check package references

View file

@ -137,6 +137,8 @@ Read the [Contributing Guide](https://github.com/earlephilhower/arduino-pico/blo
* printf (i.e. debug) output over USB serial * printf (i.e. debug) output over USB serial
* Transparent use of PSRAM globals and heap (RP2350 only) * Transparent use of PSRAM globals and heap (RP2350 only)
* ARM or RISC-V (Hazard3) support for the RP2350 * ARM or RISC-V (Hazard3) support for the RP2350
* Semihosted serial and file system access
* GPROF profiling support
The RP2040 PIO state machines (SMs) are used to generate jitter-free: The RP2040 PIO state machines (SMs) are used to generate jitter-free:
* Servos * Servos

File diff suppressed because it is too large Load diff

View file

@ -18,9 +18,11 @@
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/ */
#include <Arduino.h>
#include <pico/runtime.h>
#ifdef PICO_RP2040 #ifdef PICO_RP2040
#include <Arduino.h>
#include <hardware/structs/psm.h> #include <hardware/structs/psm.h>
extern "C" void boot_double_tap_check(); extern "C" void boot_double_tap_check();
@ -35,3 +37,17 @@ void RP2040::enableDoubleResetBootloader() {
} }
#endif #endif
#ifdef __PROFILE
// Destination stream for the profile dump. RP2040::writeProfiling() assigns it
// right before handing __writeProfileCB to _writeProfile().
Stream *__profileFile;

// Write callback passed to _writeProfile(): forwards len bytes starting at data
// to the currently selected stream and returns that stream's write() result.
int __writeProfileCB(const void *data, int len) {
    const char *bytes = (const char *)data;
    return __profileFile->write(bytes, len);
}
// Register the profiler's setup hook with the Pico SDK runtime init sequence.
// NOTE(review): this #ifdef repeats the enclosing #ifdef __PROFILE and is redundant.
#ifdef __PROFILE
// Implemented in gprof_gmon.c; installs the periodic PC sampler
extern "C" void runtime_init_setup_profiling();
#define PICO_RUNTIME_INIT_PROFILING "11011" // Towards the end, after PSRAM
PICO_RUNTIME_INIT_FUNC_RUNTIME(runtime_init_setup_profiling, PICO_RUNTIME_INIT_PROFILING);
#endif
#endif

View file

@ -18,6 +18,8 @@
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/ */
#pragma once
#include <hardware/clocks.h> #include <hardware/clocks.h>
#include <hardware/irq.h> #include <hardware/irq.h>
#include <hardware/pio.h> #include <hardware/pio.h>
@ -45,6 +47,13 @@
extern "C" volatile bool __otherCoreIdled; extern "C" volatile bool __otherCoreIdled;
// C-linkage interface to the profiler; only declared when __PROFILE is defined.
extern "C" {
#ifdef __PROFILE
// Callback used to emit one chunk of profile data: a buffer and its length in bytes
typedef int (*profileWriteCB)(const void *data, int len);
// Emits the collected profile through writeCB (implemented in gprof_gmon.c)
extern void _writeProfile(profileWriteCB writeCB);
#endif
}
class _MFIFO { class _MFIFO {
public: public:
_MFIFO() { /* noop */ }; _MFIFO() { /* noop */ };
@ -180,7 +189,7 @@ public:
void begin() { void begin() {
_epoch = 0; _epoch = 0;
#if !defined(__riscv) #if !defined(__riscv) && !defined(__PROFILE)
if (!__isFreeRTOS) { if (!__isFreeRTOS) {
// Enable SYSTICK exception // Enable SYSTICK exception
exception_set_exclusive_handler(SYSTICK_EXCEPTION, _SystickHandler); exception_set_exclusive_handler(SYSTICK_EXCEPTION, _SystickHandler);
@ -193,7 +202,7 @@ public:
_ccountPgm->prepare(&_pio, &_sm, &off); _ccountPgm->prepare(&_pio, &_sm, &off);
ccount_program_init(_pio, _sm, off); ccount_program_init(_pio, _sm, off);
pio_sm_set_enabled(_pio, _sm, true); pio_sm_set_enabled(_pio, _sm, true);
#if !defined(__riscv) #if !defined(__riscv) && !defined(__PROFILE)
} }
#endif #endif
} }
@ -217,7 +226,7 @@ public:
// Get CPU cycle count. Needs to do magic to extens 24b HW to something longer // Get CPU cycle count. Needs to do magic to extens 24b HW to something longer
volatile uint64_t _epoch = 0; volatile uint64_t _epoch = 0;
inline uint32_t getCycleCount() { inline uint32_t getCycleCount() {
#if !defined(__riscv) #if !defined(__riscv) && !defined(__PROFILE)
if (!__isFreeRTOS) { if (!__isFreeRTOS) {
uint32_t epoch; uint32_t epoch;
uint32_t ctr; uint32_t ctr;
@ -229,13 +238,13 @@ public:
} else { } else {
#endif #endif
return ccount_read(_pio, _sm); return ccount_read(_pio, _sm);
#if !defined(__riscv) #if !defined(__riscv) && !defined(__PROFILE)
} }
#endif #endif
} }
inline uint64_t getCycleCount64() { inline uint64_t getCycleCount64() {
#if !defined(__riscv) #if !defined(__riscv) && !defined(__PROFILE)
if (!__isFreeRTOS) { if (!__isFreeRTOS) {
uint64_t epoch; uint64_t epoch;
uint64_t ctr; uint64_t ctr;
@ -247,7 +256,7 @@ public:
} else { } else {
#endif #endif
return ccount_read(_pio, _sm); return ccount_read(_pio, _sm);
#if !defined(__riscv) #if !defined(__riscv) && !defined(__PROFILE)
} }
#endif #endif
} }
@ -473,6 +482,21 @@ public:
#endif #endif
} }
#ifdef __PROFILE
// Emit the collected profile to f through _writeProfile().
void writeProfiling(Stream *f) {
    extern int __writeProfileCB(const void *data, int len);
    extern Stream *__profileFile;
    // Select the destination first: the callback reads __profileFile on every write
    __profileFile = f;
    _writeProfile(__writeProfileCB);
}
// Return the profiler's table size in bytes, as published in __profileMemSize.
size_t getProfileMemoryUsage() {
    extern int __profileMemSize;
    const size_t bytes = (size_t) __profileMemSize;
    return bytes;
}
#endif
private: private:
static void _SystickHandler() { static void _SystickHandler() {

View file

@ -30,32 +30,36 @@ extern bool __isFreeRTOS;
// FreeRTOS has been set up // FreeRTOS has been set up
extern volatile bool __freeRTOSinitted; extern volatile bool __freeRTOSinitted;
#ifdef __cplusplus
extern "C" { extern "C" {
struct QueueDefinition; /* Using old naming convention so as not to break kernel aware debuggers. */ #endif // __cplusplus
typedef struct QueueDefinition * QueueHandle_t; struct QueueDefinition; /* Using old naming convention so as not to break kernel aware debuggers. */
typedef QueueHandle_t SemaphoreHandle_t; typedef struct QueueDefinition * QueueHandle_t;
typedef int32_t BaseType_t; typedef QueueHandle_t SemaphoreHandle_t;
typedef int32_t BaseType_t;
extern bool __freertos_check_if_in_isr() __attribute__((weak)); extern bool __freertos_check_if_in_isr() __attribute__((weak));
extern SemaphoreHandle_t __freertos_mutex_create() __attribute__((weak)); extern SemaphoreHandle_t __freertos_mutex_create() __attribute__((weak));
extern SemaphoreHandle_t _freertos_recursive_mutex_create() __attribute__((weak)); extern SemaphoreHandle_t _freertos_recursive_mutex_create() __attribute__((weak));
extern void __freertos_mutex_take(SemaphoreHandle_t mtx) __attribute__((weak)); extern void __freertos_mutex_take(SemaphoreHandle_t mtx) __attribute__((weak));
extern int __freertos_mutex_take_from_isr(SemaphoreHandle_t mtx, BaseType_t* pxHigherPriorityTaskWoken) __attribute__((weak)); extern int __freertos_mutex_take_from_isr(SemaphoreHandle_t mtx, BaseType_t* pxHigherPriorityTaskWoken) __attribute__((weak));
extern int __freertos_mutex_try_take(SemaphoreHandle_t mtx) __attribute__((weak)); extern int __freertos_mutex_try_take(SemaphoreHandle_t mtx) __attribute__((weak));
extern void __freertos_mutex_give(SemaphoreHandle_t mtx) __attribute__((weak)); extern void __freertos_mutex_give(SemaphoreHandle_t mtx) __attribute__((weak));
extern void __freertos_mutex_give_from_isr(SemaphoreHandle_t mtx, BaseType_t* pxHigherPriorityTaskWoken) __attribute__((weak)); extern void __freertos_mutex_give_from_isr(SemaphoreHandle_t mtx, BaseType_t* pxHigherPriorityTaskWoken) __attribute__((weak));
extern void __freertos_recursive_mutex_take(SemaphoreHandle_t mtx) __attribute__((weak)); extern void __freertos_recursive_mutex_take(SemaphoreHandle_t mtx) __attribute__((weak));
extern int __freertos_recursive_mutex_try_take(SemaphoreHandle_t mtx) __attribute__((weak)); extern int __freertos_recursive_mutex_try_take(SemaphoreHandle_t mtx) __attribute__((weak));
extern void __freertos_recursive_mutex_give(SemaphoreHandle_t mtx) __attribute__((weak)); extern void __freertos_recursive_mutex_give(SemaphoreHandle_t mtx) __attribute__((weak));
extern void __freertos_idle_other_core() __attribute__((weak)); extern void __freertos_idle_other_core() __attribute__((weak));
extern void __freertos_resume_other_core() __attribute__((weak)); extern void __freertos_resume_other_core() __attribute__((weak));
extern void __freertos_task_exit_critical() __attribute__((weak)); extern void __freertos_task_exit_critical() __attribute__((weak));
extern void __freertos_task_enter_critical() __attribute__((weak)); extern void __freertos_task_enter_critical() __attribute__((weak));
#ifdef __cplusplus
} }
extern SemaphoreHandle_t __get_freertos_mutex_for_ptr(mutex_t *m, bool recursive = false); extern SemaphoreHandle_t __get_freertos_mutex_for_ptr(mutex_t *m, bool recursive = false);
#endif // __cplusplus

470
cores/rp2040/gprof_gmon.c Normal file
View file

@ -0,0 +1,470 @@
/* -
Copyright (c) 1983, 1992, 1993
The Regents of the University of California. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
4. Neither the name of the University nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
// This code is built as a C file because otherwise G++ would add profiling
// code to the preamble of these functions as well, leading to an infinite
// loop in the mcount routine. Because the Arduino IDE can't (easily)
// apply different compile parameters to different files, we set all C++
// files to "-pg" but leave all C files uninstrumented.
// Original code and organization taken from https://mcuoneclipse.com/2015/08/23/tutorial-using-gnu-profiling-gprof-with-arm-cortex-m/
#include <Arduino.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
// Frequency of sampling PC
#ifndef GMON_HZ
#define GMON_HZ 10000
#endif
// Fraction of text space to allocate for histogram counters (kcountsize = textsize / HISTFRACTION)
#ifndef HISTFRACTION
#ifdef PICO_RP2350
#define HISTFRACTION 4 // Every 8 bytes of .text
#else
#define HISTFRACTION 8 // Every 16 bytes of .text
#endif
#endif
// Fraction of text space to allocate for from hash buckets.
// The value of HASHFRACTION is based on the minimum number of bytes
// of separation between two subroutine call points in the object code.
// Given MIN_SUBR_SEPARATION bytes of separation the value of
// HASHFRACTION is calculated as:
//
// HASHFRACTION = MIN_SUBR_SEPARATION / (2 * sizeof(short) - 1);
//
// For example, on the VAX, the shortest two call sequence is:
//
// calls $0,(r0)
// calls $0,(r0)
//
// which is separated by only three bytes, thus HASHFRACTION is
// calculated as:
//
// HASHFRACTION = 3 / (2 * 2 - 1) = 1
//
// Note that the division above rounds down, thus if MIN_SUBR_SEPARATION
// is less than three, this algorithm will not work!
//
// In practice, however, call instructions are rarely at a minimal
// distance. Hence, we will define HASHFRACTION to be 2 across all
// architectures. This saves a reasonable amount of space for
// profiling data structures without (in practice) sacrificing
// any granularity.
#ifndef HASHFRACTION
#define HASHFRACTION 2
#endif
// Percent of text space to allocate for tostructs with a minimum.
#ifndef ARCDENSITY
#define ARCDENSITY 2 // This is in percentage, relative to text size!
#endif
// Lower and upper bounds applied to the number of tos[] entries (tolimit) in monstartup()
#define MINARCS 50
#define MAXARCS ((1 << (8 * sizeof(HISTCOUNTER))) - 2)
// Histogram counters are unsigned shorts (according to the kernel)
typedef uint16_t HISTCOUNTER; //#define HISTCOUNTER unsigned short
// In the original profiler code selfpc and count are full 32 bits each
// so the structure actually comes to 12 bytes due to padding (with 2
// bytes wasted per entry). We don't have that much to spare on the Picos,
// so limit the recorded address to 16MB (which is the flash address
// window, anyway) and the counts to 16M (saturating). This saves 4 bytes
// (33%) per entry at the cost of some logic to expand/pack it.
// One call-graph arc record. The two 24-bit fields are stored as little-endian
// byte triples; read and write them with getselfpc()/setselfpc() and
// getcount()/setcount() rather than directly.
struct tostruct {
uint8_t selfpc[3]; // Callee address/program counter. The caller address is in froms[] array which points to tos[] array
uint8_t count[3]; // How many times it has been called
uint16_t link; // Link to next entry in hash table. For tos[0] this points to the last used entry
};
// State of the PC sampler; _SystickHandler only records samples in PROFILE_ON
typedef enum { PROFILE_NOT_INIT = 0, PROFILE_ON, PROFILE_OFF } PROFILE_State;
// Everything the sampling interrupt needs to turn a PC into a histogram bin.
// Filled in by profile_ctl().
struct profinfo {
PROFILE_State state; // Profiling state
uint16_t *counter; // Profiling counters
size_t lowpc, highpc; // Range to be profiled
uint32_t scale; // Scale value of bins
};
// Global profinfo for profil() call
static struct profinfo prof = { PROFILE_NOT_INIT, 0, 0, 0, 0 };
// Possible states of profiling
typedef enum { GMON_PROF_ON = 0, GMON_PROF_BUSY, GMON_PROF_ERROR, GMON_PROF_OFF } GMON_State;
// The profiling data structures are housed in this structure.
struct gmonparam {
int state;
uint16_t *kcount; // Histogram PC sample array
size_t kcountsize; // Size of kcount[] array in bytes
uint16_t *froms; // Array of hashed 'from' addresses. The 16bit value is an index into the tos[] array
size_t fromssize; // Size of froms[] array in bytes
struct tostruct *tos; // to struct, contains histogram counter
size_t tossize; // Size of tos[] array in bytes
long tolimit;
size_t lowpc; // Low program counter of area
size_t highpc; // High program counter
size_t textsize; // Code size
};
static struct gmonparam _gmonparam = { GMON_PROF_OFF, NULL, 0, NULL, 0, NULL, 0, 0L, 0, 0, 0};
// Lazy-initialization flags used by _mcount_internal() around its call to monstartup()
static bool already_setup = false; // Flag to indicate if we need to init
static bool _perf_in_setup = false; // Are we currently trying to initialize? (avoid infinite recursion)
int __profileMemSize = 0; // Memory allocated by the profiler to store tables
// Histogram scale computed by monstartup() and passed to profile_ctl() by moncontrol()
static int s_scale = 0;
// Scale value at which kcountsize covers the whole text range (see monstartup())
#define SCALE_1_TO_1 0x10000L
// Map a program counter to its histogram bin: the halfword offset of pc from
// base, multiplied by scale/65536 (64-bit intermediate to avoid overflow).
static inline __attribute__((always_inline)) size_t profidx(size_t pc, size_t base, size_t scale) {
    const unsigned long long halfwords = (pc - base) >> 1;
    const unsigned long long scaled = halfwords * scale;
    return scaled >> 16;
}
// Sample the current program counter periodically
#if defined(__riscv)
// TODO - systick-like handler
#else
// SysTick exception handler: takes one PC sample and bumps the matching
// histogram counter. Does nothing while the other core is idled or while
// sampling is not in the PROFILE_ON state.
static void __no_inline_not_in_flash_func(_SystickHandler)(void) {
static size_t pc, idx; // Keep in static storage, not on the stack
extern volatile bool __otherCoreIdled;
if (!__otherCoreIdled && (prof.state == PROFILE_ON)) {
// NOTE(review): word 14 above the frame address is taken to be the interrupted PC;
// this offset presumably depends on the compiler-generated prologue - confirm when changing flags/toolchain
pc = ((uint32_t*)(__builtin_frame_address(0)))[14]; // Get SP and use it to get the return address from stack
// Only PCs inside the profiled range have a bin
if ((pc >= prof.lowpc) && (pc < prof.highpc)) {
idx = profidx(pc, prof.lowpc, prof.scale);
prof.counter[idx]++;
}
}
}
#endif
// Inverse of profidx(): map a histogram bin index back to the address where
// that bin starts, given the range base and the bin scale.
static inline __attribute__((always_inline)) size_t profaddr(size_t idx, size_t base, size_t scale) {
    const unsigned long long fixed = (unsigned long long)(idx) << 16;
    const unsigned long long halfwords = fixed / (unsigned long long)(scale);
    return base + (halfwords << 1);
}
// Start or stop PC sampling.
//   samples: buffer for the histogram, treated as size/2 uint16_t bins
//   size:    size of samples in bytes
//   offset:  lowest address covered by the histogram
//   scale:   bin scale, 1..65536 (65536 maps each bin to two addresses, 32768 to
//            four, 1 to 128k addresses); 0 turns sampling off
// Returns 0 on success, or -1 (leaving the sampler untouched) if scale is out of range.
static int __no_inline_not_in_flash_func(profile_ctl)(char *samples, size_t size, size_t offset, uint32_t scale) {
    if (scale > 65536) {
        return -1;
    }
    // Keep the sampler off while its parameters are rebuilt
    prof.state = PROFILE_OFF;
    if (scale == 0) {
        return 0;
    }
    bzero(samples, size);
    bzero(&prof, sizeof(prof));
    prof.counter = (uint16_t*)samples;
    prof.lowpc = offset;
    prof.highpc = profaddr(size >> 1, offset, scale);
    prof.scale = scale;
    // Enabled last, once every other field is valid
    prof.state = PROFILE_ON;
    return 0;
}
// Switch profiling on (mode != 0) or off (mode == 0). _gmonparam.state is what
// _mcount_internal() checks before recording an arc.
static void __no_inline_not_in_flash_func(moncontrol)(int mode) {
    if (!mode) {
        // Stop: disable the sampler, then mark arc recording as off
        profile_ctl((char *)NULL, 0, 0, 0);
        _gmonparam.state = GMON_PROF_OFF;
        return;
    }
    // Start: sample into kcount[] using the scale computed by monstartup()
    profile_ctl((char *)_gmonparam.kcount, _gmonparam.kcountsize, _gmonparam.lowpc, s_scale);
    _gmonparam.state = GMON_PROF_ON;
}
// General rounding functions
// Largest multiple of y that is not greater than x.
static inline __attribute__((always_inline)) size_t rounddown(size_t x, size_t y) {
    const size_t excess = x % y;
    return x - excess;
}
// Smallest multiple of y that is not less than x.
static inline __attribute__((always_inline)) size_t roundup(size_t x, size_t y) {
    const size_t bumped = x + y - 1;
    return bumped - (bumped % y);
}
// Allocate memory and set boundaries before any sampling is performed
void __no_inline_not_in_flash_func(monstartup)(size_t lowpc, size_t highpc) {
register size_t o;
char *cp;
struct gmonparam *p = &_gmonparam;
// Round lowpc and highpc to multiples of the density we're using so the rest of the scaling (here and in gprof) stays in ints.
p->lowpc = rounddown(lowpc, HISTFRACTION * sizeof(HISTCOUNTER));
p->highpc = roundup(highpc, HISTFRACTION * sizeof(HISTCOUNTER));
p->textsize = p->highpc - p->lowpc;
p->kcountsize = p->textsize / HISTFRACTION;
p->fromssize = p->textsize / HASHFRACTION;
p->tolimit = p->textsize * ARCDENSITY / 100;
if (p->tolimit < MINARCS) {
p->tolimit = MINARCS;
} else if (p->tolimit > MAXARCS) {
p->tolimit = MAXARCS;
}
p->tossize = p->tolimit * sizeof(struct tostruct);
__profileMemSize = p->kcountsize + p->fromssize + p->tossize;
#ifdef RP2350_PSRAM_CS
cp = pmalloc(__profileMemSize);
#else
cp = malloc(__profileMemSize);
#endif
if (cp == NULL) {
// OOM
already_setup = false;
return;
}
// Zero out cp as value will be added there
bzero(cp, p->kcountsize + p->fromssize + p->tossize);
p->tos = (struct tostruct *)cp;
cp += p->tossize;
p->kcount = (uint16_t *)cp;
cp += p->kcountsize;
p->froms = (uint16_t *)cp;
p->tos[0].link = 0;
o = p->highpc - p->lowpc;
if (p->kcountsize < o) {
s_scale = ((float)p->kcountsize / o) * SCALE_1_TO_1;
} else {
s_scale = SCALE_1_TO_1;
}
moncontrol(1); // Start
}
// Accessors for the selfpc and count fields
// Store the low 24 bits of address d into x->selfpc, least significant byte first.
static inline __attribute__((always_inline)) void setselfpc(struct tostruct *x, size_t d) {
    for (int i = 0; i < 3; i++) {
        x->selfpc[i] = (uint8_t)(d >> (8 * i));
    }
}
// Store the low 24 bits of d into x->count, least significant byte first.
static inline __attribute__((always_inline)) void setcount(struct tostruct *x, size_t d) {
    for (int i = 0; i < 3; i++) {
        x->count[i] = (uint8_t)(d >> (8 * i));
    }
}
// Rebuild the callee address from the 24 stored bits, OR-ing in 0x10000000 as
// the upper part of the address.
static inline __attribute__((always_inline)) uint32_t getselfpc(const struct tostruct *x) {
    uint32_t addr = 0x10000000;
    for (int i = 0; i < 3; i++) {
        addr |= ((uint32_t)x->selfpc[i]) << (8 * i);
    }
    return addr;
}
// Reassemble the 24-bit call count stored in x->count.
static inline __attribute__((always_inline)) uint32_t getcount(const struct tostruct *x) {
    uint32_t total = 0;
    for (int i = 0; i < 3; i++) {
        total |= ((uint32_t)x->count[i]) << (8 * i);
    }
    return total;
}
// Called by the GCC function shim (gprof_shim.S) on function entry to record an arc hit
void __no_inline_not_in_flash_func(_mcount_internal)(uint32_t *frompcindex, uint32_t *selfpc) {
register struct tostruct *top;
register struct tostruct *prevtop;
register long toindex;
struct gmonparam *p = &_gmonparam;
if (_perf_in_setup) {
// Avoid infinite recursion
return;
}
if (!already_setup) {
extern char __flash_binary_start; // Start of flash
extern char __etext; // End of .text
already_setup = true;
_perf_in_setup = true;
monstartup((uint32_t)&__flash_binary_start, (uint32_t)&__etext);
_perf_in_setup = false;
}
// Check that we are profiling and that we aren't recursively invoked.
if (p->state != GMON_PROF_ON) {
return;
}
p->state++;
// Check that frompcindex is a reasonable pc value.
frompcindex = (uint32_t*)((long)frompcindex - (long)p->lowpc);
if ((unsigned long)frompcindex > p->textsize) {
goto done;
}
frompcindex = (uint32_t*)&p->froms[((long)frompcindex) / (HASHFRACTION * sizeof(*p->froms))];
toindex = *((uint16_t*)frompcindex); // Get froms[] value
if (toindex == 0) {
// First time traversing this arc
toindex = ++p->tos[0].link; // The link of tos[0] points to the last used record in the array
if (toindex >= p->tolimit) { // More tos[] entries than we can handle!
goto overflow;
}
*((uint16_t*)frompcindex) = (uint16_t)toindex; // Store new 'to' value into froms[]
top = &p->tos[toindex];
setselfpc(top, (uint32_t)selfpc);
setcount(top, 1);
top->link = 0;
goto done;
}
top = &p->tos[toindex];
if (getselfpc(top) == (size_t)selfpc) {
// Arc at front of chain; usual case.
uint32_t cnt = getcount(top) + 1;
if (cnt >= 1 << 24) {
cnt = (1 << 24) - 1;
}
setcount(top, cnt);
goto done;
}
// Have to go looking down chain for it. top points to what we are looking at, prevtop points to previous top. We know it is not at the head of the chain.
for (; /* goto done */;) {
if (top->link == 0) {
// top is end of the chain and none of the chain had top->selfpc == selfpc, so we allocate a new tostruct and link it to the head of the chain.
toindex = ++p->tos[0].link;
if (toindex >= p->tolimit) {
goto overflow;
}
top = &p->tos[toindex];
setselfpc(top, (uint32_t)selfpc);
setcount(top, 1);
top->link = *((uint16_t*)frompcindex);
*(uint16_t*)frompcindex = (uint16_t)toindex;
goto done;
}
// Otherwise, check the next arc on the chain.
prevtop = top;
top = &p->tos[top->link];
if (getselfpc(top) == (size_t)selfpc) {
// Increment its count, move it to the head of the chain.
uint32_t cnt = getcount(top) + 1;
if (cnt >= 1 << 24) {
cnt = (1 << 24) - 1;
}
setcount(top, cnt);
toindex = prevtop->link;
prevtop->link = top->link;
top->link = *((uint16_t*)frompcindex);
*((uint16_t*)frompcindex) = (uint16_t)toindex;
goto done;
}
}
done:
p->state--;
return;
overflow:
p->state++; // Halt further profiling
return;
}
// Write out the GMON.OUT file using internal state.
//   writeCB: called with each chunk of output (buffer, length in bytes); its
//            return value is ignored
// Profiling is stopped first. The output is the header, then the PC histogram
// (kcount[]), then one rawarc record per recorded call arc. Nothing is written
// if the profiling tables were never allocated.
void _writeProfile(int (*writeCB)(const void *data, int len)) {
    struct gmonhdr { // GMON.OUT header
        size_t lpc; // base pc address of sample buffer
        size_t hpc; // max pc address of sampled buffer
        int ncnt; // size of sample buffer (plus this header)
        int version; // version number
        int profrate; // profiling clock rate
        int spare[3]; // reserved
    };
    const unsigned int GMONVERSION = 0x00051879;
    struct rawarc { // Per-arc on-disk data format
        size_t raw_frompc;
        size_t raw_selfpc;
        long raw_count;
    };
    enum { BS = 64 }; // Arcs buffered per write; a true constant so the buffer is not a VLA
    struct rawarc rawarcbuff[BS];
    int rawarcbuffptr = 0;
    struct gmonparam *p = &_gmonparam;
    struct gmonhdr hdr;
    int fromindex;
    int endfrom;
    int toindex;
    size_t frompc;

    moncontrol(0); // Stop

    // The tables only exist after a successful monstartup(); without them there
    // is nothing valid to read or write.
    if ((p->tos == NULL) || (p->kcount == NULL) || (p->froms == NULL)) {
        return;
    }

    // Clear the whole header so the reserved words are written as zeros
    memset(&hdr, 0, sizeof(hdr));
    hdr.lpc = p->lowpc;
    hdr.hpc = p->highpc;
    hdr.ncnt = p->kcountsize + sizeof(hdr);
    hdr.version = GMONVERSION;
    hdr.profrate = GMON_HZ;
    writeCB((void *)&hdr, sizeof(hdr));
    writeCB((void *)p->kcount, p->kcountsize);

    endfrom = p->fromssize / sizeof(*p->froms);
    for (fromindex = 0; fromindex < endfrom; fromindex++) {
        if (p->froms[fromindex] == 0) {
            continue;
        }
        // Undo the hashing done in _mcount_internal() to recover the caller address
        frompc = p->lowpc;
        frompc += fromindex * HASHFRACTION * sizeof(*p->froms);
        for (toindex = p->froms[fromindex]; toindex != 0; toindex = p->tos[toindex].link) {
            // Buffer up writes because Semihosting is really slow per write call
            struct rawarc *slot = &rawarcbuff[rawarcbuffptr++];
            slot->raw_frompc = frompc;
            slot->raw_selfpc = getselfpc(&p->tos[toindex]);
            slot->raw_count = getcount(&p->tos[toindex]);
            if (rawarcbuffptr == BS) {
                writeCB((void *)rawarcbuff, BS * sizeof(struct rawarc));
                rawarcbuffptr = 0;
            }
        }
    }
    // Write any remaining bits
    if (rawarcbuffptr) {
        writeCB((void *)rawarcbuff, rawarcbuffptr * sizeof(struct rawarc));
    }
}
// These are referenced by RP2040Support.cpp and called by the runtime init SDK
// Install a periodic PC sampler at the specified frequency
#if defined(__riscv)
// RISC-V placeholder: no periodic PC sampler is installed here yet, so this
// init hook does nothing.
void runtime_init_setup_profiling() {
// TODO - is there an equivalent? Or do we need to build a timer IRQ here?
}
#else
#include <hardware/exception.h>
#include <hardware/structs/systick.h>
// Install _SystickHandler as the SysTick exception handler and start SysTick
// so that the PC is sampled GMON_HZ times per second.
void runtime_init_setup_profiling() {
    exception_set_exclusive_handler(SYSTICK_EXCEPTION, _SystickHandler);
    // Program the reload value and clear the current count before enabling the
    // counter, so the first period is a full, well-defined interval.
    systick_hw->rvr = (F_CPU / GMON_HZ) - 1;
    systick_hw->cvr = 0;
    systick_hw->csr = 0x7; // Enable counter + tick interrupt, clocked from the processor clock
}
#endif

58
cores/rp2040/gprof_shim.S Normal file
View file

@ -0,0 +1,58 @@
#if defined(__riscv)
// Originally from https://github.com/sbzpro/riscv-gprof
# define RSIZE 4
.section .text
.align 2
.globl _mcount
// RISC-V entry point for the profiling call: saves ra, calls
// _mcount_internal with a1 = ra, then restores ra and returns.
// NOTE(review): a0 is passed through untouched - presumably the compiler's
// profiling prologue has already loaded it with the caller-side address; confirm.
_mcount:
// Reserve 4 words of stack and save the return address in the top one
addi sp,sp,-4*RSIZE
sw ra, 3*RSIZE(sp)
// Second argument: the address this routine returns to (inside the profiled function)
mv a1,ra
call _mcount_internal; //jal _mcount_internal
// Restore the return address and release the stack space
lw ra, 3*RSIZE(sp)
addi sp,sp,4*RSIZE
ret
#else
/*
* profiler.S
* Implements the gprof profiler arc counting function.
* Created on: 06.08.2015
* Author: Erich Styger
* Modified for RP2040/RP2350 on Dec 3 2024 by Earle F. Philhower, III.
*/
.syntax unified
.arch armv7-m
.cpu cortex-m0plus
.text
.thumb
.thumb_func
.align 2
.globl __gnu_mcount_nc
.type __gnu_mcount_nc, %function
.section .time_critical
// ARM entry point for the profiling call. Preserves r0-r3, calls
// _mcount_internal(r0 = word found on top of the stack at entry, r1 = LR at
// entry), then returns to the entry LR with that top-of-stack word restored
// into LR and removed from the stack.
__gnu_mcount_nc:
// LR = to return to
// SP = to-replace-LR with
push {r0, r1, r2, r3}
push {lr}
// Five words were pushed, so the word that was on top of the stack at entry is
// now at [sp, #20]. Swap it with the saved LR at [sp, #0].
ldr r0, [sp, #20]
ldr r1, [sp, #0]
str r0, [sp, #0]
str r1, [sp, #20]
// Arguments: r1 = LR at entry, r0 = the word just moved to [sp, #0]
mov r1, lr
ldr r0, [sp, #0] /* caller - at the top of the stack */
bl _mcount_internal /* when __gnu_mcount_nc is called */
// Pop the entry-time top-of-stack word into LR, restore r0-r3, then pop the
// entry LR (now at the old top-of-stack slot) straight into PC
pop {r0}
mov lr, r0
pop {r0, r1, r2, r3}
pop {pc}
.end __gnu_mcount_nc
#endif

View file

@ -44,6 +44,7 @@ For the latest version, always check https://github.com/earlephilhower/arduino-p
USB (Arduino and Adafruit_TinyUSB) <usb> USB (Arduino and Adafruit_TinyUSB) <usb>
Multicore Processing <multicore> Multicore Processing <multicore>
Semihosting <semihosting> Semihosting <semihosting>
Profiling (GPROF) <profiling>
RP2350 Specific Notes <rp2350> RP2350 Specific Notes <rp2350>
RP2350 PSRAM <psram> RP2350 PSRAM <psram>

76
docs/profiling.rst Normal file
View file

@ -0,0 +1,76 @@
Profiling Applications with GPROF
=================================
Applications running on the Pico can be profiled using GNU GPROF to show where the CPU is using its time
on the device and how often certain functions are called. It does this by recompiling the application
and adding a small preamble to each function built to identify what functions call what others (and
how frequently). It also uses the ``SYSTICK`` exception timer to sample and record the PC 10,000 times
per second. When an application is complete, the recorded data can be dumped to the host PC as a
``gmon.out`` file which can be processed by ``arm-none-eabi-gprof`` into useful data.
This histogram of PCs and tally of function caller/callees can take a significant amount of RAM, from 100KB
to 10000KB depending on the size of the application. As such, while the RP2040 **may** be able to
profile small applications, this is only really recommended on the RP2350 with external PSRAM. The
profiler will automatically use PSRAM when available. Call ``rp2040.getProfileMemoryUsage()`` to get the
memory allocated at runtime.
Profiling also adds processing overhead in terms of the periodic sampling and the function preambles.
In most cases there is no reason to enable (and many reasons to disable) profiling when an application
is deployed to the field.
Transferring the ``GMON.OUT`` data from the Pico to the host PC can be done by having the application
write it out to an SD card or a LittleFS filesystem which is then manually dumped, but for ease of use
semihosting can be used to allow the Pico (under the control of OpenOCD and GDB) to write the
``gmon.out`` file directly on the host PC, ready for use.
**NOTE** Semihosting only works when connected to an OpenOCD + GDB debug session. Running an application
compiled for Semihosting without the debugger will cause a panic and hang the chip.
As of now, only ARM has support for Semihosting or GPROF.
Enabling Profiling in an Application
------------------------------------
The ``Tools->Profiling->Enabled`` menu needs to be selected to enable profiling support in GCC. This will
add the necessary preamble to every function compiled (**Note** that the ``libpico`` and ``libc`` will not
be instrumented because they are pre-built so calls from them will not be fully instrumented. However,
PC data will still be grabbed and decoded from them at runtime.)
The application will automatically start collecting profiling data even before ``setup`` starts in this
mode. It will continue collecting data until you stop and write out the profiling data using
``rp2040.writeProfiling()`` to dump to the host, a file, serial port, etc.
For example, an application which does all its processing in ``setup()`` might look like:
.. code:: cpp
#include <SemiFS.h>
void setup() {
SerialSemi.printf("BEGIN\n");
do_some_work_that_takes_a_long_time_with_many_function_calls();
// Do lots of other work...
// Now all done...
SerialSemi.printf("Writing GMON.OUT\n");
SemiFS.begin();
File gmon = SemiFS.open("gmon.out", "w");
rp2040.writeProfiling(&gmon);
gmon.close();
SerialSemi.printf("END\n");
}
void loop() {}
Collecting and Analyzing Profile Data
-------------------------------------
Running this application under `semihosting <semihosting>`_ GDB and OpenOCD generates a ``gmon.out`` file
in the OpenOCD current working directory. This file, combined with the ``ELF`` binary built in the
IDE and loaded through GDB, can produce profiler output using
.. code::
$ /path/to/arm-none-eabi/bin/arm-none-eabi-gprof /path/to/sketch.ino.elf /path/to/gmon.out
See the ``rp2040/Profiling.ino`` example for more details.

View file

@ -65,6 +65,9 @@ getUsedPSRAMHeap KEYWORD2
getTotalPSRAMHeap KEYWORD2 getTotalPSRAMHeap KEYWORD2
getTotalPSRAM KEYWORD2 getTotalPSRAM KEYWORD2
getProfileMemoryUsage KEYWORD2
writeProfiling KEYWORD2
getChipID KEYWORD2 getChipID KEYWORD2
hwrand32 KEYWORD2 hwrand32 KEYWORD2

View file

@ -0,0 +1,104 @@
// This example should be run with profiling enabled from the IDE and
// under GDB/OpenOCD. It uses semihosting to write a gmon.out file to
// the host system with the profiled application results.
//
// Semihosting **ONLY** works with an OpenOCD and GDB setup. If you build
// and run a semihosting app without GDB connected, it **WILL CRASH**
//
// Start OpenOCD normally, but leave the terminal window visible because
// it is OpenOCD, not GDB, which will display the semihosting output.
// OpenOCD will also create files in the current working directory, so
// be sure it is a place you can find and write to.
//
// In GDB, connect to OpenOCD and then enable semihosting
// (gdb) target extended-remote localhost:3333
// (gdb) monitor arm semihosting enable
// (gdb) file /path/to/sketch.ino.elf
// (gdb) load
//
// Run the app from GDB and watch OpenOCD; it will display messages when
// the app is done and "gmon.out" is on the host system.
//
// (gdb) run
// .. pop to OpenOCD window
// [OpenOCD] BEGIN
// [OpenOCD] Result = 2417697592
// [OpenOCD] Writing GMON.OUT
// [OpenOCD] END
//
// From command line, decode the gmon.out using the ELF and gprof tool
//
// $ /path/to/arm-none-eabi/bin/arm-none-eabi-gprof /path/to/sketch.ino.elf /path/to/gmon.out | less
// Flat profile:
//
// Each sample counts as 0.0001 seconds.
// % cumulative self self total
// time seconds seconds calls ms/call ms/call name
// 50.56 1.74 1.74 3500020 0.00 0.00 __wrap___getreent
// 24.05 2.57 0.83 rand
// 8.32 2.86 0.29 5 57.36 57.36 fcn1(unsigned long)
// ...
// index % time self children called name
// <spontaneous>
// [1] 74.6 0.83 1.74 rand [1]
// 1.74 0.00 3500000/3500020 __wrap___getreent [2]
// -----------------------------------------------
// 0.00 0.00 1/3500020 realloc [106]
// 0.00 0.00 3/3500020 vsnprintf [54]
// 0.00 0.00 7/3500020 srand [7]
// 0.00 0.00 9/3500020 malloc [105]
// 1.74 0.00 3500000/3500020 rand [1]
// ...
#ifndef __PROFILE
// Fallback sketch built when __PROFILE is not defined: it only prints a hint
// telling the user to enable profiling, since there is nothing to profile.
void setup() {
Serial.printf("Enable profiling to run this example.\n");
}
// Nothing left to do after the hint has been printed.
void loop() {
}
#else
#ifdef __riscv
// RISC-V variant of the sketch: intentionally empty, because the example
// depends on semihosting and that is not available for RISC-V yet.
void setup() {
// No semihosting for RISCV yet
}
// Nothing to run in the RISC-V variant.
void loop() {
}
#else
#include <SemiFS.h>
// Profiling workload #1: seeds the C library PRNG with `st`, then adds
// 500000 consecutive rand() results onto `st` (unsigned, wrapping) and
// returns the total.
uint32_t fcn1(uint32_t st) {
    srand(st);
    int remaining = 500000;
    while (remaining > 0) {
        st = st + rand();
        --remaining;
    }
    return st;
}
// Profiling workload #2: same accumulation as fcn1, but the PRNG is seeded
// with `st * st` instead of `st`. Returns `st` plus the sum of 500000
// rand() results (unsigned, wrapping).
uint32_t fcn2(uint32_t st) {
    const uint32_t seed = st * st;
    srand(seed);
    int remaining = 500000;
    while (remaining > 0) {
        st = st + rand();
        --remaining;
    }
    return st;
}
// Runs the sample workload once, reports its result over the semihosting
// console, then writes the collected profiling data to "gmon.out" through
// the semihosting filesystem.
void setup() {
    SerialSemi.printf("BEGIN\n");

    // Each worker re-seeds the PRNG on entry, so computing the two operands
    // separately yields the same product as the single nested expression.
    const uint32_t lhs = fcn2(fcn1(3));
    const uint32_t rhs = fcn1(fcn1(fcn1(fcn1(2))));
    SerialSemi.printf("Result = %lu\n", fcn2(lhs * rhs));

    SerialSemi.printf("Writing GMON.OUT\n");
    SemiFS.begin();
    File gmon = SemiFS.open("gmon.out", "w");
    rp2040.writeProfiling(&gmon);
    gmon.close();

    SerialSemi.printf("END\n");
}
// All of the example's work is done once in setup(); nothing repeats here.
void loop() {
}
#endif
#endif // !__PROFILE

View file

@ -64,7 +64,7 @@ compiler.c.elf.flags={compiler.warning_flags} {compiler.defines} {compiler.flags
compiler.S.cmd={build.toolchain}-gcc compiler.S.cmd={build.toolchain}-gcc
compiler.S.flags=-c {compiler.warning_flags} {compiler.defines} -g -x assembler-with-cpp -MMD {compiler.includes} {build.toolchainopts} -g compiler.S.flags=-c {compiler.warning_flags} {compiler.defines} -g -x assembler-with-cpp -MMD {compiler.includes} {build.toolchainopts} -g
compiler.cpp.cmd={build.toolchain}-g++ compiler.cpp.cmd={build.toolchain}-g++
compiler.cpp.flags=-c {compiler.warning_flags} {compiler.defines} {compiler.flags} -MMD {compiler.includes} {build.flags.rtti} -std=gnu++17 -g -pipe compiler.cpp.flags=-c {compiler.warning_flags} {compiler.defines} {compiler.flags} -MMD {compiler.includes} {build.flags.rtti} {build.flags.profile} -std=gnu++17 -g -pipe
compiler.ar.cmd={build.toolchain}-ar compiler.ar.cmd={build.toolchain}-ar
compiler.ar.flags=rcs compiler.ar.flags=rcs
@ -98,6 +98,7 @@ build.psram_freq=
build.eeprom_start= build.eeprom_start=
build.flags.optimize=-Os build.flags.optimize=-Os
build.flags.rtti=-fno-rtti build.flags.rtti=-fno-rtti
build.flags.profile=
build.fs_start= build.fs_start=
build.fs_end= build.fs_end=
build.usbstack_flags= build.usbstack_flags=

View file

@ -72,6 +72,8 @@ def compile(tmp_dir, sketch, cache, tools_dir, hardware_dir, ide_path, f, args):
fqbn = fqbn.replace("rpipico", "rpipicow") fqbn = fqbn.replace("rpipico", "rpipicow")
if ('/BT' in sketch) or ('/BLE' in sketch) or ('/Bluetooth' in sketch): if ('/BT' in sketch) or ('/BLE' in sketch) or ('/Bluetooth' in sketch):
fqbn = fqbn + ",ipbtstack=ipv4btcble" fqbn = fqbn + ",ipbtstack=ipv4btcble"
if '/Profiling' in sketch:
fqbn = fqbn + ",profile=Enabled"
cmd += [fqbn] cmd += [fqbn]
cmd += ['-built-in-libraries', ide_path + '/libraries'] cmd += ['-built-in-libraries', ide_path + '/libraries']
cmd += ['-ide-version=10607'] cmd += ['-ide-version=10607']

View file

@ -93,6 +93,12 @@ def BuildOptimize(name):
print("%s.menu.opt.%s=%s (%s)%s" % (name, l[0], l[1], l[2], l[3])) print("%s.menu.opt.%s=%s (%s)%s" % (name, l[0], l[1], l[2], l[3]))
print("%s.menu.opt.%s.build.flags.optimize=%s" % (name, l[0], l[2])) print("%s.menu.opt.%s.build.flags.optimize=%s" % (name, l[0], l[2]))
def BuildProfile(name):
    """Print the "Profiling" menu entries for board `name`.

    Two options are emitted: Disabled, which leaves build.flags.profile
    empty, and Enabled, which sets it to "-pg -D__PROFILE".
    """
    templates = (
        "%s.menu.profile.Disabled=Disabled",
        "%s.menu.profile.Disabled.build.flags.profile=",
        "%s.menu.profile.Enabled=Enabled",
        "%s.menu.profile.Enabled.build.flags.profile=-pg -D__PROFILE",
    )
    for template in templates:
        print(template % (name))
def BuildRTTI(name): def BuildRTTI(name):
print("%s.menu.rtti.Disabled=Disabled" % (name)) print("%s.menu.rtti.Disabled=Disabled" % (name))
print("%s.menu.rtti.Disabled.build.flags.rtti=-fno-rtti" % (name)) print("%s.menu.rtti.Disabled.build.flags.rtti=-fno-rtti" % (name))
@ -282,6 +288,7 @@ def BuildGlobalMenuList():
print("menu.freq=CPU Speed") print("menu.freq=CPU Speed")
print("menu.arch=CPU Architecture") print("menu.arch=CPU Architecture")
print("menu.opt=Optimize") print("menu.opt=Optimize")
print("menu.profile=Profiling")
print("menu.rtti=RTTI") print("menu.rtti=RTTI")
print("menu.stackprotect=Stack Protector") print("menu.stackprotect=Stack Protector")
print("menu.exceptions=C++ Exceptions") print("menu.exceptions=C++ Exceptions")
@ -353,6 +360,7 @@ def MakeBoard(name, chip, vendor_name, product_name, vid, pid, pwr, boarddefine,
else: else:
BuildFreq(name, 133) BuildFreq(name, 133)
BuildOptimize(name) BuildOptimize(name)
BuildProfile(name)
BuildRTTI(name) BuildRTTI(name)
BuildStackProtect(name) BuildStackProtect(name)
BuildExceptions(name) BuildExceptions(name)

View file

@ -120,4 +120,7 @@ static const uint8_t SCK = PIN_SPI0_SCK;
#define CRYPTO_WIRE Wire #define CRYPTO_WIRE Wire
#define USB_MAX_POWER (500) #define USB_MAX_POWER (500)
#ifdef __cplusplus
#include "nina_pins.h" #include "nina_pins.h"
#endif

View file

@ -1,6 +1,8 @@
#pragma once #pragma once
#ifdef __cplusplus
#include <Ilabs2040WiFiClass.h> #include <Ilabs2040WiFiClass.h>
#endif
#define PINS_COUNT (26u) #define PINS_COUNT (26u)
#define NUM_DIGITAL_PINS (26u) #define NUM_DIGITAL_PINS (26u)