diff --git a/docs/spi.rst b/docs/spi.rst
index a6fee28..8b87220 100644
--- a/docs/spi.rst
+++ b/docs/spi.rst
@@ -39,7 +39,39 @@ in order to consunme the received data and provide data to transmit.
 * The callbacks operate at IRQ time and may be called very frequently at high SPI frequencies.  So, make then small, fast, and with no memory allocations or locking.
 
 
-Examples
-~~~~~~~~
+Asynchronous Operation
+======================
 
-See the SPItoMyself example for a complete Master and Slave application.
+Applications can use asynchronous SPI calls to allow for processing while long-running SPI transfers are
+being performed.  For example, a game could send a full screen update out over SPI and immediately start
+processing the next frame without waiting for the first one to be sent.  DMA is used to handle
+the transfer to/from the hardware, freeing the CPU from bit-banging or busy waiting.
+
+Note that asynchronous operations cannot be interspersed with normal, synchronous ones.  ``transferAsync``
+should still occur after a ``beginTransaction()`` and when ``finishedAsync()`` returns ``true`` then
+``endTransaction()`` should also be called.
+
+All buffers need to be valid throughout the entire operation.  Read data cannot be accessed until
+the transaction is completed and can't be "peeked" at while the operation is ongoing.
+
+bool transferAsync(const void \*send, void \*recv, size_t bytes)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Begins an SPI asynchronous transaction.  Either ``send`` or ``recv`` can be ``nullptr`` if data only needs
+to be transferred in one direction.
+Check ``finishedAsync()`` to determine when the operation completes and conclude the transaction.
+This operation needs to allocate a buffer from heap equal to ``bytes`` in size if ``send`` is given and ``LSBFIRST`` bit order is used.
+
+bool finishedAsync()
+~~~~~~~~~~~~~~~~~~~~
+Call to check if the asynchronous operation has completed and the buffer passed in can be either read or
+reused.  Frees the allocated memory and completes the asynchronous transaction.
+
+void abortAsync()
+~~~~~~~~~~~~~~~~~
+Cancels the outstanding asynchronous transaction and frees any allocated memory.
+
+
+Examples
+========
+
+See the SPItoMyself and SPItoMyselfAsync examples for a complete Master and Slave application.
diff --git a/libraries/SPI/examples/SPItoMyselfAsync/SPItoMyselfAsync.ino b/libraries/SPI/examples/SPItoMyselfAsync/SPItoMyselfAsync.ino
new file mode 100644
index 0000000..cf8b997
--- /dev/null
+++ b/libraries/SPI/examples/SPItoMyselfAsync/SPItoMyselfAsync.ino
@@ -0,0 +1,93 @@
+// Shows how to use SPISlave on a single device in asynchronous mode
+// Core0 runs as an SPI master and initiates a transmission to the slave
+// Core1 runs the SPI Slave mode and provides a unique reply to messages from the master
+//
+// Released to the public domain 2024 by Earle F. Philhower, III <earlephilhower@yahoo.com>
+
+#include <SPI.h>
+#include <SPISlave.h>
+
+// Wiring:
+// Master RX  GP0 <-> GP11  Slave TX
+// Master CS  GP1 <-> GP9   Slave CS
+// Master CK  GP2 <-> GP10  Slave CK
+// Master TX  GP3 <-> GP8   Slave RX
+
+SPISettings spisettings(1000000, MSBFIRST, SPI_MODE0);
+
+// Core 0 will be SPI master
+void setup() {
+  SPI.setRX(0);  // Master-side pins, matching the wiring table above
+  SPI.setCS(1);
+  SPI.setSCK(2);
+  SPI.setTX(3);
+  SPI.begin(true);  // NOTE(review): `true` presumably selects hardware-controlled CS — confirm against SPI.h
+
+  delay(5000);  // NOTE(review): presumably lets core 1 finish setup1() (which itself delays 3 s) first — confirm
+}
+
+int transmits = 0;
+void loop() {
+  char msg[42];  // Same size as the slave's recvBuff/sendBuff
+  int loops = 0;  // Counts finishedAsync() polls to show the CPU was free during the transfer
+  memset(msg, 0, sizeof(msg));
+  snprintf(msg, sizeof(msg), "What's up? This is transmission %d", transmits);  // Bounded so a large counter can't overrun msg
+  Serial.printf("\n\nM-SEND: '%s'\n", msg);
+  SPI.beginTransaction(spisettings);
+  bool started = SPI.transferAsync(msg, msg, sizeof(msg));  // msg is sent, then overwritten in place with the reply
+  while (started && !SPI.finishedAsync()) {  // Only poll when a transfer was really started
+    loops++;
+  }
+  SPI.endTransaction();
+  Serial.printf("M-RECV: '%s', idle loops %d\n", started ? msg : "<transferAsync failed>", loops);
+  transmits++;
+  delay(5000);
+}
+
+// Core 1 will be SPI slave
+
+volatile bool recvBuffReady = false;
+char recvBuff[42] = "";
+int recvIdx = 0;
+void recvCallback(uint8_t *data, size_t len) {
+  memcpy(recvBuff + recvIdx, data, min(len, sizeof(recvBuff) - recvIdx));  // Clamp so an oversized chunk can't overrun recvBuff
+  recvIdx += min(len, sizeof(recvBuff) - recvIdx);  // Advance by the number of bytes actually stored
+  if (recvIdx == sizeof(recvBuff)) {  // A complete fixed-size message has been collected
+    recvBuffReady = true;  // Signal loop1() to print it
+    recvIdx = 0;  // Next message starts at the front of the buffer
+  }
+}
+
+int sendcbs = 0;
+// Note that the buffer needs to be long lived, the SPISlave doesn't copy it.  So no local stack variables, only globals or heap(malloc/new) allocations.
+char sendBuff[42];
+void sentCallback() {
+  memset(sendBuff, 0, sizeof(sendBuff));  // Zero-fill so the fixed-size frame is NUL-padded
+  sprintf(sendBuff, "Slave to Master Xmission %d", sendcbs++);  // Number each reply so every message is unique
+  SPISlave1.setData((uint8_t*)sendBuff, sizeof(sendBuff));  // Hand the whole buffer to the slave as the data to transmit
+}
+
+// Note that we use SPISlave1 here **not** because we're running on
+// Core 1, but because SPI0 is being used already.  You can use
+// SPISlave or SPISlave1 on any core.
+void setup1() {
+  SPISlave1.setRX(8);  // Slave-side pins, matching the wiring table at the top of the sketch
+  SPISlave1.setCS(9);
+  SPISlave1.setSCK(10);
+  SPISlave1.setTX(11);
+  // Ensure we start with something to send...
+  sentCallback();
+  // Hook our callbacks into the slave
+  SPISlave1.onDataRecv(recvCallback);
+  SPISlave1.onDataSent(sentCallback);
+  SPISlave1.begin(spisettings);  // Uses the same SPISettings object as the master
+  delay(3000);  // NOTE(review): presumably gives the serial monitor time to attach before printing — confirm
+  Serial.println("S-INFO: SPISlave started");
+}
+
+void loop1() {
+  if (recvBuffReady) {  // Set by recvCallback() once recvBuff holds a full message
+    Serial.printf("S-RECV: '%s'\n", recvBuff);
+    recvBuffReady = false;  // Re-arm for the next message
+  }
+}
diff --git a/libraries/SPI/keywords.txt b/libraries/SPI/keywords.txt
index be95db9..a2800cb 100644
--- a/libraries/SPI/keywords.txt
+++ b/libraries/SPI/keywords.txt
@@ -26,6 +26,9 @@ setRX	KEYWORD2
 setTX	KEYWORD2
 setSCK	KEYWORD2
 setCS	KEYWORD2
+transferAsync	KEYWORD2
+finishedAsync	KEYWORD2
+abortAsync	KEYWORD2
 
 #######################################
 # Constants (LITERAL1)
diff --git a/libraries/SPI/src/SPI.cpp b/libraries/SPI/src/SPI.cpp
index 7e6736f..8252339 100644
--- a/libraries/SPI/src/SPI.cpp
+++ b/libraries/SPI/src/SPI.cpp
@@ -19,6 +19,7 @@
 */
 
 #include "SPI.h"
+#include <hardware/dma.h>
 #include <hardware/spi.h>
 #include <hardware/gpio.h>
 #include <hardware/structs/iobank0.h>
@@ -218,7 +219,7 @@ void SPIClassRP2040::beginTransaction(SPISettings settings) {
 void SPIClassRP2040::endTransaction(void) {
     noInterrupts(); // Avoid race condition so the GPIO IRQs won't come back until all state is restored
     DEBUGSPI("SPI::endTransaction()\n");
-    // Re-enablke IRQs
+    // Re-enable IRQs
     for (auto entry : _usingIRQs) {
         int gpio = entry.first;
         int mode = entry.second;
@@ -230,6 +231,103 @@ void SPIClassRP2040::endTransaction(void) {
     interrupts();
 }
 
+bool SPIClassRP2040::transferAsync(const void *send, void *recv, size_t bytes) { // Starts a DMA transfer of `bytes` bytes; returns true if it was started
+    DEBUGSPI("SPI::transferAsync(%p, %p, %d)\n", send, recv, bytes);
+    const uint8_t *txbuff = reinterpret_cast<const uint8_t *>(send);
+    uint8_t *rxbuff = reinterpret_cast<uint8_t *>(recv);
+    _dummy = 0xffffffff; // Stand-in word: DMA source when send is nullptr, DMA sink when recv is nullptr
+
+    if (!_initted || (!send && !recv)) { // Needs _initted set and at least one real buffer
+        return false;
+    }
+
+    _channelDMA = dma_claim_unused_channel(false); // Receive-side channel; -1 (checked below) is treated as "none available"
+    if (_channelDMA == -1) {
+        return false;
+    }
+    _channelSendDMA = dma_claim_unused_channel(false); // Transmit-side channel
+    if (_channelSendDMA == -1) {
+        dma_channel_unclaim(_channelDMA); // Give back the channel already claimed
+        return false;
+    }
+
+    if (send && (_spis.getBitOrder() != MSBFIRST)) { // Non-MSBFIRST: transmit from a bit-reversed heap copy of send
+        _dmaBuffer = (uint8_t *)malloc(bytes);
+        if (!_dmaBuffer) {
+            dma_channel_unclaim(_channelDMA); // Allocation failed: release both channels
+            dma_channel_unclaim(_channelSendDMA);
+            return false;
+        }
+        for (size_t i = 0; i < bytes; i++) {
+            _dmaBuffer[i] = reverseByte(txbuff[i]);
+        }
+    }
+    _dmaBytes = bytes; // Remembered for finishedAsync()
+    _rxFinalBuffer = rxbuff; // Remembered for finishedAsync(); nullptr when recv is nullptr
+
+    hw_write_masked(&spi_get_hw(_spi)->cr0, (8 - 1) << SPI_SSPCR0_DSS_LSB, SPI_SSPCR0_DSS_BITS); // Fast set to 8-bits
+
+    dma_channel_config c = dma_channel_get_default_config(_channelSendDMA);
+    channel_config_set_transfer_data_size(&c, DMA_SIZE_8); // 8b transfers into SPI FIFO
+    channel_config_set_read_increment(&c, send ? true : false); // Walk the source buffer; stay on _dummy when send is nullptr
+    channel_config_set_write_increment(&c, false); // Writing to the same FIFO address
+    channel_config_set_dreq(&c, spi_get_dreq(_spi, true)); // Wait for the TX FIFO specified
+    channel_config_set_chain_to(&c, _channelSendDMA); // Chained to itself, i.e. no chaining
+    channel_config_set_irq_quiet(&c, true); // No need for IRQ
+    dma_channel_configure(_channelSendDMA, &c, &spi_get_hw(_spi)->dr, !send ? (uint8_t *)&_dummy : (_spis.getBitOrder() != MSBFIRST ? _dmaBuffer : txbuff), bytes, false); // Source is _dummy, the reversed copy, or the caller's buffer
+
+    c = dma_channel_get_default_config(_channelDMA);
+    channel_config_set_transfer_data_size(&c, DMA_SIZE_8); // 8b transfers out of SPI FIFO
+    channel_config_set_read_increment(&c, false); // Reading same FIFO address
+    channel_config_set_write_increment(&c, recv ? true : false); // Walk the receive buffer; stay on _dummy when recv is nullptr
+    channel_config_set_dreq(&c, spi_get_dreq(_spi, false)); // Wait for the RX FIFO specified
+    channel_config_set_chain_to(&c, _channelDMA); // Chained to itself, i.e. no chaining
+    channel_config_set_irq_quiet(&c, true); // No need for IRQ
+    dma_channel_configure(_channelDMA, &c, !recv ? (uint8_t *)&_dummy : rxbuff, &spi_get_hw(_spi)->dr, bytes, false); // Destination is _dummy or the caller's buffer
+
+    spi_get_hw(_spi)->dmacr = 1 | (1 << 1); // TDMAE | RDMAE
+
+    dma_channel_start(_channelDMA); // Receive channel is started first, then transmit
+    dma_channel_start(_channelSendDMA);
+    return true;
+}
+
+bool SPIClassRP2040::finishedAsync() { // Polls for completion; once done, releases the DMA resources and returns true
+    if (!_initted) {
+        return true;
+    }
+    if (dma_channel_is_busy(_channelDMA) || (spi_get_hw(_spi)->sr & SPI_SSPSR_BSY_BITS)) { // Receive DMA still running or SPI still busy
+        return false;
+    }
+    dma_channel_cleanup(_channelDMA);
+    dma_channel_unclaim(_channelDMA);
+    dma_channel_cleanup(_channelSendDMA);
+    dma_channel_unclaim(_channelSendDMA);
+    spi_get_hw(_spi)->dmacr = 0; // Clear the TDMAE/RDMAE bits set by transferAsync()
+    if (_spis.getBitOrder() != MSBFIRST) {
+        for (int i = 0; _rxFinalBuffer && (i < _dmaBytes); i++) { // Skip when recv was nullptr (send-only), avoiding a null dereference
+            _rxFinalBuffer[i] = reverseByte(_rxFinalBuffer[i]); // Bit-reverse the received bytes in place
+        }
+        free(_dmaBuffer); // nullptr (a no-op) when no reversed send copy was allocated
+        _dmaBuffer = nullptr;
+    }
+    return true;
+}
+
+void SPIClassRP2040::abortAsync() { // Tears down both DMA channels and frees the bit-reversal buffer
+    if (!_initted) {
+        return;
+    }
+    dma_channel_cleanup(_channelDMA); // NOTE(review): assumes a transferAsync() is outstanding so both channels are claimed — confirm
+    dma_channel_unclaim(_channelDMA);
+    dma_channel_cleanup(_channelSendDMA);
+    dma_channel_unclaim(_channelSendDMA);
+    spi_get_hw(_spi)->dmacr = 0; // Clear the TDMAE/RDMAE bits set by transferAsync()
+    free(_dmaBuffer); // nullptr (a no-op) when no reversed send copy was allocated
+    _dmaBuffer = nullptr;
+}
+
+
 bool SPIClassRP2040::setRX(pin_size_t pin) {
     constexpr uint32_t valid[2] = { __bitset({0, 4, 16, 20}) /* SPI0 */,
                                     __bitset({8, 12, 24, 28})  /* SPI1 */
diff --git a/libraries/SPI/src/SPI.h b/libraries/SPI/src/SPI.h
index 924d872..d67741b 100644
--- a/libraries/SPI/src/SPI.h
+++ b/libraries/SPI/src/SPI.h
@@ -39,6 +39,13 @@ public:
     // Sends one buffer and receives into another, much faster! can set rx or txbuf to nullptr
     void transfer(const void *txbuf, void *rxbuf, size_t count) override;
 
+    // DMA/asynchronous transfers.  Do not combine with synchronous runs or bad stuff will happen
+    // All buffers must be valid for entire DMA and not touched until `finishedAsync()` returns true.
+    bool transferAsync(const void *send, void *recv, size_t bytes);
+    bool finishedAsync(); // Call to check if the async operation has completed and the buffer can be reused/read
+    void abortAsync(); // Cancel an outstanding async operation
+
+
     // Call before/after every complete transaction
     void beginTransaction(SPISettings settings) override;
     void endTransaction(void) override;
@@ -51,7 +58,7 @@ public:
     bool setCS(pin_size_t pin);
     bool setSCK(pin_size_t pin);
     bool setTX(pin_size_t pin);
-    bool setMOSI(pin_size_t pin) {
+    inline bool setMOSI(pin_size_t pin) {
         return setTX(pin);
     }
 
@@ -92,6 +99,14 @@ private:
     bool _initted; // Transaction begun
 
     std::map<int, int> _usingIRQs;
+
+    // DMA
+    int _channelDMA; // DMA channel moving data from the SPI data register to the receive buffer
+    int _channelSendDMA; // DMA channel moving data from the send buffer to the SPI data register
+    uint8_t *_dmaBuffer = nullptr; // Heap copy of bit-reversed send data (only when bit order is not MSBFIRST)
+    int _dmaBytes; // Byte count of the current async transfer
+    uint8_t *_rxFinalBuffer; // Caller's recv buffer for the current async transfer (may be nullptr)
+    uint32_t _dummy; // Dummy DMA source/sink used when send or recv is nullptr
 };
 
 extern SPIClassRP2040 SPI;