Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

spi: stm32: use STM32 DMA with STM32 MDMA to enhance DDR use

The STM32 DMA is not able to generate convenient burst
transfers on the DDR, which would ensure the best load of the AXI & DDR.
To avoid this bad load of the AXI & DDR, STM32 MDMA can be used to transfer
data to the DDR, being triggered by STM32 DMA channel transfer
completion.
An SRAM buffer is used between DMA and MDMA. So the MDMA always does
MEM_TO_MEM transfers (from/to SRAM to/from DDR), and the DMA uses SRAM
instead of DDR with DEV_TO_MEM transfers.
SPI RX DMA (DEV_TO_MEM) becomes:
SPI RX FIFO ==DMA==> SRAM ==MDMA==> DDR

In RX (DEV_TO_MEM), EOT interrupt is used to pause the DMA channel (which
will raise a transfer complete) to trigger the MDMA to flush the SRAM (when
transfer length is not aligned on SRAM period).
TX remains on the former implementation.

Signed-off-by: Clément Le Goffic <clement.legoffic@foss.st.com>
Link: https://patch.msgid.link/20250616-spi-upstream-v1-4-7e8593f3f75d@foss.st.com
Signed-off-by: Mark Brown <broonie@kernel.org>

authored by

Clément Le Goffic and committed by
Mark Brown
d17dd2f1 21f1c800

+228 -23
+228 -23
drivers/spi/spi-stm32.c
··· 9 9 #include <linux/debugfs.h> 10 10 #include <linux/clk.h> 11 11 #include <linux/delay.h> 12 + #include <linux/dma-mapping.h> 12 13 #include <linux/dmaengine.h> 14 + #include <linux/genalloc.h> 13 15 #include <linux/interrupt.h> 14 16 #include <linux/iopoll.h> 15 17 #include <linux/module.h> ··· 330 328 * @dma_rx: dma channel for RX transfer 331 329 * @phys_addr: SPI registers physical base address 332 330 * @device_mode: the controller is configured as SPI device 331 + * @sram_pool: SRAM pool for DMA transfers 332 + * @sram_rx_buf_size: size of SRAM buffer for RX transfer 333 + * @sram_rx_buf: SRAM buffer for RX transfer 334 + * @sram_dma_rx_buf: SRAM buffer physical address for RX transfer 335 + * @mdma_rx: MDMA channel for RX transfer 333 336 */ 334 337 struct stm32_spi { 335 338 struct device *dev; ··· 369 362 dma_addr_t phys_addr; 370 363 371 364 bool device_mode; 365 + 366 + struct gen_pool *sram_pool; 367 + size_t sram_rx_buf_size; 368 + void *sram_rx_buf; 369 + dma_addr_t sram_dma_rx_buf; 370 + struct dma_chan *mdma_rx; 372 371 }; 373 372 374 373 static const struct stm32_spi_regspec stm32fx_spi_regspec = { ··· 898 885 899 886 if (spi->cur_usedma && spi->dma_tx) 900 887 dmaengine_terminate_async(spi->dma_tx); 901 - if (spi->cur_usedma && spi->dma_rx) 888 + if (spi->cur_usedma && spi->dma_rx) { 902 889 dmaengine_terminate_async(spi->dma_rx); 890 + if (spi->mdma_rx) 891 + dmaengine_terminate_async(spi->mdma_rx); 892 + } 903 893 904 894 stm32_spi_clr_bits(spi, STM32H7_SPI_CR1, STM32H7_SPI_CR1_SPE); 905 895 ··· 1114 1098 } 1115 1099 1116 1100 if (sr & STM32H7_SPI_SR_EOT) { 1101 + dev_dbg(spi->dev, "End of transfer\n"); 1117 1102 if (!spi->cur_usedma && (spi->rx_buf && (spi->rx_len > 0))) 1118 1103 stm32h7_spi_read_rxfifo(spi); 1119 1104 if (!spi->cur_usedma || 1120 - (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX)) 1105 + (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX) || 1106 + (spi->mdma_rx && (spi->cur_comm == 
SPI_SIMPLEX_RX || 1107 + spi->cur_comm == SPI_FULL_DUPLEX))) 1121 1108 end = true; 1122 1109 } 1123 1110 ··· 1137 1118 spin_unlock_irqrestore(&spi->lock, flags); 1138 1119 1139 1120 if (end) { 1121 + if (spi->cur_usedma && spi->mdma_rx) { 1122 + dmaengine_pause(spi->dma_rx); 1123 + /* Wait for callback */ 1124 + return IRQ_HANDLED; 1125 + } 1140 1126 stm32h7_spi_disable(spi); 1141 1127 spi_finalize_current_transfer(ctrl); 1142 1128 } ··· 1447 1423 /* Enable the interrupts */ 1448 1424 if (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX) 1449 1425 ier |= STM32H7_SPI_IER_EOTIE | STM32H7_SPI_IER_TXTFIE; 1426 + if (spi->mdma_rx && (spi->cur_comm == SPI_SIMPLEX_RX || spi->cur_comm == SPI_FULL_DUPLEX)) 1427 + ier |= STM32H7_SPI_IER_EOTIE; 1450 1428 1451 1429 stm32_spi_set_bits(spi, STM32H7_SPI_IER, ier); 1452 1430 ··· 1456 1430 1457 1431 if (STM32_SPI_HOST_MODE(spi)) 1458 1432 stm32_spi_set_bits(spi, STM32H7_SPI_CR1, STM32H7_SPI_CR1_CSTART); 1433 + } 1434 + 1435 + /** 1436 + * stm32_spi_prepare_rx_dma_mdma_chaining - Prepare RX DMA and MDMA chaining 1437 + * @spi: pointer to the spi controller data structure 1438 + * @xfer: pointer to the spi transfer 1439 + * @rx_dma_conf: pointer to the DMA configuration for RX channel 1440 + * @rx_dma_desc: pointer to the RX DMA descriptor 1441 + * @rx_mdma_desc: pointer to the RX MDMA descriptor 1442 + * 1443 + * It must return 0 if the chaining is possible or an error code if not. 
1444 + */ 1445 + static int stm32_spi_prepare_rx_dma_mdma_chaining(struct stm32_spi *spi, 1446 + struct spi_transfer *xfer, 1447 + struct dma_slave_config *rx_dma_conf, 1448 + struct dma_async_tx_descriptor **rx_dma_desc, 1449 + struct dma_async_tx_descriptor **rx_mdma_desc) 1450 + { 1451 + struct dma_slave_config rx_mdma_conf = {0}; 1452 + u32 sram_period, nents = 0, spi_s_len; 1453 + struct sg_table dma_sgt, mdma_sgt; 1454 + struct scatterlist *spi_s, *s; 1455 + dma_addr_t dma_buf; 1456 + int i, ret; 1457 + 1458 + sram_period = spi->sram_rx_buf_size / 2; 1459 + 1460 + /* Configure MDMA RX channel */ 1461 + rx_mdma_conf.direction = rx_dma_conf->direction; 1462 + rx_mdma_conf.src_addr = spi->sram_dma_rx_buf; 1463 + rx_mdma_conf.peripheral_config = rx_dma_conf->peripheral_config; 1464 + rx_mdma_conf.peripheral_size = rx_dma_conf->peripheral_size; 1465 + dmaengine_slave_config(spi->mdma_rx, &rx_mdma_conf); 1466 + 1467 + /* Count the number of entries needed */ 1468 + for_each_sg(xfer->rx_sg.sgl, spi_s, xfer->rx_sg.nents, i) 1469 + if (sg_dma_len(spi_s) > sram_period) 1470 + nents += DIV_ROUND_UP(sg_dma_len(spi_s), sram_period); 1471 + else 1472 + nents++; 1473 + 1474 + /* Prepare DMA slave_sg DBM transfer DEV_TO_MEM (RX>MEM=SRAM) */ 1475 + ret = sg_alloc_table(&dma_sgt, nents, GFP_ATOMIC); 1476 + if (ret) 1477 + return ret; 1478 + 1479 + spi_s = xfer->rx_sg.sgl; 1480 + spi_s_len = sg_dma_len(spi_s); 1481 + dma_buf = spi->sram_dma_rx_buf; 1482 + for_each_sg(dma_sgt.sgl, s, dma_sgt.nents, i) { 1483 + size_t bytes = min_t(size_t, spi_s_len, sram_period); 1484 + 1485 + sg_dma_len(s) = bytes; 1486 + sg_dma_address(s) = dma_buf; 1487 + spi_s_len -= bytes; 1488 + 1489 + if (!spi_s_len && sg_next(spi_s)) { 1490 + spi_s = sg_next(spi_s); 1491 + spi_s_len = sg_dma_len(spi_s); 1492 + dma_buf = spi->sram_dma_rx_buf; 1493 + } else { /* DMA configured in DBM: it will swap between the SRAM periods */ 1494 + if (i & 1) 1495 + dma_buf += sram_period; 1496 + else 1497 + dma_buf = 
spi->sram_dma_rx_buf; 1498 + } 1499 + } 1500 + 1501 + *rx_dma_desc = dmaengine_prep_slave_sg(spi->dma_rx, dma_sgt.sgl, 1502 + dma_sgt.nents, rx_dma_conf->direction, 1503 + DMA_PREP_INTERRUPT); 1504 + sg_free_table(&dma_sgt); 1505 + 1506 + if (!rx_dma_desc) 1507 + return -EINVAL; 1508 + 1509 + /* Prepare MDMA slave_sg transfer MEM_TO_MEM (SRAM>DDR) */ 1510 + ret = sg_alloc_table(&mdma_sgt, nents, GFP_ATOMIC); 1511 + if (ret) { 1512 + rx_dma_desc = NULL; 1513 + return ret; 1514 + } 1515 + 1516 + spi_s = xfer->rx_sg.sgl; 1517 + spi_s_len = sg_dma_len(spi_s); 1518 + dma_buf = sg_dma_address(spi_s); 1519 + for_each_sg(mdma_sgt.sgl, s, mdma_sgt.nents, i) { 1520 + size_t bytes = min_t(size_t, spi_s_len, sram_period); 1521 + 1522 + sg_dma_len(s) = bytes; 1523 + sg_dma_address(s) = dma_buf; 1524 + spi_s_len -= bytes; 1525 + 1526 + if (!spi_s_len && sg_next(spi_s)) { 1527 + spi_s = sg_next(spi_s); 1528 + spi_s_len = sg_dma_len(spi_s); 1529 + dma_buf = sg_dma_address(spi_s); 1530 + } else { 1531 + dma_buf += bytes; 1532 + } 1533 + } 1534 + 1535 + *rx_mdma_desc = dmaengine_prep_slave_sg(spi->mdma_rx, mdma_sgt.sgl, 1536 + mdma_sgt.nents, rx_mdma_conf.direction, 1537 + DMA_PREP_INTERRUPT); 1538 + sg_free_table(&mdma_sgt); 1539 + 1540 + if (!rx_mdma_desc) { 1541 + rx_dma_desc = NULL; 1542 + return -EINVAL; 1543 + } 1544 + 1545 + return 0; 1459 1546 } 1460 1547 1461 1548 /** ··· 1582 1443 static int stm32_spi_transfer_one_dma(struct stm32_spi *spi, 1583 1444 struct spi_transfer *xfer) 1584 1445 { 1446 + struct dma_async_tx_descriptor *rx_mdma_desc = NULL, *rx_dma_desc = NULL; 1447 + struct dma_async_tx_descriptor *tx_dma_desc = NULL; 1585 1448 struct dma_slave_config tx_dma_conf, rx_dma_conf; 1586 - struct dma_async_tx_descriptor *tx_dma_desc, *rx_dma_desc; 1587 1449 unsigned long flags; 1450 + int ret = 0; 1588 1451 1589 1452 spin_lock_irqsave(&spi->lock, flags); 1590 1453 1591 - rx_dma_desc = NULL; 1592 1454 if (spi->rx_buf && spi->dma_rx) { 1593 1455 stm32_spi_dma_config(spi, 
spi->dma_rx, &rx_dma_conf, DMA_DEV_TO_MEM); 1594 - dmaengine_slave_config(spi->dma_rx, &rx_dma_conf); 1456 + if (spi->mdma_rx) { 1457 + rx_dma_conf.peripheral_size = 1; 1458 + dmaengine_slave_config(spi->dma_rx, &rx_dma_conf); 1595 1459 1596 - /* Enable Rx DMA request */ 1597 - stm32_spi_set_bits(spi, spi->cfg->regs->dma_rx_en.reg, 1598 - spi->cfg->regs->dma_rx_en.mask); 1599 - 1600 - rx_dma_desc = dmaengine_prep_slave_sg( 1601 - spi->dma_rx, xfer->rx_sg.sgl, 1602 - xfer->rx_sg.nents, 1603 - rx_dma_conf.direction, 1604 - DMA_PREP_INTERRUPT); 1460 + ret = stm32_spi_prepare_rx_dma_mdma_chaining(spi, xfer, &rx_dma_conf, 1461 + &rx_dma_desc, &rx_mdma_desc); 1462 + if (ret) { /* RX DMA MDMA chaining not possible, fallback to DMA only */ 1463 + rx_dma_conf.peripheral_config = 0; 1464 + rx_dma_desc = NULL; 1465 + } 1466 + } 1467 + if (!rx_dma_desc) { 1468 + dmaengine_slave_config(spi->dma_rx, &rx_dma_conf); 1469 + rx_dma_desc = dmaengine_prep_slave_sg(spi->dma_rx, xfer->rx_sg.sgl, 1470 + xfer->rx_sg.nents, 1471 + rx_dma_conf.direction, 1472 + DMA_PREP_INTERRUPT); 1473 + } 1605 1474 } 1606 1475 1607 - tx_dma_desc = NULL; 1608 1476 if (spi->tx_buf && spi->dma_tx) { 1609 1477 stm32_spi_dma_config(spi, spi->dma_tx, &tx_dma_conf, DMA_MEM_TO_DEV); 1610 1478 dmaengine_slave_config(spi->dma_tx, &tx_dma_conf); 1611 - 1612 - tx_dma_desc = dmaengine_prep_slave_sg( 1613 - spi->dma_tx, xfer->tx_sg.sgl, 1614 - xfer->tx_sg.nents, 1615 - tx_dma_conf.direction, 1616 - DMA_PREP_INTERRUPT); 1479 + tx_dma_desc = dmaengine_prep_slave_sg(spi->dma_tx, xfer->tx_sg.sgl, 1480 + xfer->tx_sg.nents, 1481 + tx_dma_conf.direction, 1482 + DMA_PREP_INTERRUPT); 1617 1483 } 1618 1484 1619 1485 if ((spi->tx_buf && spi->dma_tx && !tx_dma_desc) || ··· 1629 1485 goto dma_desc_error; 1630 1486 1631 1487 if (rx_dma_desc) { 1632 - rx_dma_desc->callback = spi->cfg->dma_rx_cb; 1633 - rx_dma_desc->callback_param = spi; 1488 + if (rx_mdma_desc) { 1489 + rx_mdma_desc->callback = spi->cfg->dma_rx_cb; 1490 + 
rx_mdma_desc->callback_param = spi; 1491 + } else { 1492 + rx_dma_desc->callback = spi->cfg->dma_rx_cb; 1493 + rx_dma_desc->callback_param = spi; 1494 + } 1634 1495 1496 + /* Enable Rx DMA request */ 1497 + stm32_spi_set_bits(spi, spi->cfg->regs->dma_rx_en.reg, 1498 + spi->cfg->regs->dma_rx_en.mask); 1499 + if (rx_mdma_desc) { 1500 + if (dma_submit_error(dmaengine_submit(rx_mdma_desc))) { 1501 + dev_err(spi->dev, "Rx MDMA submit failed\n"); 1502 + goto dma_desc_error; 1503 + } 1504 + /* Enable Rx MDMA channel */ 1505 + dma_async_issue_pending(spi->mdma_rx); 1506 + } 1635 1507 if (dma_submit_error(dmaengine_submit(rx_dma_desc))) { 1636 1508 dev_err(spi->dev, "Rx DMA submit failed\n"); 1637 1509 goto dma_desc_error; ··· 1682 1522 return 1; 1683 1523 1684 1524 dma_submit_error: 1525 + if (spi->mdma_rx) 1526 + dmaengine_terminate_sync(spi->mdma_rx); 1685 1527 if (spi->dma_rx) 1686 1528 dmaengine_terminate_sync(spi->dma_rx); 1687 1529 ··· 1694 1532 spin_unlock_irqrestore(&spi->lock, flags); 1695 1533 1696 1534 dev_info(spi->dev, "DMA issue: fall back to irq transfer\n"); 1535 + 1536 + if (spi->sram_rx_buf) 1537 + memset(spi->sram_rx_buf, 0, spi->sram_rx_buf_size); 1697 1538 1698 1539 spi->cur_usedma = false; 1699 1540 return spi->cfg->transfer_one_irq(spi); ··· 2056 1891 2057 1892 spi->cfg->disable(spi); 2058 1893 1894 + if (spi->sram_rx_buf) 1895 + memset(spi->sram_rx_buf, 0, spi->sram_rx_buf_size); 1896 + 2059 1897 return 0; 2060 1898 } 2061 1899 ··· 2413 2245 if (spi->dma_tx || spi->dma_rx) 2414 2246 ctrl->can_dma = stm32_spi_can_dma; 2415 2247 2248 + spi->sram_pool = of_gen_pool_get(pdev->dev.of_node, "sram", 0); 2249 + if (spi->sram_pool) { 2250 + spi->sram_rx_buf_size = gen_pool_size(spi->sram_pool); 2251 + dev_info(&pdev->dev, "SRAM pool: %zu KiB for RX DMA/MDMA chaining\n", 2252 + spi->sram_rx_buf_size / 1024); 2253 + spi->sram_rx_buf = gen_pool_dma_zalloc(spi->sram_pool, spi->sram_rx_buf_size, 2254 + &spi->sram_dma_rx_buf); 2255 + if (!spi->sram_rx_buf) { 2256 
+ dev_err(&pdev->dev, "failed to allocate SRAM buffer\n"); 2257 + } else { 2258 + spi->mdma_rx = dma_request_chan(spi->dev, "rxm2m"); 2259 + if (IS_ERR(spi->mdma_rx)) { 2260 + ret = PTR_ERR(spi->mdma_rx); 2261 + spi->mdma_rx = NULL; 2262 + if (ret == -EPROBE_DEFER) { 2263 + goto err_pool_free; 2264 + } else { 2265 + gen_pool_free(spi->sram_pool, 2266 + (unsigned long)spi->sram_rx_buf, 2267 + spi->sram_rx_buf_size); 2268 + dev_warn(&pdev->dev, 2269 + "failed to request rx mdma channel, DMA only\n"); 2270 + } 2271 + } 2272 + } 2273 + } 2274 + 2416 2275 pm_runtime_set_autosuspend_delay(&pdev->dev, 2417 2276 STM32_SPI_AUTOSUSPEND_DELAY); 2418 2277 pm_runtime_use_autosuspend(&pdev->dev); ··· 2467 2272 pm_runtime_put_noidle(&pdev->dev); 2468 2273 pm_runtime_set_suspended(&pdev->dev); 2469 2274 pm_runtime_dont_use_autosuspend(&pdev->dev); 2275 + 2276 + if (spi->mdma_rx) 2277 + dma_release_channel(spi->mdma_rx); 2278 + err_pool_free: 2279 + gen_pool_free(spi->sram_pool, (unsigned long)spi->sram_rx_buf, spi->sram_rx_buf_size); 2470 2280 err_dma_release: 2471 2281 if (spi->dma_tx) 2472 2282 dma_release_channel(spi->dma_tx); ··· 2502 2302 dma_release_channel(ctrl->dma_tx); 2503 2303 if (ctrl->dma_rx) 2504 2304 dma_release_channel(ctrl->dma_rx); 2305 + if (spi->mdma_rx) 2306 + dma_release_channel(spi->mdma_rx); 2307 + if (spi->sram_rx_buf) 2308 + gen_pool_free(spi->sram_pool, (unsigned long)spi->sram_rx_buf, 2309 + spi->sram_rx_buf_size); 2505 2310 2506 2311 clk_disable_unprepare(spi->clk); 2507 2312