Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

remoteproc: add rproc_report_crash function to notify rproc crashes

Allow low-level remoteproc drivers to report rproc crashes by exporting
a new rproc_report_crash() function (invoking this from non-rproc drivers
is probably wrong, and should be carefully scrutinized if ever needed).

rproc_report_crash() can be called from any context; it offloads the
tasks of handling the crash to a separate thread.

Handling the crash from a separate thread is helpful because:
- Ability to invoke rproc_report_crash() from atomic context, due to
the fact that many crashes trigger an interrupt, so this function can be
called directly from ISR context.
- Avoiding deadlocks which could happen if rproc_report_crash() is called
from a function which indirectly holds the rproc lock.

Handling the crash might involve:
- Remoteproc register dump
- Remoteproc stack dump
- Remoteproc core dump
- Saving Remoteproc traces so they can be read after the crash
- Resetting the remoteproc in order to make it functional again (hard recovery)

Right now, we only print the crash type which was detected, and only the
mmufault type is supported. Remoteproc low-level drivers can add more types
when needed.

Signed-off-by: Fernando Guzman Lugo <fernando.lugo@ti.com>
[ohad: some commentary, white space and commit log changes]
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>

authored by

Fernando Guzman Lugo and committed by
Ohad Ben-Cohen
8afd519c a1a7e0a3

+100 -4
+7
Documentation/remoteproc.txt
··· 129 129 130 130 Returns 0 on success and -EINVAL if @rproc isn't valid. 131 131 132 + void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type) 133 + - Report a crash in a remoteproc 134 + This function must be called every time a crash is detected by the 135 + platform specific rproc implementation. This should not be called from a 136 + non-remoteproc driver. This function can be called from atomic/interrupt 137 + context. 138 + 132 139 5. Implementation callbacks 133 140 134 141 These callbacks should be provided by platform-specific remoteproc
+75 -4
drivers/remoteproc/remoteproc_core.c
··· 50 50 /* Unique indices for remoteproc devices */ 51 51 static DEFINE_IDA(rproc_dev_index); 52 52 53 + static const char * const rproc_crash_names[] = { 54 + [RPROC_MMUFAULT] = "mmufault", 55 + }; 56 + 57 + /* translate rproc_crash_type to string */ 58 + static const char *rproc_crash_to_string(enum rproc_crash_type type) 59 + { 60 + if (type < ARRAY_SIZE(rproc_crash_names)) 61 + return rproc_crash_names[type]; 62 + return "unkown"; 63 + } 64 + 53 65 /* 54 66 * This is the IOMMU fault handler we register with the IOMMU API 55 67 * (when relevant; not all remote processors access memory through ··· 69 57 * 70 58 * IOMMU core will invoke this handler whenever the remote processor 71 59 * will try to access an unmapped device address. 72 - * 73 - * Currently this is mostly a stub, but it will be later used to trigger 74 - * the recovery of the remote processor. 75 60 */ 76 61 static int rproc_iommu_fault(struct iommu_domain *domain, struct device *dev, 77 62 unsigned long iova, int flags, void *token) 78 63 { 64 + struct rproc *rproc = token; 65 + 79 66 dev_err(dev, "iommu fault: da 0x%lx flags 0x%x\n", iova, flags); 67 + 68 + rproc_report_crash(rproc, RPROC_MMUFAULT); 80 69 81 70 /* 82 71 * Let the iommu core know we're not really handling this fault; 83 - * we just plan to use this as a recovery trigger. 72 + * we just used it as a recovery trigger. 84 73 */ 85 74 return -ENOSYS; 86 75 } ··· 885 872 } 886 873 887 874 /** 875 + * rproc_crash_handler_work() - handle a crash 876 + * 877 + * This function needs to handle everything related to a crash, like cpu 878 + * registers and stack dump, information to help to debug the fatal error, etc. 
879 + */ 880 + static void rproc_crash_handler_work(struct work_struct *work) 881 + { 882 + struct rproc *rproc = container_of(work, struct rproc, crash_handler); 883 + struct device *dev = &rproc->dev; 884 + 885 + dev_dbg(dev, "enter %s\n", __func__); 886 + 887 + mutex_lock(&rproc->lock); 888 + 889 + if (rproc->state == RPROC_CRASHED || rproc->state == RPROC_OFFLINE) { 890 + /* handle only the first crash detected */ 891 + mutex_unlock(&rproc->lock); 892 + return; 893 + } 894 + 895 + rproc->state = RPROC_CRASHED; 896 + dev_err(dev, "handling crash #%u in %s\n", ++rproc->crash_cnt, 897 + rproc->name); 898 + 899 + mutex_unlock(&rproc->lock); 900 + 901 + /* TODO: handle crash */ 902 + } 903 + 904 + /** 888 905 * rproc_boot() - boot a remote processor 889 906 * @rproc: handle of a remote processor 890 907 * ··· 1208 1165 INIT_LIST_HEAD(&rproc->traces); 1209 1166 INIT_LIST_HEAD(&rproc->rvdevs); 1210 1167 1168 + INIT_WORK(&rproc->crash_handler, rproc_crash_handler_work); 1169 + 1211 1170 rproc->state = RPROC_OFFLINE; 1212 1171 1213 1172 return rproc; ··· 1265 1220 return 0; 1266 1221 } 1267 1222 EXPORT_SYMBOL(rproc_del); 1223 + 1224 + /** 1225 + * rproc_report_crash() - rproc crash reporter function 1226 + * @rproc: remote processor 1227 + * @type: crash type 1228 + * 1229 + * This function must be called every time a crash is detected by the low-level 1230 + * drivers implementing a specific remoteproc. This should not be called from a 1231 + * non-remoteproc driver. 1232 + * 1233 + * This function can be called from atomic/interrupt context. 
1234 + */ 1235 + void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type) 1236 + { 1237 + if (!rproc) { 1238 + pr_err("NULL rproc pointer\n"); 1239 + return; 1240 + } 1241 + 1242 + dev_err(&rproc->dev, "crash detected in %s: type %s\n", 1243 + rproc->name, rproc_crash_to_string(type)); 1244 + 1245 + /* create a new task to handle the error */ 1246 + schedule_work(&rproc->crash_handler); 1247 + } 1248 + EXPORT_SYMBOL(rproc_report_crash); 1268 1249 1269 1250 static int __init remoteproc_init(void) 1270 1251 {
+18
include/linux/remoteproc.h
··· 361 361 }; 362 362 363 363 /** 364 + * enum rproc_crash_type - remote processor crash types 365 + * @RPROC_MMUFAULT: iommu fault 366 + * 367 + * Each element of the enum is used as an array index. So that, the value of 368 + * the elements should be always something sane. 369 + * 370 + * Feel free to add more types when needed. 371 + */ 372 + enum rproc_crash_type { 373 + RPROC_MMUFAULT, 374 + }; 375 + 376 + /** 364 377 * struct rproc - represents a physical remote processor device 365 378 * @node: klist node of this rproc object 366 379 * @domain: iommu domain ··· 396 383 * @rvdevs: list of remote virtio devices 397 384 * @notifyids: idr for dynamically assigning rproc-wide unique notify ids 398 385 * @index: index of this rproc device 386 + * @crash_handler: workqueue for handling a crash 387 + * @crash_cnt: crash counter 399 388 */ 400 389 struct rproc { 401 390 struct klist_node node; ··· 421 406 struct list_head rvdevs; 422 407 struct idr notifyids; 423 408 int index; 409 + struct work_struct crash_handler; 410 + unsigned crash_cnt; 424 411 }; 425 412 426 413 /* we currently support only two vrings per rvdev */ ··· 477 460 478 461 int rproc_boot(struct rproc *rproc); 479 462 void rproc_shutdown(struct rproc *rproc); 463 + void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type); 480 464 481 465 static inline struct rproc_vdev *vdev_to_rvdev(struct virtio_device *vdev) 482 466 {