Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v5.6-rc7 399 lines 12 kB view raw
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Intel MIC Platform Software Stack (MPSS) 4 * 5 * Copyright(c) 2015 Intel Corporation. 6 * 7 * Intel MIC Coprocessor State Management (COSM) Driver 8 */ 9#include <linux/kthread.h> 10#include <linux/sched/signal.h> 11 12#include "cosm_main.h" 13 14/* 15 * The COSM driver uses SCIF to communicate between the management node and the 16 * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b) 17 * receive a shutdown status back from the card upon completion of shutdown and 18 * (c) receive periodic heartbeat messages from the card used to deduce if the 19 * card has crashed. 20 * 21 * A COSM server consisting of a SCIF listening endpoint waits for incoming 22 * connections from the card. Upon acceptance of the connection, a separate 23 * work-item is scheduled to handle SCIF message processing for that card. The 24 * life-time of this work-item is therefore the time from which the connection 25 * from a card is accepted to the time at which the connection is closed. A new 26 * work-item starts each time the card boots and is alive till the card (a) 27 * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is 28 * unloaded. 29 * 30 * From the point of view of COSM interactions with SCIF during card 31 * shutdown, reset and crash are as follows: 32 * 33 * Card shutdown 34 * ------------- 35 * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN 36 * message from the host. 37 * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting 38 * in scif_remove(..) getting called on the card 39 * 3. scif_remove -> scif_stop -> scif_handle_remove_node -> 40 * scif_peer_unregister_device -> device_unregister for the host peer device 41 * 4. During device_unregister remove(..) method of cosm_client is invoked which 42 * closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT 43 * message being sent to host SCIF. SCIF_DISCNCT message processing on the 44 * host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes 45 * up the host COSM thread blocked in scif_poll(..) resulting in 46 * scif_poll(..) returning EPOLLHUP. 47 * 5. On the card, scif_peer_release_dev is next called which results in an 48 * SCIF_EXIT message being sent to the host and after receiving the 49 * SCIF_EXIT_ACK from the host the peer device teardown on the card is 50 * complete. 51 * 6. As part of the SCIF_EXIT message processing on the host, host sends a 52 * SCIF_REMOVE_NODE to itself corresponding to the card being removed. This 53 * starts a similar SCIF peer device teardown sequence on the host 54 * corresponding to the card being shut down. 55 * 56 * Card reset 57 * ---------- 58 * The case of interest here is when the card has not been previously shut down 59 * since most of the steps below are skipped in that case: 60 61 * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver 62 * which unregisters the SCIF HW device resulting in scif_remove(..) being 63 * called on the host. 64 * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a 65 * SCIF_EXIT message being sent to the card. 66 * 3. The card executes scif_stop() as part of SCIF_EXIT message 67 * processing. This results in the COSM endpoint on the card being closed and 68 * the SCIF host peer device on the card getting unregistered similar to 69 * steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the 70 * host returns EPOLLHUP as a result. 71 * 4. On the host, card peer device unregister and SCIF HW remove(..) also 72 * subsequently complete. 73 * 74 * Card crash 75 * ---------- 76 * If a reset is issued after the card has crashed, there is no SCIF_DISCNT 77 * message from the card which would result in scif_poll(..) returning 78 * EPOLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE 79 * message to itself resulting in the card SCIF peer device being unregistered, 80 * this results in a scif_peer_release_dev -> scif_cleanup_scifdev-> 81 * scif_invalidate_ep call sequence which sets the endpoint state to 82 * DISCONNECTED and results in scif_poll(..) returning EPOLLHUP. 83 */ 84 85#define COSM_SCIF_BACKLOG 16 86#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10 87#define COSM_HEARTBEAT_TIMEOUT_SEC \ 88 (COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC) 89#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC) 90 91static struct task_struct *server_thread; 92static scif_epd_t listen_epd; 93 94/* Publish MIC card's shutdown status to user space MIC daemon */ 95static void cosm_update_mic_status(struct cosm_device *cdev) 96{ 97 if (cdev->shutdown_status_int != MIC_NOP) { 98 cosm_set_shutdown_status(cdev, cdev->shutdown_status_int); 99 cdev->shutdown_status_int = MIC_NOP; 100 } 101} 102 103/* Store MIC card's shutdown status internally when it is received */ 104static void cosm_shutdown_status_int(struct cosm_device *cdev, 105 enum mic_status shutdown_status) 106{ 107 switch (shutdown_status) { 108 case MIC_HALTED: 109 case MIC_POWER_OFF: 110 case MIC_RESTART: 111 case MIC_CRASHED: 112 break; 113 default: 114 dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n", 115 __func__, __LINE__, shutdown_status); 116 return; 117 }; 118 cdev->shutdown_status_int = shutdown_status; 119 cdev->heartbeat_watchdog_enable = false; 120 121 if (cdev->state != MIC_SHUTTING_DOWN) 122 cosm_set_state(cdev, MIC_SHUTTING_DOWN); 123} 124 125/* Non-blocking recv. Read and process all available messages */ 126static void cosm_scif_recv(struct cosm_device *cdev) 127{ 128 struct cosm_msg msg; 129 int rc; 130 131 while (1) { 132 rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0); 133 if (!rc) { 134 break; 135 } else if (rc < 0) { 136 dev_dbg(&cdev->dev, "%s: %d rc %d\n", 137 __func__, __LINE__, rc); 138 break; 139 } 140 dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n", 141 __func__, __LINE__, rc, msg.id); 142 143 switch (msg.id) { 144 case COSM_MSG_SHUTDOWN_STATUS: 145 cosm_shutdown_status_int(cdev, msg.shutdown_status); 146 break; 147 case COSM_MSG_HEARTBEAT: 148 /* Nothing to do, heartbeat only unblocks scif_poll */ 149 break; 150 default: 151 dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n", 152 __func__, __LINE__, msg.id); 153 break; 154 } 155 } 156} 157 158/* Publish crashed status for this MIC card */ 159static void cosm_set_crashed(struct cosm_device *cdev) 160{ 161 dev_err(&cdev->dev, "node alive timeout\n"); 162 cosm_shutdown_status_int(cdev, MIC_CRASHED); 163 cosm_update_mic_status(cdev); 164} 165 166/* Send host time to the MIC card to sync system time between host and MIC */ 167static void cosm_send_time(struct cosm_device *cdev) 168{ 169 struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME }; 170 struct timespec64 ts; 171 int rc; 172 173 ktime_get_real_ts64(&ts); 174 msg.timespec.tv_sec = ts.tv_sec; 175 msg.timespec.tv_nsec = ts.tv_nsec; 176 177 rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); 178 if (rc < 0) 179 dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n", 180 __func__, __LINE__, rc); 181} 182 183/* 184 * Close this cosm_device's endpoint after its peer endpoint on the card has 185 * been closed. In all cases except MIC card crash EPOLLHUP on the host is 186 * triggered by the client's endpoint being closed. 187 */ 188static void cosm_scif_close(struct cosm_device *cdev) 189{ 190 /* 191 * Because SHUTDOWN_STATUS message is sent by the MIC cards in the 192 * reboot notifier when shutdown is still not complete, we notify mpssd 193 * to reset the card when SCIF endpoint is closed. 194 */ 195 cosm_update_mic_status(cdev); 196 scif_close(cdev->epd); 197 cdev->epd = NULL; 198 dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__); 199} 200 201/* 202 * Set card state to ONLINE when a new SCIF connection from a MIC card is 203 * received. Normally the state is BOOTING when the connection comes in, but can 204 * be ONLINE if cosm_client driver on the card was unloaded and then reloaded. 205 */ 206static int cosm_set_online(struct cosm_device *cdev) 207{ 208 int rc = 0; 209 210 if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) { 211 cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable; 212 cdev->epd = cdev->newepd; 213 if (cdev->state == MIC_BOOTING) 214 cosm_set_state(cdev, MIC_ONLINE); 215 cosm_send_time(cdev); 216 dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__); 217 } else { 218 dev_warn(&cdev->dev, "%s %d not going online in state: %s\n", 219 __func__, __LINE__, cosm_state_string[cdev->state]); 220 rc = -EINVAL; 221 } 222 /* Drop reference acquired by bus_find_device in the server thread */ 223 put_device(&cdev->dev); 224 return rc; 225} 226 227/* 228 * Work function for handling work for a SCIF connection from a particular MIC 229 * card. It first sets the card state to ONLINE and then calls scif_poll to 230 * block on activity such as incoming messages on the SCIF endpoint. When the 231 * endpoint is closed, the work function exits, completing its life cycle, from 232 * MIC card boot to card shutdown/reset/crash. 233 */ 234void cosm_scif_work(struct work_struct *work) 235{ 236 struct cosm_device *cdev = container_of(work, struct cosm_device, 237 scif_work); 238 struct scif_pollepd pollepd; 239 int rc; 240 241 mutex_lock(&cdev->cosm_mutex); 242 if (cosm_set_online(cdev)) 243 goto exit; 244 245 while (1) { 246 pollepd.epd = cdev->epd; 247 pollepd.events = EPOLLIN; 248 249 /* Drop the mutex before blocking in scif_poll(..) */ 250 mutex_unlock(&cdev->cosm_mutex); 251 /* poll(..) with timeout on our endpoint */ 252 rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC); 253 mutex_lock(&cdev->cosm_mutex); 254 if (rc < 0) { 255 dev_err(&cdev->dev, "%s %d scif_poll rc %d\n", 256 __func__, __LINE__, rc); 257 continue; 258 } 259 260 /* There is a message from the card */ 261 if (pollepd.revents & EPOLLIN) 262 cosm_scif_recv(cdev); 263 264 /* The peer endpoint is closed or this endpoint disconnected */ 265 if (pollepd.revents & EPOLLHUP) { 266 cosm_scif_close(cdev); 267 break; 268 } 269 270 /* Did we timeout from poll? */ 271 if (!rc && cdev->heartbeat_watchdog_enable) 272 cosm_set_crashed(cdev); 273 } 274exit: 275 dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__); 276 mutex_unlock(&cdev->cosm_mutex); 277} 278 279/* 280 * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC 281 * cards, finds the correct cosm_device to associate that connection with and 282 * schedules individual work items for each MIC card. 283 */ 284static int cosm_scif_server(void *unused) 285{ 286 struct cosm_device *cdev; 287 scif_epd_t newepd; 288 struct scif_port_id port_id; 289 int rc; 290 291 allow_signal(SIGKILL); 292 293 while (!kthread_should_stop()) { 294 rc = scif_accept(listen_epd, &port_id, &newepd, 295 SCIF_ACCEPT_SYNC); 296 if (rc < 0) { 297 if (-ERESTARTSYS != rc) 298 pr_err("%s %d rc %d\n", __func__, __LINE__, rc); 299 continue; 300 } 301 302 /* 303 * Associate the incoming connection with a particular 304 * cosm_device, COSM device ID == SCIF node ID - 1 305 */ 306 cdev = cosm_find_cdev_by_id(port_id.node - 1); 307 if (!cdev) 308 continue; 309 cdev->newepd = newepd; 310 schedule_work(&cdev->scif_work); 311 } 312 313 pr_debug("%s %d Server thread stopped\n", __func__, __LINE__); 314 return 0; 315} 316 317static int cosm_scif_listen(void) 318{ 319 int rc; 320 321 listen_epd = scif_open(); 322 if (!listen_epd) { 323 pr_err("%s %d scif_open failed\n", __func__, __LINE__); 324 return -ENOMEM; 325 } 326 327 rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT); 328 if (rc < 0) { 329 pr_err("%s %d scif_bind failed rc %d\n", 330 __func__, __LINE__, rc); 331 goto err; 332 } 333 334 rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG); 335 if (rc < 0) { 336 pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc); 337 goto err; 338 } 339 pr_debug("%s %d listen_epd set up\n", __func__, __LINE__); 340 return 0; 341err: 342 scif_close(listen_epd); 343 listen_epd = NULL; 344 return rc; 345} 346 347static void cosm_scif_listen_exit(void) 348{ 349 pr_debug("%s %d closing listen_epd\n", __func__, __LINE__); 350 if (listen_epd) { 351 scif_close(listen_epd); 352 listen_epd = NULL; 353 } 354} 355 356/* 357 * Create a listening SCIF endpoint and a server kthread which accepts incoming 358 * SCIF connections from MIC cards 359 */ 360int cosm_scif_init(void) 361{ 362 int rc = cosm_scif_listen(); 363 364 if (rc) { 365 pr_err("%s %d cosm_scif_listen rc %d\n", 366 __func__, __LINE__, rc); 367 goto err; 368 } 369 370 server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server"); 371 if (IS_ERR(server_thread)) { 372 rc = PTR_ERR(server_thread); 373 pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc); 374 goto listen_exit; 375 } 376 return 0; 377listen_exit: 378 cosm_scif_listen_exit(); 379err: 380 return rc; 381} 382 383/* Stop the running server thread and close the listening SCIF endpoint */ 384void cosm_scif_exit(void) 385{ 386 int rc; 387 388 if (!IS_ERR_OR_NULL(server_thread)) { 389 rc = send_sig(SIGKILL, server_thread, 0); 390 if (rc) { 391 pr_err("%s %d send_sig rc %d\n", 392 __func__, __LINE__, rc); 393 return; 394 } 395 kthread_stop(server_thread); 396 } 397 398 cosm_scif_listen_exit(); 399}