Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

accel/qaic: Support for 0 resize slice execution in BO

Add support to partially execute a slice which is resized to zero.
Executing a zero size slice in a BO should mean that there are no DMA
transfers involved, but the doorbell and semaphores should still be configured.

For example consider a BO of size 18K and it is sliced into 3 6K slices
and user calls partial execute ioctl with resize as 10K.
slice 0 - size is 6k and offset is 0, so resize of 10K will not cut short
this slice hence we send the entire slice for execution.
slice 1 - size is 6k and offset is 6k, so resize of 10K will cut short this
          slice and only the first 4k should be DMA transferred, along with
          configuring the doorbell and semaphores.
slice 2 - size is 6k and offset is 12k, so resize of 10k will cut short
          this slice and no DMA transfer would be involved, but we should
          still configure the doorbell and semaphores.

This change requires changing the behavior of 0 resize. Currently, a 0 resize
partial execute ioctl behaves exactly like the execute ioctl, i.e. no resize.
After this patch, all the slices in the BO behave exactly like slice 2 in the
above example.

Refactor copy_partial_exec_reqs() to make it more readable and less
complex.

Signed-off-by: Pranjal Ramajor Asha Kanojiya <quic_pkanojiy@quicinc.com>
Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231027164330.11978-1-quic_jhugo@quicinc.com

authored by

Pranjal Ramajor Asha Kanojiya and committed by
Jeffrey Hugo
3b511278 44793c6a

+46 -63
+43 -61
drivers/accel/qaic/qaic_data.c
··· 51 51 }) 52 52 #define NUM_EVENTS 128 53 53 #define NUM_DELAYS 10 54 + #define fifo_at(base, offset) ((base) + (offset) * get_dbc_req_elem_size()) 54 55 55 56 static unsigned int wait_exec_default_timeout_ms = 5000; /* 5 sec default */ 56 57 module_param(wait_exec_default_timeout_ms, uint, 0600); ··· 1059 1058 return ret; 1060 1059 } 1061 1060 1061 + static inline u32 fifo_space_avail(u32 head, u32 tail, u32 q_size) 1062 + { 1063 + u32 avail = head - tail - 1; 1064 + 1065 + if (head <= tail) 1066 + avail += q_size; 1067 + 1068 + return avail; 1069 + } 1070 + 1062 1071 static inline int copy_exec_reqs(struct qaic_device *qdev, struct bo_slice *slice, u32 dbc_id, 1063 1072 u32 head, u32 *ptail) 1064 1073 { ··· 1077 1066 u32 tail = *ptail; 1078 1067 u32 avail; 1079 1068 1080 - avail = head - tail; 1081 - if (head <= tail) 1082 - avail += dbc->nelem; 1083 - 1084 - --avail; 1085 - 1069 + avail = fifo_space_avail(head, tail, dbc->nelem); 1086 1070 if (avail < slice->nents) 1087 1071 return -EAGAIN; 1088 1072 1089 1073 if (tail + slice->nents > dbc->nelem) { 1090 1074 avail = dbc->nelem - tail; 1091 1075 avail = min_t(u32, avail, slice->nents); 1092 - memcpy(dbc->req_q_base + tail * get_dbc_req_elem_size(), reqs, 1093 - sizeof(*reqs) * avail); 1076 + memcpy(fifo_at(dbc->req_q_base, tail), reqs, sizeof(*reqs) * avail); 1094 1077 reqs += avail; 1095 1078 avail = slice->nents - avail; 1096 1079 if (avail) 1097 1080 memcpy(dbc->req_q_base, reqs, sizeof(*reqs) * avail); 1098 1081 } else { 1099 - memcpy(dbc->req_q_base + tail * get_dbc_req_elem_size(), reqs, 1100 - sizeof(*reqs) * slice->nents); 1082 + memcpy(fifo_at(dbc->req_q_base, tail), reqs, sizeof(*reqs) * slice->nents); 1101 1083 } 1102 1084 1103 1085 *ptail = (tail + slice->nents) % dbc->nelem; ··· 1098 1094 return 0; 1099 1095 } 1100 1096 1101 - /* 1102 - * Based on the value of resize we may only need to transmit first_n 1103 - * entries and the last entry, with last_bytes to send from the last entry. 
1104 - * Note that first_n could be 0. 1105 - */ 1106 1097 static inline int copy_partial_exec_reqs(struct qaic_device *qdev, struct bo_slice *slice, 1107 - u64 resize, u32 dbc_id, u32 head, u32 *ptail) 1098 + u64 resize, struct dma_bridge_chan *dbc, u32 head, 1099 + u32 *ptail) 1108 1100 { 1109 - struct dma_bridge_chan *dbc = &qdev->dbc[dbc_id]; 1110 1101 struct dbc_req *reqs = slice->reqs; 1111 1102 struct dbc_req *last_req; 1112 1103 u32 tail = *ptail; 1113 - u64 total_bytes; 1114 1104 u64 last_bytes; 1115 1105 u32 first_n; 1116 1106 u32 avail; 1117 - int ret; 1118 - int i; 1119 1107 1120 - avail = head - tail; 1121 - if (head <= tail) 1122 - avail += dbc->nelem; 1108 + avail = fifo_space_avail(head, tail, dbc->nelem); 1123 1109 1124 - --avail; 1125 - 1126 - total_bytes = 0; 1127 - for (i = 0; i < slice->nents; i++) { 1128 - total_bytes += le32_to_cpu(reqs[i].len); 1129 - if (total_bytes >= resize) 1110 + /* 1111 + * After this for loop is complete, first_n represents the index 1112 + * of the last DMA request of this slice that needs to be 1113 + * transferred after resizing and last_bytes represents DMA size 1114 + * of that request. 1115 + */ 1116 + last_bytes = resize; 1117 + for (first_n = 0; first_n < slice->nents; first_n++) 1118 + if (last_bytes > le32_to_cpu(reqs[first_n].len)) 1119 + last_bytes -= le32_to_cpu(reqs[first_n].len); 1120 + else 1130 1121 break; 1131 - } 1132 - 1133 - if (total_bytes < resize) { 1134 - /* User space should have used the full buffer path. */ 1135 - ret = -EINVAL; 1136 - return ret; 1137 - } 1138 - 1139 - first_n = i; 1140 - last_bytes = i ? 
resize + le32_to_cpu(reqs[i].len) - total_bytes : resize; 1141 1122 1142 1123 if (avail < (first_n + 1)) 1143 1124 return -EAGAIN; ··· 1131 1142 if (tail + first_n > dbc->nelem) { 1132 1143 avail = dbc->nelem - tail; 1133 1144 avail = min_t(u32, avail, first_n); 1134 - memcpy(dbc->req_q_base + tail * get_dbc_req_elem_size(), reqs, 1135 - sizeof(*reqs) * avail); 1145 + memcpy(fifo_at(dbc->req_q_base, tail), reqs, sizeof(*reqs) * avail); 1136 1146 last_req = reqs + avail; 1137 1147 avail = first_n - avail; 1138 1148 if (avail) 1139 1149 memcpy(dbc->req_q_base, last_req, sizeof(*reqs) * avail); 1140 1150 } else { 1141 - memcpy(dbc->req_q_base + tail * get_dbc_req_elem_size(), reqs, 1142 - sizeof(*reqs) * first_n); 1151 + memcpy(fifo_at(dbc->req_q_base, tail), reqs, sizeof(*reqs) * first_n); 1143 1152 } 1144 1153 } 1145 1154 1146 - /* Copy over the last entry. Here we need to adjust len to the left over 1155 + /* 1156 + * Copy over the last entry. Here we need to adjust len to the left over 1147 1157 * size, and set src and dst to the entry it is copied to. 
1148 1158 */ 1149 - last_req = dbc->req_q_base + (tail + first_n) % dbc->nelem * get_dbc_req_elem_size(); 1159 + last_req = fifo_at(dbc->req_q_base, (tail + first_n) % dbc->nelem); 1150 1160 memcpy(last_req, reqs + slice->nents - 1, sizeof(*reqs)); 1151 1161 1152 1162 /* ··· 1156 1168 last_req->len = cpu_to_le32((u32)last_bytes); 1157 1169 last_req->src_addr = reqs[first_n].src_addr; 1158 1170 last_req->dest_addr = reqs[first_n].dest_addr; 1171 + if (!last_bytes) 1172 + /* Disable DMA transfer */ 1173 + last_req->cmd = GENMASK(7, 2) & reqs[first_n].cmd; 1159 1174 1160 1175 *ptail = (tail + first_n + 1) % dbc->nelem; 1161 1176 ··· 1218 1227 bo->req_id = dbc->next_req_id++; 1219 1228 1220 1229 list_for_each_entry(slice, &bo->slices, slice) { 1221 - /* 1222 - * If this slice does not fall under the given 1223 - * resize then skip this slice and continue the loop 1224 - */ 1225 - if (is_partial && pexec[i].resize && pexec[i].resize <= slice->offset) 1226 - continue; 1227 - 1228 1230 for (j = 0; j < slice->nents; j++) 1229 1231 slice->reqs[j].req_id = cpu_to_le16(bo->req_id); 1230 1232 1231 - /* 1232 - * If it is a partial execute ioctl call then check if 1233 - * resize has cut this slice short then do a partial copy 1234 - * else do complete copy 1235 - */ 1236 - if (is_partial && pexec[i].resize && 1237 - pexec[i].resize < slice->offset + slice->size) 1233 + if (is_partial && (!pexec[i].resize || pexec[i].resize <= slice->offset)) 1234 + /* Configure the slice for no DMA transfer */ 1235 + ret = copy_partial_exec_reqs(qdev, slice, 0, dbc, head, tail); 1236 + else if (is_partial && pexec[i].resize < slice->offset + slice->size) 1237 + /* Configure the slice to be partially DMA transferred */ 1238 1238 ret = copy_partial_exec_reqs(qdev, slice, 1239 - pexec[i].resize - slice->offset, 1240 - dbc->id, head, tail); 1239 + pexec[i].resize - slice->offset, dbc, 1240 + head, tail); 1241 1241 else 1242 1242 ret = copy_exec_reqs(qdev, slice, dbc->id, head, tail); 1243 1243 if 
(ret) {
+3 -2
include/uapi/drm/qaic_accel.h
··· 287 287 * struct qaic_partial_execute_entry - Defines a BO to resize and submit. 288 288 * @handle: In. GEM handle of the BO to commit to the device. 289 289 * @dir: In. Direction of data. 1 = to device, 2 = from device. 290 - * @resize: In. New size of the BO. Must be <= the original BO size. 0 is 291 - * short for no resize. 290 + * @resize: In. New size of the BO. Must be <= the original BO size. 291 + * @resize as 0 would be interpreted as no DMA transfer is 292 + * involved. 292 293 */ 293 294 struct qaic_partial_execute_entry { 294 295 __u32 handle;