sme_helper.c (43633B)
/*
 * ARM SME Operations
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "tcg/tcg-gvec-desc.h"
#include "exec/helper-proto.h"
#include "exec/cpu_ldst.h"
#include "exec/exec-all.h"
#include "qemu/int128.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"

/* ResetSVEState */
void arm_reset_sve_state(CPUARMState *env)
{
    memset(env->vfp.zregs, 0, sizeof(env->vfp.zregs));
    /* Recall that FFR is stored as pregs[16]. */
    memset(env->vfp.pregs, 0, sizeof(env->vfp.pregs));
    vfp_set_fpcr(env, 0x0800009f);
}

void helper_set_pstate_sm(CPUARMState *env, uint32_t i)
{
    if (i == FIELD_EX64(env->svcr, SVCR, SM)) {
        return;
    }
    env->svcr ^= R_SVCR_SM_MASK;
    arm_reset_sve_state(env);
}

void helper_set_pstate_za(CPUARMState *env, uint32_t i)
{
    if (i == FIELD_EX64(env->svcr, SVCR, ZA)) {
        return;
    }
    env->svcr ^= R_SVCR_ZA_MASK;

    /*
     * ResetSMEState.
     *
     * SetPSTATE_ZA zeros on enable and disable.  We can zero this only
     * on enable: while disabled, the storage is inaccessible and the
     * value does not matter.  We're not saving the storage in vmstate
     * when disabled either.
     */
    if (i) {
        memset(env->zarray, 0, sizeof(env->zarray));
    }
}

void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
{
    uint32_t i;

    /*
     * Special case clearing the entire ZA space.
     * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
     * parts of the ZA storage outside of SVL.
     */
    if (imm == 0xff) {
        memset(env->zarray, 0, sizeof(env->zarray));
        return;
    }

    /*
     * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
     * so each row is discontiguous within ZA[].
     */
    for (i = 0; i < svl; i++) {
        if (imm & (1 << (i % 8))) {
            memset(&env->zarray[i], 0, svl);
        }
    }
}


/*
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T. This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0. The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))

/*
 * When doing byte arithmetic on the ZA storage, the element
 * byteoff bytes away in a tile vertical slice is always this
 * many bytes away in the ZA storage, regardless of the
 * size of the tile element, assuming that byteoff is a multiple
 * of the element size. Again this is because of the interleaving
 * of the tiles. For instance if we have 1 byte per element then
 * each row of the ZA storage has one byte of the vslice data,
 * and (counting from 0) byte 8 goes in row 8 of the storage
 * at offset (8 * row-size-in-bytes).
 * If we have 8 bytes per element then each row of the ZA storage
 * has 8 bytes of the data, but there are 8 interleaved tiles and
 * so byte 8 of the data goes into row 1 of the tile,
 * which is again row 8 of the storage, so the offset is still
 * (8 * row-size-in-bytes). Similarly for other element sizes.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
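
/*
 * Worked example (illustrative): treat ZA as a uint64_t[] and look at a
 * vertical slice of a .D tile.  Element 1 of the slice lives 8 ZA rows
 * below element 0, i.e. 8 * sizeof(ARMVectorReg) bytes further on;
 * dividing by the 8-byte element size gives an index distance of
 * sizeof(ARMVectorReg), which is exactly tile_vslice_index(1).  The same
 * cancellation happens for every element size, which is why neither
 * macro above needs to know sizeof(T).
 */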

/*
 * Move Zreg vector to ZArray column.
 */
#define DO_MOVA_C(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)

void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pg = vg;
    uint64_t *n = vn;
    uint64_t *a = za;

    for (i = 0; i < oprsz; i++) {
        if (pg[H1(i)] & 1) {
            a[tile_vslice_index(i)] = n[i];
        }
    }
}

void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 16;
    uint16_t *pg = vg;
    Int128 *n = vn;
    Int128 *a = za;

    /*
     * Int128 is used here simply to copy 16 bytes, and to simplify
     * the address arithmetic.
     */
    for (i = 0; i < oprsz; i++) {
        if (pg[H2(i)] & 1) {
            a[tile_vslice_index(i)] = n[i];
        }
    }
}

#undef DO_MOVA_C

/*
 * Move ZArray column to Zreg vector.
 */
#define DO_MOVA_Z(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)

void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pg = vg;
    uint64_t *d = vd;
    uint64_t *a = za;

    for (i = 0; i < oprsz; i++) {
        if (pg[H1(i)] & 1) {
            d[i] = a[tile_vslice_index(i)];
        }
    }
}

void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 16;
    uint16_t *pg = vg;
    Int128 *d = vd;
    Int128 *a = za;

    /*
     * Int128 is used here simply to copy 16 bytes, and to simplify
     * the address arithmetic.
     */
    for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
        if (pg[H2(i)] & 1) {
            d[i] = a[tile_vslice_index(i)];
        }
    }
}

#undef DO_MOVA_Z
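
/*
 * Note on the predicate walk in the DO_MOVA_C/DO_MOVA_Z expansions above
 * (illustrative): the governing predicate has one bit per byte of the
 * vector, so the uint16_t loaded from vg covers a 16-byte chunk.  The
 * inner do/while advances i in sizeof(TYPE) steps and shifts pg by the
 * same amount, so only the low bit of each element's predicate field is
 * tested, e.g. for 4-byte elements bits 0, 4, 8 and 12 of the chunk.
 */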

/*
 * Clear elements in a tile slice comprising len bytes.
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);

static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    memset(ptr + off, 0, len);
}

static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 2) {
        *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 4) {
        *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 8) {
        *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memset(vptr + tile_vslice_offset(i + off), 0, 16);
    }
}

/*
 * Copy elements from an array into a tile slice comprising len bytes.
 */

typedef void CopyFn(void *dst, const void *src, size_t len);

static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}

static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *src = vsrc;
    uint8_t *dst = vdst;
    size_t i;

    for (i = 0; i < len; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *src = vsrc;
    uint16_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 2; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *src = vsrc;
    uint32_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 4; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *src = vsrc;
    uint64_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 8; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
    }
}

/*
 * Host and TLB primitives for vertical tile slice addressing.
 */

#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = HOST(host);                                                  \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}

#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    HOST(host, val);                                                        \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
}
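
/*
 * Illustrative note: DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra) below
 * expands to sme_ld1b_v_host() and sme_ld1b_v_tlb().  These mirror the
 * SVE contiguous-load primitives, but the destination offset is passed
 * through tile_vslice_offset() so that consecutive elements land in
 * consecutive ZA rows rather than consecutive bytes of one row.
 */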

/*
 * The ARMVectorReg elements are stored in host-endian 64-bit units.
 * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 * corresponds to storing the two 64-bit pieces in little-endian order.
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}

#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    HOST(host, ptr[BE]);                                                    \
    HOST(host + 8, ptr[!BE]);                                               \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}

DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)

#undef DO_LD
#undef DO_ST
#undef DO_LDQ
#undef DO_STQ
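
/*
 * Illustrative note on DO_LDQ/DO_STQ: ZA always keeps the two 64-bit
 * halves of a 128-bit element in little-endian order (low half in
 * ptr[0]).  For the big-endian accessors, the first doubleword read from
 * memory is the high half of the element, hence the BE-conditional swap
 * of val0/val1; the stores mirror this by writing ptr[BE] first.
 */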

/*
 * Common helper for all contiguous predicated loads.
 */

static inline QEMU_ALWAYS_INLINE
void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn,
             ClearFn *clr_fn,
             CopyFn *cpy_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no load occurs. */
        clr_fn(za, 0, reg_max);
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_READ, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch = { };

        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        cpy_fn(za, &scratch, reg_max);
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    if (!vertical) {
        memset(za, 0, reg_max);
    } else if (reg_off) {
        clr_fn(za, 0, reg_off);
    }

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            } else if (vertical) {
                clr_fn(za, reg_off, esize);
            }
            reg_off += esize;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                } else if (vertical) {
                    clr_fn(za, reg_off, esize);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
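
/*
 * Illustrative note on the loops above: reg_off is a byte offset into the
 * SVE-sized vector being transferred, and the predicate has one bit per
 * vector byte.  vg[reg_off >> 6] therefore selects the 64-bit predicate
 * word covering 64 vector bytes, and bit (reg_off & 63) within it governs
 * the element starting at reg_off.  For example, a 4-byte element at
 * reg_off == 100 is controlled by bit 36 of vg[1].
 */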

static inline QEMU_ALWAYS_INLINE
void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
                 target_ulong addr, uint32_t desc, uintptr_t ra,
                 const int esz, bool vertical,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn,
                 ClearFn *clr_fn,
                 CopyFn *cpy_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
            host_fn, tlb_fn, clr_fn, cpy_fn);
}

#define DO_LD(L, END, ESZ)                                                  \
void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,      \
                                 target_ulong addr, uint32_t desc)          \
{                                                                           \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,                \
            sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,            \
            clear_horizontal, copy_horizontal);                             \
}                                                                           \
void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,      \
                                 target_ulong addr, uint32_t desc)          \
{                                                                           \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                 \
            sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,              \
            clear_vertical_##L, copy_vertical_##L);                         \
}                                                                           \
void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg,  \
                                     target_ulong addr, uint32_t desc)      \
{                                                                           \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,               \
                sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,        \
                clear_horizontal, copy_horizontal);                         \
}                                                                           \
void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg,  \
                                     target_ulong addr, uint32_t desc)      \
{                                                                           \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,                \
                sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,          \
                clear_vertical_##L, copy_vertical_##L);                     \
}

DO_LD(b, , MO_8)
DO_LD(h, _be, MO_16)
DO_LD(h, _le, MO_16)
DO_LD(s, _be, MO_32)
DO_LD(s, _le, MO_32)
DO_LD(d, _be, MO_64)
DO_LD(d, _le, MO_64)
DO_LD(q, _be, MO_128)
DO_LD(q, _le, MO_128)

#undef DO_LD
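
/*
 * Illustrative expansion: DO_LD(h, _be, MO_16) above defines the four
 * helpers sme_ld1h_be_h, sme_ld1h_be_v, sme_ld1h_be_h_mte and
 * sme_ld1h_be_v_mte.  The horizontal forms reuse the SVE primitives
 * (sve_ld1hh_be_host/_tlb) since a horizontal slice is contiguous, while
 * the vertical forms use the tile-slice primitives together with the
 * clear_vertical_h/copy_vertical_h callbacks defined earlier.
 */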

/*
 * Common helper for all contiguous predicated stores.
 */

static inline QEMU_ALWAYS_INLINE
void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no store occurs. */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_WRITE, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, za, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            }
            reg_off += 1 << esz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                }
                reg_off += 1 << esz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
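
/*
 * Note (illustrative): unlike sme_ld1, the store path needs no ClearFn or
 * CopyFn callbacks.  Elements whose predicate bit is clear simply leave
 * memory untouched, and the MMIO slow path stores directly from ZA,
 * accepting that a bus fault leaves the store partially complete as
 * described in the comment above.
 */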

static inline QEMU_ALWAYS_INLINE
void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
                 uint32_t desc, uintptr_t ra, int esz, bool vertical,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
            vertical, host_fn, tlb_fn);
}

#define DO_ST(L, END, ESZ)                                                  \
void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,      \
                                 target_ulong addr, uint32_t desc)          \
{                                                                           \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,                \
            sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);           \
}                                                                           \
void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,      \
                                 target_ulong addr, uint32_t desc)          \
{                                                                           \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                 \
            sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);             \
}                                                                           \
void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg,  \
                                     target_ulong addr, uint32_t desc)      \
{                                                                           \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,               \
                sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);       \
}                                                                           \
void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg,  \
                                     target_ulong addr, uint32_t desc)      \
{                                                                           \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,                \
                sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);         \
}

DO_ST(b, , MO_8)
DO_ST(h, _be, MO_16)
DO_ST(h, _le, MO_16)
DO_ST(s, _be, MO_32)
DO_ST(s, _le, MO_32)
DO_ST(d, _be, MO_64)
DO_ST(d, _le, MO_64)
DO_ST(q, _be, MO_128)
DO_ST(q, _le, MO_128)

#undef DO_ST
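
/*
 * Illustrative note on the *_mte wrappers above: the 32-bit descriptor
 * packs the MTE descriptor in the bits above
 * SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT, so sme_ld1_mte/sme_st1_mte shift
 * it out and then mask the descriptor back down so that simd_oprsz() and
 * simd_data() still see the plain SVE fields.  Passing mtedesc == 0 to
 * sme_ld1/sme_st1 disables the per-element MTE checks entirely.
 */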

void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                for (col = 0; col < oprsz; ) {
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}

void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pn = vpn, *pm = vpm;
    uint64_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    zda[tile_vslice_index(row) + col] += zn[col];
                }
            }
        }
    }
}

void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                uint32_t zn_row = zn[H4(row)];
                for (col = 0; col < oprsz; ) {
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn_row;
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}

void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pn = vpn, *pm = vpm;
    uint64_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            uint64_t zn_row = zn[row];
            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    zda[tile_vslice_index(row) + col] += zn_row;
                }
            }
        }
    }
}
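
/*
 * Illustrative note on the ADDHA/ADDVA loops above: for 32-bit elements
 * the governing predicate allocates 4 bits per element, so a uint64_t
 * predicate word (pn[row >> 4]) covers 16 elements; the inner do/while
 * shifts by 4 and iterates 16 times (++row & 15).  For 64-bit elements
 * the predicate is simply read a byte at a time (pn[H1(row)]), testing
 * bit 0 of each byte.
 */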

void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, void *vst, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) << 31;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst;

    /*
     * Make a copy of float_status because this operation does not
     * update the cumulative fp exception status.  It also produces
     * default nans.
     */
    fpst = *(float_status *)vst;
    set_default_nan_mode(true, &fpst);

    for (row = 0; row < oprsz; ) {
        uint16_t pa = pn[H2(row >> 4)];
        do {
            if (pa & 1) {
                void *vza_row = vza + tile_vslice_offset(row);
                uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;

                for (col = 0; col < oprsz; ) {
                    uint16_t pb = pm[H2(col >> 4)];
                    do {
                        if (pb & 1) {
                            uint32_t *a = vza_row + H1_4(col);
                            uint32_t *m = vzm + H1_4(col);
                            *a = float32_muladd(n, *m, *a, 0, &fpst);
                        }
                        col += 4;
                        pb >>= 4;
                    } while (col & 15);
                }
            }
            row += 4;
            pa >>= 4;
        } while (row & 15);
    }
}

void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, void *vst, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint64_t neg = (uint64_t)simd_data(desc) << 63;
    uint64_t *za = vza, *zn = vzn, *zm = vzm;
    uint8_t *pn = vpn, *pm = vpm;
    float_status fpst = *(float_status *)vst;

    set_default_nan_mode(true, &fpst);

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            uint64_t *za_row = &za[tile_vslice_index(row)];
            uint64_t n = zn[row] ^ neg;

            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    uint64_t *a = &za_row[col];
                    *a = float64_muladd(n, zm[col], *a, 0, &fpst);
                }
            }
        }
    }
}

/*
 * Alter PAIR as needed for controlling predicates being false,
 * and for NEG on an enabled row element.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    /*
     * The pseudocode uses a conditional negate after the conditional zero.
     * It is simpler here to unconditionally negate before conditional zero.
     */
    pair ^= neg;
    if (!(pg & 1)) {
        pair &= 0xffff0000u;
    }
    if (!(pg & 4)) {
        pair &= 0x0000ffffu;
    }
    return pair;
}

static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
                          float_status *s_std, float_status *s_odd)
{
    float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
    float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
    float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
    float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation.  Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
     */
    t64 = float64_mul(e1r, e2r, s_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, s_std);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, s_std);
}
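
/*
 * Illustrative note: in f16mop_adj_pair above, pg holds one predicate bit
 * per vector byte, so for a pair of 16-bit elements bit 0 governs the low
 * half and bit 2 the high half -- the same two bits the callers below
 * test with the 0b0101 mask before doing any work on the pair.
 */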

void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, void *vst, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) * 0x80008000u;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst_odd, fpst_std;

    /*
     * Make a copy of float_status because this operation does not
     * update the cumulative fp exception status.  It also produces
     * default nans.  Make a second copy with round-to-odd -- see above.
     */
    fpst_std = *(float_status *)vst;
    set_default_nan_mode(true, &fpst_std);
    fpst_odd = fpst_std;
    set_float_rounding_mode(float_round_to_odd, &fpst_odd);

    for (row = 0; row < oprsz; ) {
        uint16_t prow = pn[H2(row >> 4)];
        do {
            void *vza_row = vza + tile_vslice_offset(row);
            uint32_t n = *(uint32_t *)(vzn + H1_4(row));

            n = f16mop_adj_pair(n, prow, neg);

            for (col = 0; col < oprsz; ) {
                uint16_t pcol = pm[H2(col >> 4)];
                do {
                    if (prow & pcol & 0b0101) {
                        uint32_t *a = vza_row + H1_4(col);
                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                        m = f16mop_adj_pair(m, pcol, 0);
                        *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
                    }
                    /* Advance unconditionally, or the loop never terminates. */
                    col += 4;
                    pcol >>= 4;
                } while (col & 15);
            }
            row += 4;
            prow >>= 4;
        } while (row & 15);
    }
}

void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
                        void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) * 0x80008000u;
    uint16_t *pn = vpn, *pm = vpm;

    for (row = 0; row < oprsz; ) {
        uint16_t prow = pn[H2(row >> 4)];
        do {
            void *vza_row = vza + tile_vslice_offset(row);
            uint32_t n = *(uint32_t *)(vzn + H1_4(row));

            n = f16mop_adj_pair(n, prow, neg);

            for (col = 0; col < oprsz; ) {
                uint16_t pcol = pm[H2(col >> 4)];
                do {
                    if (prow & pcol & 0b0101) {
                        uint32_t *a = vza_row + H1_4(col);
                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                        m = f16mop_adj_pair(m, pcol, 0);
                        *a = bfdotadd(*a, n, m);
                    }
                    /* Advance unconditionally, or the loop never terminates. */
                    col += 4;
                    pcol >>= 4;
                } while (col & 15);
            }
            row += 4;
            prow >>= 4;
        } while (row & 15);
    }
}

typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    bool neg = simd_data(desc);

    for (row = 0; row < oprsz; ++row) {
        uint8_t pa = pn[H1(row)];
        uint64_t *za_row = &za[tile_vslice_index(row)];
        uint64_t n = zn[row];

        for (col = 0; col < oprsz; ++col) {
            uint8_t pb = pm[H1(col)];
            uint64_t *a = &za_row[col];

            *a = fn(n, zm[col], *a, pa & pb, neg);
        }
    }
}
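
/*
 * Illustrative note for the DEF_IMOP_* macros below: the predicate byte p
 * passed in by do_imopa is expanded to a byte (or halfword) mask so that
 * inactive source elements of n contribute 0 to the dot product.  For
 * example, expand_pred_b(0x0f) == 0x00000000ffffffffull, keeping only the
 * low four bytes of n.
 */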

#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t sum0 = 0, sum1 = 0;                                            \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                              \
    sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                              \
    sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                            \
    sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                            \
    sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                            \
    sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40);                            \
    sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                            \
    sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56);                            \
    if (neg) {                                                              \
        sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1;       \
    } else {                                                                \
        sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1;       \
    }                                                                       \
    return ((uint64_t)sum1 << 32) | sum0;                                   \
}

#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_h(p);                                                  \
    /* Widen the first factor so the 16x16 products cannot overflow int. */ \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}

DEF_IMOP_32(smopa_s, int8_t, int8_t)
DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
DEF_IMOP_32(usmopa_s, uint8_t, int8_t)

DEF_IMOP_64(smopa_d, int16_t, int16_t)
DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
DEF_IMOP_64(usmopa_d, uint16_t, int16_t)

#define DEF_IMOPH(NAME) \
    void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \
                            void *vpm, uint32_t desc) \
    { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }

DEF_IMOPH(smopa_s)
DEF_IMOPH(umopa_s)
DEF_IMOPH(sumopa_s)
DEF_IMOPH(usmopa_s)
DEF_IMOPH(smopa_d)
DEF_IMOPH(umopa_d)
DEF_IMOPH(sumopa_d)
DEF_IMOPH(usmopa_d)