simd_util-inl.h (12991B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 // Misc utilities for SIMD operations 7 8 #if defined(LIB_JXL_SIMD_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE) 9 #ifdef LIB_JXL_SIMD_UTIL_INL_H_ 10 #undef LIB_JXL_SIMD_UTIL_INL_H_ 11 #else 12 #define LIB_JXL_SIMD_UTIL_INL_H_ 13 #endif 14 15 #include <hwy/highway.h> 16 17 #include "lib/jxl/base/compiler_specific.h" 18 19 HWY_BEFORE_NAMESPACE(); 20 namespace jxl { 21 namespace HWY_NAMESPACE { 22 23 #if HWY_CAP_GE512 24 using hwy::HWY_NAMESPACE::Half; 25 using hwy::HWY_NAMESPACE::Vec; 26 template <size_t i, class DF, class V> 27 HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) { 28 using HF = Half<DF>; 29 using HHF = Half<HF>; 30 auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v); 31 return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half); 32 } 33 34 template <class DF, class V> 35 HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) { 36 using HF = Half<DF>; 37 return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0)); 38 } 39 40 #endif 41 42 // Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be 43 // aligned. 
template <class DF, class V, typename T>
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
#if HWY_TARGET == HWY_SCALAR
  // Scalar target: each vector is written with a single Store, v1 right
  // after v0.
  Store(v0, df, mem);
  Store(v1, df, mem + 1);
#else
  // Every vector target starts from the same lower/upper interleave of the
  // two inputs; only the way the results are written out differs.
  const auto lo = InterleaveLower(df, v0, v1);
  const auto hi = InterleaveUpper(df, v0, v1);
#if !HWY_CAP_GE256
  Store(lo, df, mem);
  Store(hi, df, mem + Lanes(df));
#else
#if HWY_CAP_GE512
  // Anything other than an 8-lane vector is regrouped quarter by quarter.
  // NOTE(review): presumably needed because the interleave ops act per
  // 128-bit block - confirm against the Highway reference.
  if (Lanes(df) != 8) {
    Store(Concat4(df, Quarter<0>(df, lo), Quarter<0>(df, hi),
                  Quarter<1>(df, lo), Quarter<1>(df, hi)),
          df, mem);
    Store(Concat4(df, Quarter<2>(df, lo), Quarter<2>(df, hi),
                  Quarter<3>(df, lo), Quarter<3>(df, hi)),
          df, mem + Lanes(df));
    return;
  }
#endif
  // First store takes the lower halves of (lo, hi), second store the upper
  // halves.
  Store(ConcatLowerLower(df, hi, lo), df, mem);
  Store(ConcatUpperUpper(df, hi, lo), df, mem + Lanes(df));
#endif
#endif
}

// Stores v0[0], v1[0], v2[0], v3[0], v0[1] ... to mem, in this order. Mem must
// be aligned.
76 template <class DF, class V, typename T> 77 void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, T* mem) { 78 static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); 79 #if HWY_TARGET == HWY_SCALAR 80 Store(v0, df, mem); 81 Store(v1, df, mem + 1); 82 Store(v2, df, mem + 2); 83 Store(v3, df, mem + 3); 84 #elif !HWY_CAP_GE256 85 auto t0 = InterleaveLower(df, v0, v2); 86 auto t1 = InterleaveLower(df, v1, v3); 87 auto t2 = InterleaveUpper(df, v0, v2); 88 auto t3 = InterleaveUpper(df, v1, v3); 89 Store(InterleaveLower(df, t0, t1), df, mem); 90 Store(InterleaveUpper(df, t0, t1), df, mem + Lanes(df)); 91 Store(InterleaveLower(df, t2, t3), df, mem + 2 * Lanes(df)); 92 Store(InterleaveUpper(df, t2, t3), df, mem + 3 * Lanes(df)); 93 #elif !HWY_CAP_GE512 94 auto t0 = InterleaveLower(df, v0, v2); 95 auto t1 = InterleaveLower(df, v1, v3); 96 auto t2 = InterleaveUpper(df, v0, v2); 97 auto t3 = InterleaveUpper(df, v1, v3); 98 99 auto m0 = InterleaveLower(df, t0, t1); 100 auto m1 = InterleaveUpper(df, t0, t1); 101 auto m2 = InterleaveLower(df, t2, t3); 102 auto m3 = InterleaveUpper(df, t2, t3); 103 104 Store(ConcatLowerLower(df, m1, m0), df, mem); 105 Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df)); 106 Store(ConcatUpperUpper(df, m1, m0), df, mem + 2 * Lanes(df)); 107 Store(ConcatUpperUpper(df, m3, m2), df, mem + 3 * Lanes(df)); 108 #else 109 auto t0 = InterleaveLower(df, v0, v2); 110 auto t1 = InterleaveLower(df, v1, v3); 111 auto t2 = InterleaveUpper(df, v0, v2); 112 auto t3 = InterleaveUpper(df, v1, v3); 113 114 auto m0 = InterleaveLower(df, t0, t1); 115 auto m1 = InterleaveUpper(df, t0, t1); 116 auto m2 = InterleaveLower(df, t2, t3); 117 auto m3 = InterleaveUpper(df, t2, t3); 118 119 Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2), 120 Quarter<0>(df, m3)), 121 df, mem); 122 Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2), 123 Quarter<1>(df, m3)), 124 df, mem + Lanes(df)); 125 
Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2), 126 Quarter<2>(df, m3)), 127 df, mem + 2 * Lanes(df)); 128 Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2), 129 Quarter<3>(df, m3)), 130 df, mem + 3 * Lanes(df)); 131 #endif 132 } 133 134 // Stores v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0], v0[1] ... to 135 // mem, in this order. Mem must be aligned. 136 template <class DF, class V> 137 void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, V v4, V v5, V v6, 138 V v7, float* mem) { 139 #if HWY_TARGET == HWY_SCALAR 140 Store(v0, df, mem); 141 Store(v1, df, mem + 1); 142 Store(v2, df, mem + 2); 143 Store(v3, df, mem + 3); 144 Store(v4, df, mem + 4); 145 Store(v5, df, mem + 5); 146 Store(v6, df, mem + 6); 147 Store(v7, df, mem + 7); 148 #elif !HWY_CAP_GE256 149 auto t0 = InterleaveLower(df, v0, v4); 150 auto t1 = InterleaveLower(df, v1, v5); 151 auto t2 = InterleaveLower(df, v2, v6); 152 auto t3 = InterleaveLower(df, v3, v7); 153 auto t4 = InterleaveUpper(df, v0, v4); 154 auto t5 = InterleaveUpper(df, v1, v5); 155 auto t6 = InterleaveUpper(df, v2, v6); 156 auto t7 = InterleaveUpper(df, v3, v7); 157 158 auto w0 = InterleaveLower(df, t0, t2); 159 auto w1 = InterleaveLower(df, t1, t3); 160 auto w2 = InterleaveUpper(df, t0, t2); 161 auto w3 = InterleaveUpper(df, t1, t3); 162 auto w4 = InterleaveLower(df, t4, t6); 163 auto w5 = InterleaveLower(df, t5, t7); 164 auto w6 = InterleaveUpper(df, t4, t6); 165 auto w7 = InterleaveUpper(df, t5, t7); 166 167 Store(InterleaveLower(df, w0, w1), df, mem); 168 Store(InterleaveUpper(df, w0, w1), df, mem + Lanes(df)); 169 Store(InterleaveLower(df, w2, w3), df, mem + 2 * Lanes(df)); 170 Store(InterleaveUpper(df, w2, w3), df, mem + 3 * Lanes(df)); 171 Store(InterleaveLower(df, w4, w5), df, mem + 4 * Lanes(df)); 172 Store(InterleaveUpper(df, w4, w5), df, mem + 5 * Lanes(df)); 173 Store(InterleaveLower(df, w6, w7), df, mem + 6 * Lanes(df)); 174 Store(InterleaveUpper(df, 
w6, w7), df, mem + 7 * Lanes(df)); 175 #elif !HWY_CAP_GE512 176 auto t0 = InterleaveLower(df, v0, v4); 177 auto t1 = InterleaveLower(df, v1, v5); 178 auto t2 = InterleaveLower(df, v2, v6); 179 auto t3 = InterleaveLower(df, v3, v7); 180 auto t4 = InterleaveUpper(df, v0, v4); 181 auto t5 = InterleaveUpper(df, v1, v5); 182 auto t6 = InterleaveUpper(df, v2, v6); 183 auto t7 = InterleaveUpper(df, v3, v7); 184 185 auto w0 = InterleaveLower(df, t0, t2); 186 auto w1 = InterleaveLower(df, t1, t3); 187 auto w2 = InterleaveUpper(df, t0, t2); 188 auto w3 = InterleaveUpper(df, t1, t3); 189 auto w4 = InterleaveLower(df, t4, t6); 190 auto w5 = InterleaveLower(df, t5, t7); 191 auto w6 = InterleaveUpper(df, t4, t6); 192 auto w7 = InterleaveUpper(df, t5, t7); 193 194 auto m0 = InterleaveLower(df, w0, w1); 195 auto m1 = InterleaveUpper(df, w0, w1); 196 auto m2 = InterleaveLower(df, w2, w3); 197 auto m3 = InterleaveUpper(df, w2, w3); 198 auto m4 = InterleaveLower(df, w4, w5); 199 auto m5 = InterleaveUpper(df, w4, w5); 200 auto m6 = InterleaveLower(df, w6, w7); 201 auto m7 = InterleaveUpper(df, w6, w7); 202 203 Store(ConcatLowerLower(df, m1, m0), df, mem); 204 Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df)); 205 Store(ConcatLowerLower(df, m5, m4), df, mem + 2 * Lanes(df)); 206 Store(ConcatLowerLower(df, m7, m6), df, mem + 3 * Lanes(df)); 207 Store(ConcatUpperUpper(df, m1, m0), df, mem + 4 * Lanes(df)); 208 Store(ConcatUpperUpper(df, m3, m2), df, mem + 5 * Lanes(df)); 209 Store(ConcatUpperUpper(df, m5, m4), df, mem + 6 * Lanes(df)); 210 Store(ConcatUpperUpper(df, m7, m6), df, mem + 7 * Lanes(df)); 211 #else 212 auto t0 = InterleaveLower(df, v0, v4); 213 auto t1 = InterleaveLower(df, v1, v5); 214 auto t2 = InterleaveLower(df, v2, v6); 215 auto t3 = InterleaveLower(df, v3, v7); 216 auto t4 = InterleaveUpper(df, v0, v4); 217 auto t5 = InterleaveUpper(df, v1, v5); 218 auto t6 = InterleaveUpper(df, v2, v6); 219 auto t7 = InterleaveUpper(df, v3, v7); 220 221 auto w0 = 
InterleaveLower(df, t0, t2); 222 auto w1 = InterleaveLower(df, t1, t3); 223 auto w2 = InterleaveUpper(df, t0, t2); 224 auto w3 = InterleaveUpper(df, t1, t3); 225 auto w4 = InterleaveLower(df, t4, t6); 226 auto w5 = InterleaveLower(df, t5, t7); 227 auto w6 = InterleaveUpper(df, t4, t6); 228 auto w7 = InterleaveUpper(df, t5, t7); 229 230 auto m0 = InterleaveLower(df, w0, w1); 231 auto m1 = InterleaveUpper(df, w0, w1); 232 auto m2 = InterleaveLower(df, w2, w3); 233 auto m3 = InterleaveUpper(df, w2, w3); 234 auto m4 = InterleaveLower(df, w4, w5); 235 auto m5 = InterleaveUpper(df, w4, w5); 236 auto m6 = InterleaveLower(df, w6, w7); 237 auto m7 = InterleaveUpper(df, w6, w7); 238 239 Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2), 240 Quarter<0>(df, m3)), 241 df, mem); 242 Store(Concat4(df, Quarter<0>(df, m4), Quarter<0>(df, m5), Quarter<0>(df, m6), 243 Quarter<0>(df, m7)), 244 df, mem + Lanes(df)); 245 Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2), 246 Quarter<1>(df, m3)), 247 df, mem + 2 * Lanes(df)); 248 Store(Concat4(df, Quarter<1>(df, m4), Quarter<1>(df, m5), Quarter<1>(df, m6), 249 Quarter<1>(df, m7)), 250 df, mem + 3 * Lanes(df)); 251 Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2), 252 Quarter<2>(df, m3)), 253 df, mem + 4 * Lanes(df)); 254 Store(Concat4(df, Quarter<2>(df, m4), Quarter<2>(df, m5), Quarter<2>(df, m6), 255 Quarter<2>(df, m7)), 256 df, mem + 5 * Lanes(df)); 257 Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2), 258 Quarter<3>(df, m3)), 259 df, mem + 6 * Lanes(df)); 260 Store(Concat4(df, Quarter<3>(df, m4), Quarter<3>(df, m5), Quarter<3>(df, m6), 261 Quarter<3>(df, m7)), 262 df, mem + 7 * Lanes(df)); 263 #endif 264 } 265 266 #if HWY_CAP_GE256 267 JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from, 268 int32_t* JXL_RESTRICT to, size_t fromstride) { 269 const HWY_CAPPED(int32_t, 8) d; 270 auto i0 = Load(d, from); 271 auto 
i1 = Load(d, from + 1 * fromstride); 272 auto i2 = Load(d, from + 2 * fromstride); 273 auto i3 = Load(d, from + 3 * fromstride); 274 auto i4 = Load(d, from + 4 * fromstride); 275 auto i5 = Load(d, from + 5 * fromstride); 276 auto i6 = Load(d, from + 6 * fromstride); 277 auto i7 = Load(d, from + 7 * fromstride); 278 279 const auto q0 = InterleaveLower(d, i0, i2); 280 const auto q1 = InterleaveLower(d, i1, i3); 281 const auto q2 = InterleaveUpper(d, i0, i2); 282 const auto q3 = InterleaveUpper(d, i1, i3); 283 const auto q4 = InterleaveLower(d, i4, i6); 284 const auto q5 = InterleaveLower(d, i5, i7); 285 const auto q6 = InterleaveUpper(d, i4, i6); 286 const auto q7 = InterleaveUpper(d, i5, i7); 287 288 const auto r0 = InterleaveLower(d, q0, q1); 289 const auto r1 = InterleaveUpper(d, q0, q1); 290 const auto r2 = InterleaveLower(d, q2, q3); 291 const auto r3 = InterleaveUpper(d, q2, q3); 292 const auto r4 = InterleaveLower(d, q4, q5); 293 const auto r5 = InterleaveUpper(d, q4, q5); 294 const auto r6 = InterleaveLower(d, q6, q7); 295 const auto r7 = InterleaveUpper(d, q6, q7); 296 297 i0 = ConcatLowerLower(d, r4, r0); 298 i1 = ConcatLowerLower(d, r5, r1); 299 i2 = ConcatLowerLower(d, r6, r2); 300 i3 = ConcatLowerLower(d, r7, r3); 301 i4 = ConcatUpperUpper(d, r4, r0); 302 i5 = ConcatUpperUpper(d, r5, r1); 303 i6 = ConcatUpperUpper(d, r6, r2); 304 i7 = ConcatUpperUpper(d, r7, r3); 305 306 Store(i0, d, to); 307 Store(i1, d, to + 1 * 8); 308 Store(i2, d, to + 2 * 8); 309 Store(i3, d, to + 3 * 8); 310 Store(i4, d, to + 4 * 8); 311 Store(i5, d, to + 5 * 8); 312 Store(i6, d, to + 6 * 8); 313 Store(i7, d, to + 7 * 8); 314 } 315 #elif HWY_TARGET != HWY_SCALAR 316 JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from, 317 int32_t* JXL_RESTRICT to, size_t fromstride) { 318 const HWY_CAPPED(int32_t, 4) d; 319 for (size_t n = 0; n < 8; n += 4) { 320 for (size_t m = 0; m < 8; m += 4) { 321 auto p0 = Load(d, from + n * fromstride + m); 322 auto p1 = Load(d, from + (n + 1) 
* fromstride + m); 323 auto p2 = Load(d, from + (n + 2) * fromstride + m); 324 auto p3 = Load(d, from + (n + 3) * fromstride + m); 325 const auto q0 = InterleaveLower(d, p0, p2); 326 const auto q1 = InterleaveLower(d, p1, p3); 327 const auto q2 = InterleaveUpper(d, p0, p2); 328 const auto q3 = InterleaveUpper(d, p1, p3); 329 330 const auto r0 = InterleaveLower(d, q0, q1); 331 const auto r1 = InterleaveUpper(d, q0, q1); 332 const auto r2 = InterleaveLower(d, q2, q3); 333 const auto r3 = InterleaveUpper(d, q2, q3); 334 Store(r0, d, to + m * 8 + n); 335 Store(r1, d, to + (1 + m) * 8 + n); 336 Store(r2, d, to + (2 + m) * 8 + n); 337 Store(r3, d, to + (3 + m) * 8 + n); 338 } 339 } 340 } 341 342 #endif 343 344 // NOLINTNEXTLINE(google-readability-namespace-comments) 345 } // namespace HWY_NAMESPACE 346 } // namespace jxl 347 HWY_AFTER_NAMESPACE(); 348 349 #endif // LIB_JXL_SIMD_UTIL_INL_H_