qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

hvx_misc.c (17251B)


/*
 *  Copyright(c) 2021-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <limits.h>

int err;

static void __check(int line, int i, int j, uint64_t result, uint64_t expect)
{
    if (result != expect) {
        printf("ERROR at line %d: [%d][%d] 0x%016llx != 0x%016llx\n",
               line, i, j, result, expect);
        err++;
    }
}

#define check(RES, EXP) __check(__LINE__, 0, 0, RES, EXP)

#define MAX_VEC_SIZE_BYTES         128

typedef union {
    uint64_t ud[MAX_VEC_SIZE_BYTES / 8];
    int64_t   d[MAX_VEC_SIZE_BYTES / 8];
    uint32_t uw[MAX_VEC_SIZE_BYTES / 4];
    int32_t   w[MAX_VEC_SIZE_BYTES / 4];
    uint16_t uh[MAX_VEC_SIZE_BYTES / 2];
    int16_t   h[MAX_VEC_SIZE_BYTES / 2];
    uint8_t  ub[MAX_VEC_SIZE_BYTES / 1];
    int8_t    b[MAX_VEC_SIZE_BYTES / 1];
} MMVector;

#define BUFSIZE      16
#define OUTSIZE      16
#define MASKMOD      3

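/*
 * Global test data: buffer0 and buffer1 hold the inputs, mask drives the
 * masked-store tests, and each test writes into output and compares it
 * against the values computed into expect.
 */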
MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));

#define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \
static void check_output_##FIELD(int line, size_t num_vectors) \
{ \
    for (int i = 0; i < num_vectors; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            __check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \
        } \
    } \
}

CHECK_OUTPUT_FUNC(d,  8)
CHECK_OUTPUT_FUNC(w,  4)
CHECK_OUTPUT_FUNC(h,  2)
CHECK_OUTPUT_FUNC(b,  1)

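/* Fill the inputs with distinct byte patterns and build the store mask */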
static void init_buffers(void)
{
    int counter0 = 0;
    int counter1 = 17;
    for (int i = 0; i < BUFSIZE; i++) {
        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) {
            buffer0[i].b[j] = counter0++;
            buffer1[i].b[j] = counter1++;
        }
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1;
        }
    }
}

static void test_load_tmp(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        /*
         * Load into v12 as .tmp, then use it in the next packet
         * Should get the new value within the same packet and
         * the old value in the next packet
         */
        asm("v3 = vmem(%0 + #0)\n\t"
            "r1 = #1\n\t"
            "v12 = vsplat(r1)\n\t"
            "{\n\t"
            "    v12.tmp = vmem(%1 + #0)\n\t"
            "    v4.w = vadd(v12.w, v3.w)\n\t"
            "}\n\t"
            "v4.w = vadd(v4.w, v12.w)\n\t"
            "vmem(%2 + #0) = v4\n\t"
            : : "r"(p0), "r"(p1), "r"(pout)
            : "r1", "v12", "v3", "v4", "v6", "memory");
        p0 += sizeof(MMVector);
        p1 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

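/*
 * A .cur load makes the loaded value visible to the other instructions in
 * the same packet, so the store in the packet below should write the
 * freshly loaded data.
 */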
static void test_load_cur(void)
{
    void *p0 = buffer0;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        asm("{\n\t"
            "    v2.cur = vmem(%0 + #0)\n\t"
            "    vmem(%1 + #0) = v2\n\t"
            "}\n\t"
            : : "r"(p0), "r"(pout) : "v2", "memory");
        p0 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].uw[j] = buffer0[i].uw[j];
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

static void test_load_aligned(void)
{
    /* Aligned loads ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

static void test_load_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmemu(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));

    check_output_w(__LINE__, 1);
}

static void test_store_aligned(void)
{
    /* Aligned stores ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

static void test_store_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmemu(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    memcpy(expect, buffer0, 2 * sizeof(MMVector));
    memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));

    check_output_w(__LINE__, 2);
}

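/*
 * Conditional (masked) vector stores write only the elements selected by the
 * predicate register; lanes that are not written keep the 0xff fill pattern
 * set up below.
 */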
static void test_masked_store(bool invert)
{
    void *p0 = buffer0;
    void *pmask = mask;
    void *pout = output;

    memset(expect, 0xff, sizeof(expect));
    memset(output, 0xff, sizeof(expect));

    for (int i = 0; i < BUFSIZE; i++) {
        if (invert) {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (!q0) vmem(%2) = v5\n\t"             /* Inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        } else {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (q0) vmem(%2) = v5\n\t"             /* Non-inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        }
        p0 += sizeof(MMVector);
        pmask += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            if (invert) {
                if (i + j % MASKMOD != 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            } else {
                if (i + j % MASKMOD == 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            }
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

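/*
 * A .new store uses a value produced earlier in the same packet, so the
 * store should observe the data loaded into v2 by this packet.
 */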
static void test_new_value_store(void)
{
    void *p0 = buffer0;
    void *pout = output;

    asm("{\n\t"
        "    v2 = vmem(%0 + #0)\n\t"
        "    vmem(%1 + #0) = v2.new\n\t"
        "}\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

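/*
 * Pack several vector-producing instructions into one packet; judging by the
 * name, this is meant to stress the maximum number of temporary vector
 * results a single packet can produce.
 */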
static void test_max_temps()
{
    void *p0 = buffer0;
    void *pout = output;

    asm("v0 = vmem(%0 + #0)\n\t"
        "v1 = vmem(%0 + #1)\n\t"
        "v2 = vmem(%0 + #2)\n\t"
        "v3 = vmem(%0 + #3)\n\t"
        "v4 = vmem(%0 + #4)\n\t"
        "{\n\t"
        "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
        "    v2.b = vshuffe(v3.b, v2.b)\n\t"
        "    v3.w = vadd(v1.w, v4.w)\n\t"
        "    v4.tmp = vmem(%0 + #5)\n\t"
        "}\n\t"
        "vmem(%1 + #0) = v0\n\t"
        "vmem(%1 + #1) = v1\n\t"
        "vmem(%1 + #2) = v2\n\t"
        "vmem(%1 + #3) = v3\n\t"
        "vmem(%1 + #4) = v4\n\t"
        : : "r"(p0), "r"(pout) : "memory");

    /* The first two vectors come from the vadd-pair instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
        expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
    }
    /* The third vector comes from the vshuffe instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
        expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
                          (buffer0[3].uh[i] & 0xff) << 8;
    }
    /* The fourth vector comes from the vadd-single instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
    }
    /*
     * The fifth vector comes from the load to v4
     * make sure the .tmp is dropped
     */
    expect[4] = buffer0[4];

    check_output_b(__LINE__, 5);
}

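/*
 * Macro-generated element-wise tests: VEC_OP1/VEC_OP2 emit the HVX
 * instruction under test, and TEST_VEC_OP1/TEST_VEC_OP2 compare the result
 * against the equivalent C operator applied to each element.
 */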
#define VEC_OP1(ASM, EL, IN, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ")\n\t" \
        "vmem(%1 + #0) = v2\n\t" \
        : : "r"(IN), "r"(OUT) : "v2", "memory")

#define VEC_OP2(ASM, EL, IN0, IN1, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \
        "vmem(%2 + #0) = v2\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory")

#define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    void *pin = buffer0; \
    void *pout = output; \
    for (int i = 0; i < BUFSIZE; i++) { \
        VEC_OP1(ASM, EL, pin, pout); \
        pin += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            expect[i].FIELD[j] = OP buffer0[i].FIELD[j]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}

#define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    for (int i = 0; i < BUFSIZE; i++) { \
        VEC_OP2(ASM, EL, p0, p1, pout); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            expect[i].FIELD[j] = buffer0[i].FIELD[j] OP buffer1[i].FIELD[j]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}

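/*
 * Predicate-register tests: each input is compared against THRESHOLD to set
 * q0/q1, the two predicates are combined with the operation under test, and
 * 0xff is stored to the lanes where the result is true.
 */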
#define THRESHOLD        31

#define PRED_OP2(ASM, IN0, IN1, OUT, INV) \
    asm("r4 = #%3\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "v2 = vmem(%0 + #0)\n\t" \
        "q0 = vcmp.gt(v2.b, v1.b)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "q1 = vcmp.gt(v3.b, v1.b)\n\t" \
        "q2 = " #ASM "(q0, " INV "q1)\n\t" \
        "r4 = #0xff\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "if (q2) vmem(%2 + #0) = v1\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \
        : "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory")

#define TEST_PRED_OP2(NAME, ASM, OP, INV) \
static void test_##NAME(bool invert) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    memset(output, 0, sizeof(expect)); \
    for (int i = 0; i < BUFSIZE; i++) { \
        PRED_OP2(ASM, p0, p1, pout, INV); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \
            bool p0 = (buffer0[i].b[j] > THRESHOLD); \
            bool p1 = (buffer1[i].b[j] > THRESHOLD); \
            if (invert) { \
                expect[i].b[j] = (p0 OP !p1) ? 0xff : 0x00; \
            } else { \
                expect[i].b[j] = (p0 OP p1) ? 0xff : 0x00; \
            } \
        } \
    } \
    check_output_b(__LINE__, BUFSIZE); \
}

TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)

TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")

static void test_vadduwsat(void)
{
    /*
     * Test for saturation by adding two numbers that add to more than UINT_MAX
     * and make sure the result saturates to UINT_MAX
     */
    const uint32_t x = 0xffff0000;
    const uint32_t y = 0x000fffff;

    memset(expect, 0x12, sizeof(MMVector));
    memset(output, 0x34, sizeof(MMVector));

    asm volatile ("v10 = vsplat(%0)\n\t"
                  "v11 = vsplat(%1)\n\t"
                  "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
                  "vmem(%2+#0) = v21\n\t"
                  : /* no outputs */
                  : "r"(x), "r"(y), "r"(output)
                  : "v10", "v11", "v21", "memory");

    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
        expect[0].uw[j] = UINT_MAX;
    }

    check_output_w(__LINE__, 1);
}

static void test_vsubuwsat_dv(void)
{
    /*
     * Test for saturation by subtracting two numbers where the result is
     * negative and make sure the result saturates to zero
     *
     * vsubuwsat_dv operates on an HVX register pair, so we'll have a
     * pair of subtractions
     *     w - x < 0
     *     y - z < 0
     */
    const uint32_t w = 0x000000b7;
    const uint32_t x = 0xffffff4e;
    const uint32_t y = 0x31fe88e7;
    const uint32_t z = 0x7fffff79;

    memset(expect, 0x12, sizeof(MMVector) * 2);
    memset(output, 0x34, sizeof(MMVector) * 2);

    asm volatile ("v16 = vsplat(%0)\n\t"
                  "v17 = vsplat(%1)\n\t"
                  "v26 = vsplat(%2)\n\t"
                  "v27 = vsplat(%3)\n\t"
                  "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
                  "vmem(%4+#0) = v24\n\t"
                  "vmem(%4+#1) = v25\n\t"
                  : /* no outputs */
                  : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
                  : "v16", "v17", "v24", "v25", "v26", "v27", "memory");

    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
        expect[0].uw[j] = 0x00000000;
        expect[1].uw[j] = 0x00000000;
    }

    check_output_w(__LINE__, 2);
}

static void test_vshuff(void)
{
    /* Test that vshuff works when the two operands are the same register */
    const uint32_t splat = 0x089be55c;
    const uint32_t shuff = 0x454fa926;
    MMVector v0, v1;

    memset(expect, 0x12, sizeof(MMVector));
    memset(output, 0x34, sizeof(MMVector));

    asm volatile("v25 = vsplat(%0)\n\t"
                 "vshuff(v25, v25, %1)\n\t"
                 "vmem(%2 + #0) = v25\n\t"
                 : /* no outputs */
                 : "r"(splat), "r"(shuff), "r"(output)
                 : "v25", "memory");

    /*
     * Hexagon semantics pass the operands by value, so create two copies
     * of the vsplat result.
     */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        v0.uw[i] = splat;
        v1.uw[i] = splat;
    }
    /* Do the vshuff operation */
    for (int offset = 1; offset < MAX_VEC_SIZE_BYTES; offset <<= 1) {
        if (shuff & offset) {
            for (int k = 0; k < MAX_VEC_SIZE_BYTES; k++) {
                if (!(k & offset)) {
                    uint8_t tmp = v0.ub[k];
                    v0.ub[k] = v1.ub[k + offset];
                    v1.ub[k + offset] = tmp;
                }
            }
        }
    }
    /* Put the result in the expect buffer for verification */
    expect[0] = v1;

    check_output_b(__LINE__, 1);
}

int main()
{
    init_buffers();

    test_load_tmp();
    test_load_cur();
    test_load_aligned();
    test_load_unaligned();
    test_store_aligned();
    test_store_unaligned();
    test_masked_store(false);
    test_masked_store(true);
    test_new_value_store();
    test_max_temps();

    test_vadd_w();
    test_vadd_h();
    test_vadd_b();
    test_vsub_w();
    test_vsub_h();
    test_vsub_b();
    test_vxor();
    test_vand();
    test_vor();
    test_vnot();

    test_pred_or(false);
    test_pred_or_n(true);
    test_pred_and(false);
    test_pred_and_n(true);
    test_pred_xor(false);

    test_vadduwsat();
    test_vsubuwsat_dv();

    test_vshuff();

    puts(err ? "FAIL" : "PASS");
    return err ? 1 : 0;
}