qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

scatter_gather.c (30881B)


      1 /*
      2  *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
      3  *
      4  *  This program is free software; you can redistribute it and/or modify
      5  *  it under the terms of the GNU General Public License as published by
      6  *  the Free Software Foundation; either version 2 of the License, or
      7  *  (at your option) any later version.
      8  *
      9  *  This program is distributed in the hope that it will be useful,
     10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12  *  GNU General Public License for more details.
     13  *
     14  *  You should have received a copy of the GNU General Public License
     15  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
     16  */
     17 
/*
 * This example tests the HVX scatter/gather instructions
 *
 * See section 5.13 of the V68 HVX Programmer's Reference
 *
 * There are 3 main classes of operations
 *     _16                 16-bit elements and 16-bit offsets
 *     _32                 32-bit elements and 32-bit offsets
 *     _16_32              16-bit elements and 32-bit offsets
 *
 * There are also masked and accumulate versions
 */
     30 
     31 #include <stdio.h>
     32 #include <string.h>
     33 #include <stdlib.h>
     34 #include <inttypes.h>
     35 
/*
 * Local definitions of the HVX types, built with the compiler's vector
 * extension: a 128-byte vector, a 256-byte vector pair and a 128-byte
 * predicate.  All three are aligned to 128 bytes.
 */
typedef long HVX_Vector       __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPair   __attribute__((__vector_size__(256)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPred   __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
     42 
     43 #define VSCATTER_16(BASE, RGN, OFF, VALS) \
     44     __builtin_HEXAGON_V6_vscattermh_128B((int)BASE, RGN, OFF, VALS)
     45 #define VSCATTER_16_MASKED(MASK, BASE, RGN, OFF, VALS) \
     46     __builtin_HEXAGON_V6_vscattermhq_128B(MASK, (int)BASE, RGN, OFF, VALS)
     47 #define VSCATTER_32(BASE, RGN, OFF, VALS) \
     48     __builtin_HEXAGON_V6_vscattermw_128B((int)BASE, RGN, OFF, VALS)
     49 #define VSCATTER_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
     50     __builtin_HEXAGON_V6_vscattermwq_128B(MASK, (int)BASE, RGN, OFF, VALS)
     51 #define VSCATTER_16_32(BASE, RGN, OFF, VALS) \
     52     __builtin_HEXAGON_V6_vscattermhw_128B((int)BASE, RGN, OFF, VALS)
     53 #define VSCATTER_16_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
     54     __builtin_HEXAGON_V6_vscattermhwq_128B(MASK, (int)BASE, RGN, OFF, VALS)
     55 #define VSCATTER_16_ACC(BASE, RGN, OFF, VALS) \
     56     __builtin_HEXAGON_V6_vscattermh_add_128B((int)BASE, RGN, OFF, VALS)
     57 #define VSCATTER_32_ACC(BASE, RGN, OFF, VALS) \
     58     __builtin_HEXAGON_V6_vscattermw_add_128B((int)BASE, RGN, OFF, VALS)
     59 #define VSCATTER_16_32_ACC(BASE, RGN, OFF, VALS) \
     60     __builtin_HEXAGON_V6_vscattermhw_add_128B((int)BASE, RGN, OFF, VALS)
     61 
     62 #define VGATHER_16(DSTADDR, BASE, RGN, OFF) \
     63     __builtin_HEXAGON_V6_vgathermh_128B(DSTADDR, (int)BASE, RGN, OFF)
     64 #define VGATHER_16_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
     65     __builtin_HEXAGON_V6_vgathermhq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
     66 #define VGATHER_32(DSTADDR, BASE, RGN, OFF) \
     67     __builtin_HEXAGON_V6_vgathermw_128B(DSTADDR, (int)BASE, RGN, OFF)
     68 #define VGATHER_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
     69     __builtin_HEXAGON_V6_vgathermwq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
     70 #define VGATHER_16_32(DSTADDR, BASE, RGN, OFF) \
     71     __builtin_HEXAGON_V6_vgathermhw_128B(DSTADDR, (int)BASE, RGN, OFF)
     72 #define VGATHER_16_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
     73     __builtin_HEXAGON_V6_vgathermhwq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
     74 
     75 #define VSHUFF_H(V) \
     76     __builtin_HEXAGON_V6_vshuffh_128B(V)
     77 #define VSPLAT_H(X) \
     78     __builtin_HEXAGON_V6_lvsplath_128B(X)
     79 #define VAND_VAL(PRED, VAL) \
     80     __builtin_HEXAGON_V6_vandvrt_128B(PRED, VAL)
     81 #define VDEAL_H(V) \
     82     __builtin_HEXAGON_V6_vdealh_128B(V)
     83 
/* number of mismatches found so far; incremented by check_buffer() */
int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer (in elements, one full matrix) */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)

/*
 * fake vtcm - put buffers together and force alignment
 *
 * Each scatter buffer holds a whole matrix; each gather buffer holds one
 * element per matrix row.
 */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int   vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));

/* declare the arrays of reference values (filled by the scalar_* models) */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int   vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int   vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];

/* declare the arrays of offsets (byte offsets from the buffer start) */
unsigned short half_offsets[MATRIX_SIZE];
unsigned int   word_offsets[MATRIX_SIZE];

/* declare the arrays of values (plain, accumulate and masked variants) */
unsigned short half_values[MATRIX_SIZE];
unsigned short half_values_acc[MATRIX_SIZE];
unsigned short half_values_masked[MATRIX_SIZE];
unsigned int   word_values[MATRIX_SIZE];
unsigned int   word_values_acc[MATRIX_SIZE];
unsigned int   word_values_masked[MATRIX_SIZE];

/* declare the arrays of predicates (each entry is all-ones or zero) */
unsigned short half_predicates[MATRIX_SIZE];
unsigned int   word_predicates[MATRIX_SIZE];

/* make this big enough for all the intrinsics */
const size_t region_len = sizeof(vtcm);

/* optionally add sync instructions (0 turns sync_scatter/sync_gather into no-ops) */
#define SYNC_VECTOR 1
    131 
/*
 * Synchronize after a scatter to the buffer at addr.
 *
 * addr: start of the buffer that was scattered to; it is used both as the
 *       scatter_release address and as the source of the dummy load.
 *
 * Does nothing when SYNC_VECTOR is 0.
 */
static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    /*
     * Do the scatter release followed by a dummy load to complete the
     * synchronization.  Normally the dummy load would be deferred as
     * long as possible to minimize stalls.
     */
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
    /*
     * use volatile to force the load: the loaded vector lands in a
     * volatile local, and the self-assignment keeps that local "used"
     */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
    145 
    146 static void sync_gather(void *addr)
    147 {
    148 #if SYNC_VECTOR
    149     /* use volatile to force the load */
    150     volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
    151 #endif
    152 }
    153 
/* optionally print the results (the print_* helpers are silent when 0) */
#define PRINT_DATA 0

/* byte used to pre-fill the vtcm and the scatter reference buffers */
#define FILL_CHAR       '.'
    158 
    159 /* fill vtcm scratch with ee */
    160 void prefill_vtcm_scratch(void)
    161 {
    162     memset(&vtcm, FILL_CHAR, sizeof(vtcm));
    163 }
    164 
    165 /* create byte offsets to be a diagonal of the matrix with 16 bit elements */
    166 void create_offsets_values_preds_16(void)
    167 {
    168     unsigned short half_element = 0;
    169     unsigned short half_element_masked = 0;
    170     char letter = 'A';
    171     char letter_masked = '@';
    172 
    173     for (int i = 0; i < MATRIX_SIZE; i++) {
    174         half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
    175 
    176         half_element = 0;
    177         half_element_masked = 0;
    178         for (int j = 0; j < 2; j++) {
    179             half_element |= letter << j * 8;
    180             half_element_masked |= letter_masked << j * 8;
    181         }
    182 
    183         half_values[i] = half_element;
    184         half_values_acc[i] = ((i % 10) << 8) + (i % 10);
    185         half_values_masked[i] = half_element_masked;
    186 
    187         letter++;
    188         /* reset to 'A' */
    189         if (letter == 'M') {
    190             letter = 'A';
    191         }
    192 
    193         half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
    194     }
    195 }
    196 
    197 /* create byte offsets to be a diagonal of the matrix with 32 bit elements */
    198 void create_offsets_values_preds_32(void)
    199 {
    200     unsigned int word_element = 0;
    201     unsigned int word_element_masked = 0;
    202     char letter = 'A';
    203     char letter_masked = '&';
    204 
    205     for (int i = 0; i < MATRIX_SIZE; i++) {
    206         word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
    207 
    208         word_element = 0;
    209         word_element_masked = 0;
    210         for (int j = 0; j < 4; j++) {
    211             word_element |= letter << j * 8;
    212             word_element_masked |= letter_masked << j * 8;
    213         }
    214 
    215         word_values[i] = word_element;
    216         word_values_acc[i] = ((i % 10) << 8) + (i % 10);
    217         word_values_masked[i] = word_element_masked;
    218 
    219         letter++;
    220         /* reset to 'A' */
    221         if (letter == 'M') {
    222             letter = 'A';
    223         }
    224 
    225         word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
    226     }
    227 }
    228 
    229 /*
    230  * create byte offsets to be a diagonal of the matrix with 16 bit elements
    231  * and 32 bit offsets
    232  */
    233 void create_offsets_values_preds_16_32(void)
    234 {
    235     unsigned short half_element = 0;
    236     unsigned short half_element_masked = 0;
    237     char letter = 'D';
    238     char letter_masked = '$';
    239 
    240     for (int i = 0; i < MATRIX_SIZE; i++) {
    241         word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
    242 
    243         half_element = 0;
    244         half_element_masked = 0;
    245         for (int j = 0; j < 2; j++) {
    246             half_element |= letter << j * 8;
    247             half_element_masked |= letter_masked << j * 8;
    248         }
    249 
    250         half_values[i] = half_element;
    251         half_values_acc[i] = ((i % 10) << 8) + (i % 10);
    252         half_values_masked[i] = half_element_masked;
    253 
    254         letter++;
    255         /* reset to 'A' */
    256         if (letter == 'P') {
    257             letter = 'D';
    258         }
    259 
    260         half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
    261     }
    262 }
    263 
    264 /* scatter the 16 bit elements using intrinsics */
    265 void vector_scatter_16(void)
    266 {
    267     /* copy the offsets and values to vectors */
    268     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
    269     HVX_Vector values = *(HVX_Vector *)half_values;
    270 
    271     VSCATTER_16(&vtcm.vscatter16, region_len, offsets, values);
    272 
    273     sync_scatter(vtcm.vscatter16);
    274 }
    275 
    276 /* scatter-accumulate the 16 bit elements using intrinsics */
    277 void vector_scatter_16_acc(void)
    278 {
    279     /* copy the offsets and values to vectors */
    280     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
    281     HVX_Vector values = *(HVX_Vector *)half_values_acc;
    282 
    283     VSCATTER_16_ACC(&vtcm.vscatter16, region_len, offsets, values);
    284 
    285     sync_scatter(vtcm.vscatter16);
    286 }
    287 
    288 /* scatter the 16 bit elements using intrinsics */
    289 void vector_scatter_16_masked(void)
    290 {
    291     /* copy the offsets and values to vectors */
    292     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
    293     HVX_Vector values = *(HVX_Vector *)half_values_masked;
    294     HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
    295     HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
    296 
    297     VSCATTER_16_MASKED(preds, &vtcm.vscatter16, region_len, offsets, values);
    298 
    299     sync_scatter(vtcm.vscatter16);
    300 }
    301 
    302 /* scatter the 32 bit elements using intrinsics */
    303 void vector_scatter_32(void)
    304 {
    305     /* copy the offsets and values to vectors */
    306     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
    307     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    308     HVX_Vector valueslo = *(HVX_Vector *)word_values;
    309     HVX_Vector valueshi = *(HVX_Vector *)&word_values[MATRIX_SIZE / 2];
    310 
    311     VSCATTER_32(&vtcm.vscatter32, region_len, offsetslo, valueslo);
    312     VSCATTER_32(&vtcm.vscatter32, region_len, offsetshi, valueshi);
    313 
    314     sync_scatter(vtcm.vscatter32);
    315 }
    316 
    317 /* scatter-acc the 32 bit elements using intrinsics */
    318 void vector_scatter_32_acc(void)
    319 {
    320     /* copy the offsets and values to vectors */
    321     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
    322     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    323     HVX_Vector valueslo = *(HVX_Vector *)word_values_acc;
    324     HVX_Vector valueshi = *(HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
    325 
    326     VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetslo, valueslo);
    327     VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetshi, valueshi);
    328 
    329     sync_scatter(vtcm.vscatter32);
    330 }
    331 
    332 /* scatter the 32 bit elements using intrinsics */
    333 void vector_scatter_32_masked(void)
    334 {
    335     /* copy the offsets and values to vectors */
    336     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
    337     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    338     HVX_Vector valueslo = *(HVX_Vector *)word_values_masked;
    339     HVX_Vector valueshi = *(HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
    340     HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
    341     HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
    342     HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
    343     HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
    344 
    345     VSCATTER_32_MASKED(predslo, &vtcm.vscatter32, region_len, offsetslo,
    346                        valueslo);
    347     VSCATTER_32_MASKED(predshi, &vtcm.vscatter32, region_len, offsetshi,
    348                        valueshi);
    349 
    350     sync_scatter(vtcm.vscatter16);
    351 }
    352 
    353 /* scatter the 16 bit elements with 32 bit offsets using intrinsics */
    354 void vector_scatter_16_32(void)
    355 {
    356     HVX_VectorPair offsets;
    357     HVX_Vector values;
    358 
    359     /* get the word offsets in a vector pair */
    360     offsets = *(HVX_VectorPair *)word_offsets;
    361 
    362     /* these values need to be shuffled for the scatter */
    363     values = *(HVX_Vector *)half_values;
    364     values = VSHUFF_H(values);
    365 
    366     VSCATTER_16_32(&vtcm.vscatter16_32, region_len, offsets, values);
    367 
    368     sync_scatter(vtcm.vscatter16_32);
    369 }
    370 
    371 /* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */
    372 void vector_scatter_16_32_acc(void)
    373 {
    374     HVX_VectorPair offsets;
    375     HVX_Vector values;
    376 
    377     /* get the word offsets in a vector pair */
    378     offsets = *(HVX_VectorPair *)word_offsets;
    379 
    380     /* these values need to be shuffled for the scatter */
    381     values = *(HVX_Vector *)half_values_acc;
    382     values = VSHUFF_H(values);
    383 
    384     VSCATTER_16_32_ACC(&vtcm.vscatter16_32, region_len, offsets, values);
    385 
    386     sync_scatter(vtcm.vscatter16_32);
    387 }
    388 
    389 /* masked scatter the 16 bit elements with 32 bit offsets using intrinsics */
    390 void vector_scatter_16_32_masked(void)
    391 {
    392     HVX_VectorPair offsets;
    393     HVX_Vector values;
    394     HVX_Vector pred_reg;
    395 
    396     /* get the word offsets in a vector pair */
    397     offsets = *(HVX_VectorPair *)word_offsets;
    398 
    399     /* these values need to be shuffled for the scatter */
    400     values = *(HVX_Vector *)half_values_masked;
    401     values = VSHUFF_H(values);
    402 
    403     pred_reg = *(HVX_Vector *)half_predicates;
    404     pred_reg = VSHUFF_H(pred_reg);
    405     HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
    406 
    407     VSCATTER_16_32_MASKED(preds, &vtcm.vscatter16_32, region_len, offsets,
    408                           values);
    409 
    410     sync_scatter(vtcm.vscatter16_32);
    411 }
    412 
    413 /* gather the elements from the scatter16 buffer */
    414 void vector_gather_16(void)
    415 {
    416     HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
    417     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
    418 
    419     VGATHER_16(vgather, &vtcm.vscatter16, region_len, offsets);
    420 
    421     sync_gather(vgather);
    422 }
    423 
    424 static unsigned short gather_16_masked_init(void)
    425 {
    426     char letter = '?';
    427     return letter | (letter << 8);
    428 }
    429 
    430 void vector_gather_16_masked(void)
    431 {
    432     HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
    433     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
    434     HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
    435     HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
    436 
    437     *vgather = VSPLAT_H(gather_16_masked_init());
    438     VGATHER_16_MASKED(vgather, preds, &vtcm.vscatter16, region_len, offsets);
    439 
    440     sync_gather(vgather);
    441 }
    442 
    443 /* gather the elements from the scatter32 buffer */
    444 void vector_gather_32(void)
    445 {
    446     HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
    447     HVX_Vector *vgatherhi =
    448         (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
    449     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
    450     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    451 
    452     VGATHER_32(vgatherlo, &vtcm.vscatter32, region_len, offsetslo);
    453     VGATHER_32(vgatherhi, &vtcm.vscatter32, region_len, offsetshi);
    454 
    455     sync_gather(vgatherhi);
    456 }
    457 
    458 static unsigned int gather_32_masked_init(void)
    459 {
    460     char letter = '?';
    461     return letter | (letter << 8) | (letter << 16) | (letter << 24);
    462 }
    463 
    464 void vector_gather_32_masked(void)
    465 {
    466     HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
    467     HVX_Vector *vgatherhi =
    468         (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
    469     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
    470     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    471     HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
    472     HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
    473     HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
    474     HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
    475 
    476     *vgatherlo = VSPLAT_H(gather_32_masked_init());
    477     *vgatherhi = VSPLAT_H(gather_32_masked_init());
    478     VGATHER_32_MASKED(vgatherlo, predslo, &vtcm.vscatter32, region_len,
    479                       offsetslo);
    480     VGATHER_32_MASKED(vgatherhi, predshi, &vtcm.vscatter32, region_len,
    481                       offsetshi);
    482 
    483     sync_gather(vgatherlo);
    484     sync_gather(vgatherhi);
    485 }
    486 
    487 /* gather the elements from the scatter16_32 buffer */
    488 void vector_gather_16_32(void)
    489 {
    490     HVX_Vector *vgather;
    491     HVX_VectorPair offsets;
    492     HVX_Vector values;
    493 
    494     /* get the vtcm address to gather from */
    495     vgather = (HVX_Vector *)&vtcm.vgather16_32;
    496 
    497     /* get the word offsets in a vector pair */
    498     offsets = *(HVX_VectorPair *)word_offsets;
    499 
    500     VGATHER_16_32(vgather, &vtcm.vscatter16_32, region_len, offsets);
    501 
    502     /* deal the elements to get the order back */
    503     values = *(HVX_Vector *)vgather;
    504     values = VDEAL_H(values);
    505 
    506     /* write it back to vtcm address */
    507     *(HVX_Vector *)vgather = values;
    508 }
    509 
    510 void vector_gather_16_32_masked(void)
    511 {
    512     HVX_Vector *vgather;
    513     HVX_VectorPair offsets;
    514     HVX_Vector pred_reg;
    515     HVX_VectorPred preds;
    516     HVX_Vector values;
    517 
    518     /* get the vtcm address to gather from */
    519     vgather = (HVX_Vector *)&vtcm.vgather16_32;
    520 
    521     /* get the word offsets in a vector pair */
    522     offsets = *(HVX_VectorPair *)word_offsets;
    523     pred_reg = *(HVX_Vector *)half_predicates;
    524     pred_reg = VSHUFF_H(pred_reg);
    525     preds = VAND_VAL(pred_reg, ~0);
    526 
    527    *vgather = VSPLAT_H(gather_16_masked_init());
    528    VGATHER_16_32_MASKED(vgather, preds, &vtcm.vscatter16_32, region_len,
    529                         offsets);
    530 
    531     /* deal the elements to get the order back */
    532     values = *(HVX_Vector *)vgather;
    533     values = VDEAL_H(values);
    534 
    535     /* write it back to vtcm address */
    536     *(HVX_Vector *)vgather = values;
    537 }
    538 
    539 static void check_buffer(const char *name, void *c, void *r, size_t size)
    540 {
    541     char *check = (char *)c;
    542     char *ref = (char *)r;
    543     for (int i = 0; i < size; i++) {
    544         if (check[i] != ref[i]) {
    545             printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
    546                    check[i], check[i], ref[i], ref[i]);
    547             err++;
    548         }
    549     }
    550 }
    551 
    552 /*
    553  * These scalar functions are the C equivalents of the vector functions that
    554  * use HVX
    555  */
    556 
    557 /* scatter the 16 bit elements using C */
    558 void scalar_scatter_16(unsigned short *vscatter16)
    559 {
    560     for (int i = 0; i < MATRIX_SIZE; ++i) {
    561         vscatter16[half_offsets[i] / 2] = half_values[i];
    562     }
    563 }
    564 
    565 void check_scatter_16()
    566 {
    567     memset(vscatter16_ref, FILL_CHAR,
    568            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    569     scalar_scatter_16(vscatter16_ref);
    570     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
    571                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    572 }
    573 
    574 /* scatter the 16 bit elements using C */
    575 void scalar_scatter_16_acc(unsigned short *vscatter16)
    576 {
    577     for (int i = 0; i < MATRIX_SIZE; ++i) {
    578         vscatter16[half_offsets[i] / 2] += half_values_acc[i];
    579     }
    580 }
    581 
    582 void check_scatter_16_acc()
    583 {
    584     memset(vscatter16_ref, FILL_CHAR,
    585            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    586     scalar_scatter_16(vscatter16_ref);
    587     scalar_scatter_16_acc(vscatter16_ref);
    588     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
    589                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    590 }
    591 
    592 /* scatter the 16 bit elements using C */
    593 void scalar_scatter_16_masked(unsigned short *vscatter16)
    594 {
    595     for (int i = 0; i < MATRIX_SIZE; i++) {
    596         if (half_predicates[i]) {
    597             vscatter16[half_offsets[i] / 2] = half_values_masked[i];
    598         }
    599     }
    600 
    601 }
    602 
    603 void check_scatter_16_masked()
    604 {
    605     memset(vscatter16_ref, FILL_CHAR,
    606            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    607     scalar_scatter_16(vscatter16_ref);
    608     scalar_scatter_16_acc(vscatter16_ref);
    609     scalar_scatter_16_masked(vscatter16_ref);
    610     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
    611                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    612 }
    613 
    614 /* scatter the 32 bit elements using C */
    615 void scalar_scatter_32(unsigned int *vscatter32)
    616 {
    617     for (int i = 0; i < MATRIX_SIZE; ++i) {
    618         vscatter32[word_offsets[i] / 4] = word_values[i];
    619     }
    620 }
    621 
    622 void check_scatter_32()
    623 {
    624     memset(vscatter32_ref, FILL_CHAR,
    625            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    626     scalar_scatter_32(vscatter32_ref);
    627     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
    628                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    629 }
    630 
    631 /* scatter the 32 bit elements using C */
    632 void scalar_scatter_32_acc(unsigned int *vscatter32)
    633 {
    634     for (int i = 0; i < MATRIX_SIZE; ++i) {
    635         vscatter32[word_offsets[i] / 4] += word_values_acc[i];
    636     }
    637 }
    638 
    639 void check_scatter_32_acc()
    640 {
    641     memset(vscatter32_ref, FILL_CHAR,
    642            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    643     scalar_scatter_32(vscatter32_ref);
    644     scalar_scatter_32_acc(vscatter32_ref);
    645     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
    646                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    647 }
    648 
    649 /* scatter the 32 bit elements using C */
    650 void scalar_scatter_32_masked(unsigned int *vscatter32)
    651 {
    652     for (int i = 0; i < MATRIX_SIZE; i++) {
    653         if (word_predicates[i]) {
    654             vscatter32[word_offsets[i] / 4] = word_values_masked[i];
    655         }
    656     }
    657 }
    658 
    659 void check_scatter_32_masked()
    660 {
    661     memset(vscatter32_ref, FILL_CHAR,
    662            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    663     scalar_scatter_32(vscatter32_ref);
    664     scalar_scatter_32_acc(vscatter32_ref);
    665     scalar_scatter_32_masked(vscatter32_ref);
    666     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
    667                   SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    668 }
    669 
    670 /* scatter the 32 bit elements using C */
    671 void scalar_scatter_16_32(unsigned short *vscatter16_32)
    672 {
    673     for (int i = 0; i < MATRIX_SIZE; ++i) {
    674         vscatter16_32[word_offsets[i] / 2] = half_values[i];
    675     }
    676 }
    677 
    678 void check_scatter_16_32()
    679 {
    680     memset(vscatter16_32_ref, FILL_CHAR,
    681            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    682     scalar_scatter_16_32(vscatter16_32_ref);
    683     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
    684                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    685 }
    686 
    687 /* scatter the 32 bit elements using C */
    688 void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
    689 {
    690     for (int i = 0; i < MATRIX_SIZE; ++i) {
    691         vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
    692     }
    693 }
    694 
    695 void check_scatter_16_32_acc()
    696 {
    697     memset(vscatter16_32_ref, FILL_CHAR,
    698            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    699     scalar_scatter_16_32(vscatter16_32_ref);
    700     scalar_scatter_16_32_acc(vscatter16_32_ref);
    701     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
    702                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    703 }
    704 
    705 void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
    706 {
    707     for (int i = 0; i < MATRIX_SIZE; i++) {
    708         if (half_predicates[i]) {
    709             vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
    710         }
    711     }
    712 }
    713 
    714 void check_scatter_16_32_masked()
    715 {
    716     memset(vscatter16_32_ref, FILL_CHAR,
    717            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    718     scalar_scatter_16_32(vscatter16_32_ref);
    719     scalar_scatter_16_32_acc(vscatter16_32_ref);
    720     scalar_scatter_16_32_masked(vscatter16_32_ref);
    721     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
    722                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    723 }
    724 
    725 /* gather the elements from the scatter buffer using C */
    726 void scalar_gather_16(unsigned short *vgather16)
    727 {
    728     for (int i = 0; i < MATRIX_SIZE; ++i) {
    729         vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
    730     }
    731 }
    732 
    733 void check_gather_16()
    734 {
    735       memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
    736       scalar_gather_16(vgather16_ref);
    737       check_buffer(__func__, vtcm.vgather16, vgather16_ref,
    738                    MATRIX_SIZE * sizeof(unsigned short));
    739 }
    740 
    741 void scalar_gather_16_masked(unsigned short *vgather16)
    742 {
    743     for (int i = 0; i < MATRIX_SIZE; ++i) {
    744         if (half_predicates[i]) {
    745             vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
    746         }
    747     }
    748 }
    749 
    750 void check_gather_16_masked()
    751 {
    752     memset(vgather16_ref, gather_16_masked_init(),
    753            MATRIX_SIZE * sizeof(unsigned short));
    754     scalar_gather_16_masked(vgather16_ref);
    755     check_buffer(__func__, vtcm.vgather16, vgather16_ref,
    756                  MATRIX_SIZE * sizeof(unsigned short));
    757 }
    758 
    759 /* gather the elements from the scatter buffer using C */
    760 void scalar_gather_32(unsigned int *vgather32)
    761 {
    762     for (int i = 0; i < MATRIX_SIZE; ++i) {
    763         vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
    764     }
    765 }
    766 
    767 void check_gather_32(void)
    768 {
    769     memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
    770     scalar_gather_32(vgather32_ref);
    771     check_buffer(__func__, vtcm.vgather32, vgather32_ref,
    772                  MATRIX_SIZE * sizeof(unsigned int));
    773 }
    774 
    775 void scalar_gather_32_masked(unsigned int *vgather32)
    776 {
    777     for (int i = 0; i < MATRIX_SIZE; ++i) {
    778         if (word_predicates[i]) {
    779             vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
    780         }
    781     }
    782 }
    783 
    784 
    785 void check_gather_32_masked(void)
    786 {
    787     memset(vgather32_ref, gather_32_masked_init(),
    788            MATRIX_SIZE * sizeof(unsigned int));
    789     scalar_gather_32_masked(vgather32_ref);
    790     check_buffer(__func__, vtcm.vgather32,
    791                  vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
    792 }
    793 
    794 /* gather the elements from the scatter buffer using C */
    795 void scalar_gather_16_32(unsigned short *vgather16_32)
    796 {
    797     for (int i = 0; i < MATRIX_SIZE; ++i) {
    798         vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
    799     }
    800 }
    801 
    802 void check_gather_16_32(void)
    803 {
    804     memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
    805     scalar_gather_16_32(vgather16_32_ref);
    806     check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
    807                  MATRIX_SIZE * sizeof(unsigned short));
    808 }
    809 
    810 void scalar_gather_16_32_masked(unsigned short *vgather16_32)
    811 {
    812     for (int i = 0; i < MATRIX_SIZE; ++i) {
    813         if (half_predicates[i]) {
    814             vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
    815         }
    816     }
    817 
    818 }
    819 
    820 void check_gather_16_32_masked(void)
    821 {
    822     memset(vgather16_32_ref, gather_16_masked_init(),
    823            MATRIX_SIZE * sizeof(unsigned short));
    824     scalar_gather_16_32_masked(vgather16_32_ref);
    825     check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
    826                  MATRIX_SIZE * sizeof(unsigned short));
    827 }
    828 
    829 /* print scatter16 buffer */
    830 void print_scatter16_buffer(void)
    831 {
    832     if (PRINT_DATA) {
    833         printf("\n\nPrinting the 16 bit scatter buffer");
    834 
    835         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
    836             if ((i % MATRIX_SIZE) == 0) {
    837                 printf("\n");
    838             }
    839             for (int j = 0; j < 2; j++) {
    840                 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
    841             }
    842             printf(" ");
    843         }
    844         printf("\n");
    845     }
    846 }
    847 
    848 /* print the gather 16 buffer */
    849 void print_gather_result_16(void)
    850 {
    851     if (PRINT_DATA) {
    852         printf("\n\nPrinting the 16 bit gather result\n");
    853 
    854         for (int i = 0; i < MATRIX_SIZE; i++) {
    855             for (int j = 0; j < 2; j++) {
    856                 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
    857             }
    858             printf(" ");
    859         }
    860         printf("\n");
    861     }
    862 }
    863 
    864 /* print the scatter32 buffer */
    865 void print_scatter32_buffer(void)
    866 {
    867     if (PRINT_DATA) {
    868         printf("\n\nPrinting the 32 bit scatter buffer");
    869 
    870         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
    871             if ((i % MATRIX_SIZE) == 0) {
    872                 printf("\n");
    873             }
    874             for (int j = 0; j < 4; j++) {
    875                 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
    876             }
    877             printf(" ");
    878         }
    879         printf("\n");
    880     }
    881 }
    882 
    883 /* print the gather 32 buffer */
    884 void print_gather_result_32(void)
    885 {
    886     if (PRINT_DATA) {
    887         printf("\n\nPrinting the 32 bit gather result\n");
    888 
    889         for (int i = 0; i < MATRIX_SIZE; i++) {
    890             for (int j = 0; j < 4; j++) {
    891                 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
    892             }
    893             printf(" ");
    894         }
    895         printf("\n");
    896     }
    897 }
    898 
    899 /* print the scatter16_32 buffer */
    900 void print_scatter16_32_buffer(void)
    901 {
    902     if (PRINT_DATA) {
    903         printf("\n\nPrinting the 16_32 bit scatter buffer");
    904 
    905         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
    906             if ((i % MATRIX_SIZE) == 0) {
    907                 printf("\n");
    908             }
    909             for (int j = 0; j < 2; j++) {
    910                 printf("%c",
    911                       (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
    912             }
    913             printf(" ");
    914         }
    915         printf("\n");
    916     }
    917 }
    918 
    919 /* print the gather 16_32 buffer */
    920 void print_gather_result_16_32(void)
    921 {
    922     if (PRINT_DATA) {
    923         printf("\n\nPrinting the 16_32 bit gather result\n");
    924 
    925         for (int i = 0; i < MATRIX_SIZE; i++) {
    926             for (int j = 0; j < 2; j++) {
    927                 printf("%c",
    928                        (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
    929             }
    930             printf(" ");
    931         }
    932         printf("\n");
    933     }
    934 }
    935 
    936 int main()
    937 {
    938     prefill_vtcm_scratch();
    939 
    940     /* 16 bit elements with 16 bit offsets */
    941     create_offsets_values_preds_16();
    942 
    943     vector_scatter_16();
    944     print_scatter16_buffer();
    945     check_scatter_16();
    946 
    947     vector_gather_16();
    948     print_gather_result_16();
    949     check_gather_16();
    950 
    951     vector_gather_16_masked();
    952     print_gather_result_16();
    953     check_gather_16_masked();
    954 
    955     vector_scatter_16_acc();
    956     print_scatter16_buffer();
    957     check_scatter_16_acc();
    958 
    959     vector_scatter_16_masked();
    960     print_scatter16_buffer();
    961     check_scatter_16_masked();
    962 
    963     /* 32 bit elements with 32 bit offsets */
    964     create_offsets_values_preds_32();
    965 
    966     vector_scatter_32();
    967     print_scatter32_buffer();
    968     check_scatter_32();
    969 
    970     vector_gather_32();
    971     print_gather_result_32();
    972     check_gather_32();
    973 
    974     vector_gather_32_masked();
    975     print_gather_result_32();
    976     check_gather_32_masked();
    977 
    978     vector_scatter_32_acc();
    979     print_scatter32_buffer();
    980     check_scatter_32_acc();
    981 
    982     vector_scatter_32_masked();
    983     print_scatter32_buffer();
    984     check_scatter_32_masked();
    985 
    986     /* 16 bit elements with 32 bit offsets */
    987     create_offsets_values_preds_16_32();
    988 
    989     vector_scatter_16_32();
    990     print_scatter16_32_buffer();
    991     check_scatter_16_32();
    992 
    993     vector_gather_16_32();
    994     print_gather_result_16_32();
    995     check_gather_16_32();
    996 
    997     vector_gather_16_32_masked();
    998     print_gather_result_16_32();
    999     check_gather_16_32_masked();
   1000 
   1001     vector_scatter_16_32_acc();
   1002     print_scatter16_32_buffer();
   1003     check_scatter_16_32_acc();
   1004 
   1005     vector_scatter_16_32_masked();
   1006     print_scatter16_32_buffer();
   1007     check_scatter_16_32_masked();
   1008 
   1009     puts(err ? "FAIL" : "PASS");
   1010     return err;
   1011 }