qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

hvx_histogram_row.S (7359B)


      1 /*
      2  *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
      3  *
      4  *  This program is free software; you can redistribute it and/or modify
      5  *  it under the terms of the GNU General Public License as published by
      6  *  the Free Software Foundation; either version 2 of the License, or
      7  *  (at your option) any later version.
      8  *
      9  *  This program is distributed in the hope that it will be useful,
     10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12  *  GNU General Public License for more details.
     13  *
     14  *  You should have received a copy of the GNU General Public License
     15  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
     16  */
     17 
     18 
     19 /*
     20  * void hvx_histogram_row(uint8_t *src,     => r0
     21  *                        int stride,       => r1
     22  *                        int width,        => r2
     23  *                        int height,       => r3
     24  *                        int *hist         => r4)
     25  */
      26     .text
      27     .p2align 2
      28     .global hvx_histogram_row
      29     .type hvx_histogram_row, @function
         /*
          * Accumulate a 256-bin histogram of the 8-bit pixels in a
          * width x height region into hist[0..255] (32-bit bins; values are
          * added to the existing contents, not overwritten).  Uses the HVX
          * 'vhist' instruction, which bumps 16-bit per-byte counters spread
          * across the whole vector register file (v0-v31); step 3 folds those
          * counters into 256 word-sized bins.
          * Assumes a 128-byte HVX vector length (VLEN == 128) -- see the
          * lsr #7 / and #127 below.
          * Clobbers: r2, r5, r7, r10, r28, p0, p1, q0, v0-v31, HW loop regs.
          */
      30 hvx_histogram_row:
      31     { r2 = lsr(r2, #7)          /* r2 = width / VLEN (VLEN == 128) */
      32       r5 = and(r2, #127)        /* r5 = width % VLEN (reads the pre-packet r2) */
      33       v1 = #0
      34       v0 = #0
      35     }
      36     /*
      37      * Step 1: Clean the whole vector register file
      38      */
      39     { v3:2 = v1:0
      40       v5:4 = v1:0
      41       p0 = cmp.gt(r2, #0)       /* P0 = (width / VLEN > 0) */
      42       p1 = cmp.eq(r5, #0)       /* P1 = (width % VLEN == 0) */
      43     }
      44     { q0 = vsetq(r5)            /* q0: enable lanes [0, width % VLEN) for the tail */
      45       v7:6 = v1:0
      46     }
      47     { v9:8   = v1:0
      48       v11:10 = v1:0
      49     }
      50     { v13:12 = v1:0
      51       v15:14 = v1:0
      52     }
      53     { v17:16 = v1:0
      54       v19:18 = v1:0
      55     }
      56     { v21:20 = v1:0
      57       v23:22 = v1:0
      58     }
      59     { v25:24 = v1:0
      60       v27:26 = v1:0
      61     }
      62     { v29:28 = v1:0
      63       v31:30 = v1:0
      64       r10 = add(r0, r1)           /* R10 = &src[1 * stride] (start of next row) */
      65       loop1(.outerloop, r3)       /* outer loop: one iteration per row */
      66     }
      67 
      68     /*
      69      * Step 2: vhist
         * Each inner-loop pass loads one full vector of pixels; vhist then
         * increments the matching 16-bit counters scattered over v0-v31.
      70      */
      71     .falign
      72 .outerloop:
      73     { if (!p0) jump .loopend    /* row has no full-vector chunk */
      74       loop0(.innerloop, r2)     /* width / VLEN full vectors in this row */
      75     }
      76 
      77     .falign
      78 .innerloop:
      79     { v12.tmp = vmem(R0++#1)    /* load VLEN pixels; .tmp: value feeds vhist only */
      80       vhist                     /* count every byte of the loaded vector */
      81     }:endloop0
      82 
      83     .falign
      84 .loopend:
      85     if (p1) jump .skip       /* if (width % VLEN == 0) done with current row */
      86     { v13.tmp = vmem(r0 + #0)   /* load the partial tail vector */
      87       vhist(q0)                 /* count only the lanes enabled in q0 */
      88     }
      89 
      90     .falign
      91 .skip:
      92     { r0 = r10                    /* R0  = &src[(i + 1) * stride] */
      93       r10 = add(r10, r1)          /* R10 = &src[(i + 2) * stride] */
      94     }:endloop1
      95 
      96 
      97     /*
      98      * Step 3: Sum up the data
         * vhist left 16-bit counts spread across v0-v31.  For every register,
         * vshuff pairs up adjacent halfword counters and vdmpy with the
         * halfword pair {+1, +1} (r10) adds each pair into a 32-bit word.
         * Cross-register vshuff at 32- then 64-byte granularity (r28 / r7)
         * lines matching bins up so the vadds can fold four registers into
         * one, leaving eight vectors of 32 words = 256 bins, which are then
         * read-modify-written into hist[].
      99      */
     100     { v0.h = vshuff(v0.h)
     101       r10 = ##0x00010001        /* vdmpy multiplier: halfword pair {+1, +1} */
     102     }
     103     v1.h = vshuff(v1.h)
     104     { V2.h = vshuff(v2.h)
     105       v0.w = vdmpy(v0.h, r10.h):sat
     106     }
     107     { v3.h = vshuff(v3.h)
     108       v1.w = vdmpy(v1.h, r10.h):sat
     109     }
     110     { v4.h = vshuff(V4.h)
     111       v2.w = vdmpy(v2.h, r10.h):sat
     112     }
     113     { v5.h = vshuff(v5.h)
     114       v3.w = vdmpy(v3.h, r10.h):sat
     115     }
     116     { v6.h = vshuff(v6.h)
     117       v4.w = vdmpy(v4.h, r10.h):sat
     118     }
     119     { v7.h = vshuff(v7.h)
     120       v5.w = vdmpy(v5.h, r10.h):sat
     121     }
     122     { v8.h = vshuff(V8.h)
     123       v6.w = vdmpy(v6.h, r10.h):sat
     124     }
     125     { v9.h = vshuff(V9.h)
     126       v7.w = vdmpy(v7.h, r10.h):sat
     127     }
     128     { v10.h = vshuff(v10.h)
     129       v8.w = vdmpy(v8.h, r10.h):sat
     130     }
     131     { v11.h = vshuff(v11.h)
     132       v9.w = vdmpy(v9.h, r10.h):sat
     133     }
     134     { v12.h = vshuff(v12.h)
     135       v10.w = vdmpy(v10.h, r10.h):sat
     136     }
     137     { v13.h = vshuff(V13.h)
     138       v11.w = vdmpy(v11.h, r10.h):sat
     139     }
     140     { v14.h = vshuff(v14.h)
     141       v12.w = vdmpy(v12.h, r10.h):sat
     142     }
     143     { v15.h = vshuff(v15.h)
     144       v13.w = vdmpy(v13.h, r10.h):sat
     145     }
     146     { v16.h = vshuff(v16.h)
     147       v14.w = vdmpy(v14.h, r10.h):sat
     148     }
     149     { v17.h = vshuff(v17.h)
     150       v15.w = vdmpy(v15.h, r10.h):sat
     151     }
     152     { v18.h = vshuff(v18.h)
     153       v16.w = vdmpy(v16.h, r10.h):sat
     154     }
     155     { v19.h = vshuff(v19.h)
     156       v17.w = vdmpy(v17.h, r10.h):sat
     157     }
     158     { v20.h = vshuff(v20.h)
     159       v18.W = vdmpy(v18.h, r10.h):sat
     160     }
     161     { v21.h = vshuff(v21.h)
     162       v19.w = vdmpy(v19.h, r10.h):sat
     163     }
     164     { v22.h = vshuff(v22.h)
     165       v20.w = vdmpy(v20.h, r10.h):sat
     166     }
     167     { v23.h = vshuff(v23.h)
     168       v21.w = vdmpy(v21.h, r10.h):sat
     169     }
     170     { v24.h = vshuff(v24.h)
     171       v22.w = vdmpy(v22.h, r10.h):sat
     172     }
     173     { v25.h = vshuff(v25.h)
     174       v23.w = vdmpy(v23.h, r10.h):sat
     175     }
     176     { v26.h = vshuff(v26.h)
     177       v24.w = vdmpy(v24.h, r10.h):sat
     178     }
     179     { v27.h = vshuff(V27.h)
     180       v25.w = vdmpy(v25.h, r10.h):sat
     181     }
     182     { v28.h = vshuff(v28.h)
     183       v26.w = vdmpy(v26.h, r10.h):sat
     184     }
     185     { v29.h = vshuff(v29.h)
     186       v27.w = vdmpy(v27.h, r10.h):sat
     187     }
     188     { v30.h = vshuff(v30.h)
     189       v28.w = vdmpy(v28.h, r10.h):sat
     190     }
     191     { v31.h = vshuff(v31.h)
     192       v29.w = vdmpy(v29.h, r10.h):sat
     193       r28 = #32                 /* cross-register vshuff granularity: 32 bytes */
     194     }
     195     { vshuff(v1, v0, r28)
     196       v30.w = vdmpy(v30.h, r10.h):sat
     197     }
     198     { vshuff(v3, v2, r28)
     199       v31.w = vdmpy(v31.h, r10.h):sat
     200     }
     201     { vshuff(v5, v4, r28)
     202       v0.w = vadd(v1.w, v0.w)
     203       v2.w = vadd(v3.w, v2.w)
     204     }
     205     { vshuff(v7, v6, r28)
     206       r7 = #64                  /* cross-register vshuff granularity: 64 bytes */
     207     }
     208     { vshuff(v9, v8, r28)
     209       v4.w = vadd(v5.w, v4.w)
     210       v6.w = vadd(v7.w, v6.w)
     211     }
     212     vshuff(v11, v10, r28)
     213     { vshuff(v13, v12, r28)
     214       v8.w = vadd(v9.w, v8.w)
     215       v10.w = vadd(v11.w, v10.w)
     216     }
     217     vshuff(v15, v14, r28)
     218     { vshuff(v17, v16, r28)
     219       v12.w = vadd(v13.w, v12.w)
     220       v14.w = vadd(v15.w, v14.w)
     221     }
     222     vshuff(v19, v18, r28)
     223     { vshuff(v21, v20, r28)
     224       v16.w = vadd(v17.w, v16.w)
     225       v18.w = vadd(v19.w, v18.w)
     226     }
     227     vshuff(v23, v22, r28)
     228     { vshuff(v25, v24, r28)
     229       v20.w = vadd(v21.w, v20.w)
     230       v22.w = vadd(v23.w, v22.w)
     231     }
     232     vshuff(v27, v26, r28)
     233     { vshuff(v29, v28, r28)
     234       v24.w = vadd(v25.w, v24.w)
     235       v26.w = vadd(v27.w, v26.w)
     236     }
     237     vshuff(v31, v30, r28)
     238     { v28.w = vadd(v29.w, v28.w)
     239       vshuff(v2, v0, r7)
     240     }
     241     { v30.w = vadd(v31.w, v30.w)
     242       vshuff(v6, v4, r7)
     243       v0.w  = vadd(v0.w, v2.w)
     244     }
     245     { vshuff(v10, v8, r7)
     246       v1.tmp = vmem(r4 + #0)      /* update hist[0-31] */
     247       v0.w  = vadd(v0.w, v1.w)
     248       vmem(r4++#1) = v0.new
     249     }
     250     { vshuff(v14, v12, r7)
     251       v4.w  = vadd(v4.w, v6.w)
     252       v8.w  = vadd(v8.w, v10.w)
     253     }
     254     { vshuff(v18, v16, r7)
     255       v1.tmp = vmem(r4 + #0)      /* update hist[32-63] */
     256       v4.w  = vadd(v4.w, v1.w)
     257       vmem(r4++#1) = v4.new
     258     }
     259     { vshuff(v22, v20, r7)
     260       v12.w = vadd(v12.w, v14.w)
     261       V16.w = vadd(v16.w, v18.w)
     262     }
     263     { vshuff(v26, v24, r7)
     264       v1.tmp = vmem(r4 + #0)      /* update hist[64-95] */
     265       v8.w  = vadd(v8.w, v1.w)
     266       vmem(r4++#1) = v8.new
     267     }
     268     { vshuff(v30, v28, r7)
     269       v1.tmp = vmem(r4 + #0)      /* update hist[96-127] */
     270       v12.w  = vadd(v12.w, v1.w)
     271       vmem(r4++#1) = v12.new
     272     }
     273 
     274     { v20.w = vadd(v20.w, v22.w)
     275       v1.tmp = vmem(r4 + #0)      /* update hist[128-159] */
     276       v16.w  = vadd(v16.w, v1.w)
     277       vmem(r4++#1) = v16.new
     278     }
     279     { v24.w = vadd(v24.w, v26.w)
     280       v1.tmp = vmem(r4 + #0)      /* update hist[160-191] */
     281       v20.w  = vadd(v20.w, v1.w)
     282       vmem(r4++#1) = v20.new
     283     }
     284     { v28.w = vadd(v28.w, v30.w)
     285       v1.tmp = vmem(r4 + #0)      /* update hist[192-223] */
     286       v24.w  = vadd(v24.w, v1.w)
     287       vmem(r4++#1) = v24.new
     288     }
     289     { v1.tmp = vmem(r4 + #0)      /* update hist[224-255] */
     290       v28.w  = vadd(v28.w, v1.w)
     291       vmem(r4++#1) = v28.new
     292     }
     293     jumpr r31                     /* return */
     294     .size hvx_histogram_row, .-hvx_histogram_row