ext.idef (124464B)
1 /* 2 * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 */ 17 18 /****************************************************************************** 19 * 20 * HOYA: MULTI MEDIA INSTRUCITONS 21 * 22 ******************************************************************************/ 23 24 #ifndef EXTINSN 25 #define EXTINSN Q6INSN 26 #define __SELF_DEF_EXTINSN 1 27 #endif 28 29 #ifndef NO_MMVEC 30 31 #define DO_FOR_EACH_CODE(WIDTH, CODE) \ 32 { \ 33 fHIDE(int i;) \ 34 fVFOREACH(WIDTH, i) {\ 35 CODE ;\ 36 } \ 37 } 38 39 40 41 42 #define ITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 43 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), \ 44 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 45 46 47 48 #define ITERATOR_INSN2_ANY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 49 ITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 50 51 #define ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 52 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), \ 53 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 54 55 56 #define ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 57 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE) 58 59 60 #define ITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 61 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \ 62 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 63 64 65 66 #define ITERATOR_INSN_SHIFT_SLOT_VV_LATE(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 67 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \ 68 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 69 70 #define ITERATOR_INSN2_SHIFT_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 71 ITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 72 73 #define ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 74 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), \ 75 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 76 77 #define ITERATOR_INSN2_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 78 ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 79 80 #define ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 81 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 82 83 84 #define ITERATOR_INSN2_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 85 ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX2,DESCR,CODE) 86 87 #define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 88 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 89 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 90 91 #define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 92 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 93 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 94 95 #define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 96 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE) 97 98 #define ITERATOR_INSN_MPY_SLOT(WIDTH,TAG, SYNTAX,DESCR,CODE) \ 99 EXTINSN(V6_##TAG, SYNTAX, \ 100 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 101 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 102 103 #define ITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,DESCR,CODE) \ 104 EXTINSN(V6_##TAG, SYNTAX, \ 105 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 106 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 107 108 #define ITERATOR_INSN2_MPY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 109 ITERATOR_INSN_MPY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 110 111 #define ITERATOR_INSN2_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,SYNTAX2,DESCR,CODE) \ 112 ITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX2,DESCR,CODE) 113 114 115 #define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 116 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \ 117 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 118 119 #define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 120 ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE) 121 122 123 124 125 #define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC2(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 126 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_CVI_VX_VSRC0_IS_DST), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 127 128 #define ITERATOR_INSN_SLOT2_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 129 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_RESTRICT_SLOT2ONLY), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 130 131 #define ITERATOR_INSN_VHISTLIKE(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 132 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), \ 133 DESCR, fHIDE(mmvector_t input;) input = fTMPVDATA(); DO_FOR_EACH_CODE(WIDTH, CODE)) 134 135 136 137 138 139 /****************************************************************************************** 140 * 141 * MMVECTOR MEMORY OPERATIONS - NO NAPALI V1 142 * 143 *******************************************************************************************/ 144 145 146 147 #define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 148 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \ 149 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 150 151 #define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 152 ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 153 154 155 156 #define ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 157 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \ 158 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 159 160 #define ITERATOR_INSN2_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 161 ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 162 163 164 #define ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 165 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), \ 166 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 167 168 #define ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 169 ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 170 171 172 #define ITERATOR_INSN_MPY_SLOT_NOV1(WIDTH,TAG, SYNTAX,DESCR,CODE) \ 173 EXTINSN(V6_##TAG, SYNTAX, \ 174 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 175 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 176 177 #define ITERATOR_INSN_PERMUTE_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 178 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), \ 179 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 180 181 #define ITERATOR_INSN2_PERMUTE_SLOTT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 182 ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 183 184 #define ITERATOR_INSN_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 185 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 186 187 188 #define ITERATOR_INSN2_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 189 ITERATOR_INSN_PERMUTE_SLOT_DEP_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 190 191 #define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 192 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 193 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 194 195 #define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 196 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 197 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 198 199 #define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 200 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 201 202 #define NARROWING_SHIFT_NOV1(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \ 203 ITERATOR_INSN_SHIFT_SLOT_NOV1(ITERSIZE,TAG, \ 204 "Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \ 205 "Vector shift right and shuffle", \ 206 fHIDE(int )shamt = RtV & SHAMTMASK; \ 207 DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \ 208 DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt))) 209 210 #define MMVEC_AVGS_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 211 ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 212 ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGSRND( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 213 ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vnavg##TYPE, "Vd32=vnavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")", "Vector Negative Average "DESCR, VdV.DEST[i] = fVNAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) 214 215 #define MMVEC_AVGU_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 216 ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGU( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 217 ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGURND(WIDTH, VuV.SRC[i], VvV.SRC[i])) 218 219 220 221 /****************************************************************************************** 222 * 223 * MMVECTOR MEMORY OPERATIONS 224 * 225 *******************************************************************************************/ 226 227 #define MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,BEH) \ 228 EXTINSN(V6_##TAG##_pi, SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_I(RxV,VEC_SCALE(siV)); }) \ 229 EXTINSN(V6_##TAG##_ai, SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_RI(RtV,VEC_SCALE(siV)); BEH;}) \ 230 EXTINSN(V6_##TAG##_ppu, SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_M(RxV,MuV); }) \ 231 232 233 #define MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 234 EXTINSN(V6_##TAG##_pred_pi, "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \ 235 EXTINSN(V6_##TAG##_pred_ai, "if (" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \ 236 EXTINSN(V6_##TAG##_pred_ppu, "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}}) \ 237 238 #define MMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 239 EXTINSN(V6_##TAG##_npred_pi, "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \ 240 EXTINSN(V6_##TAG##_npred_ai, "if (!" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR, { if (fLSBOLDNOT(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \ 241 EXTINSN(V6_##TAG##_npred_ppu, "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}}) 242 243 #define MMVEC_COND_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 244 MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 245 MMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) 246 247 248 #define VEC_SCALE(X) X*fVECSIZE() 249 250 251 #define MMVEC_LD(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmem","",fLOADMMV(EA,VdV)) 252 #define MMVEC_LDC(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_cur,DESCR,ATTRIB,NT,"Vd32.cur=vmem","",fLOADMMV(EA,VdV)) 253 #define MMVEC_LDT(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_tmp,DESCR,ATTRIB,NT,"Vd32.tmp=vmem","",fLOADMMV(EA,VdV)) 254 #define MMVEC_LDU(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmemu","",fLOADMMVU(EA,VdV)) 255 256 257 #define MMVEC_STQ(TAG,DESCR,ATTRIB,NT) \ 258 MMVEC_EACH_EA(TAG##_qpred,DESCR,ATTRIB,NT,"if (Qv4) vmem","=Vs32",fSTOREMMVQ(EA,VsV,QvV)) \ 259 MMVEC_EACH_EA(TAG##_nqpred,DESCR,ATTRIB,NT,"if (!Qv4) vmem","=Vs32",fSTOREMMVNQ(EA,VsV,QvV)) 260 261 /**************************************************************** 262 * MAPPING FOR VMEMs 263 ****************************************************************/ 264 265 #define ATTR_VMEM A_EXTENSION,A_CVI,A_CVI_VM 266 #define ATTR_VMEMU A_EXTENSION,A_CVI,A_CVI_VM,A_CVI_VP 267 268 269 MMVEC_LD(vL32b, "Aligned Vector Load", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),) 270 MMVEC_LDC(vL32b, "Aligned Vector Load Cur", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_NEW,A_CVI_VA),) 271 MMVEC_LDT(vL32b, "Aligned Vector Load Tmp", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),) 272 273 MMVEC_COND_EACH_EA(vL32b,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),,"Vd32=vmem",,Pv,fLOADMMV(EA,VdV);) 274 MMVEC_COND_EACH_EA(vL32b_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",,Pv,fLOADMMV(EA,VdV);) 275 MMVEC_COND_EACH_EA(vL32b_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),,"Vd32.tmp=vmem",,Pv,fLOADMMV(EA,VdV);) 276 277 MMVEC_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",fSTOREMMV(EA,VsV)) 278 MMVEC_COND_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",Pv,fSTOREMMV(EA,VsV)) 279 280 281 MMVEC_STQ(vS32b, "Aligned Vector Store", ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),) 282 283 MMVEC_LDU(vL32Ub, "Unaligned Vector Load", ATTRIBS(ATTR_VMEMU,A_LOAD,A_RESTRICT_NOSLOT1),) 284 285 MMVEC_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",fSTOREMMVU(EA,VsV)) 286 287 MMVEC_COND_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",Pv,fSTOREMMVU(EA,VsV)) 288 289 MMVEC_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN))) 290 291 // V65 store relase, zero byte store 292 MMVEC_EACH_EA(vS32b_srls,"Aligned Vector Scatter Release",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_SCATTER_RELEASE,A_CVI_NEW,A_RESTRICT_SLOT0ONLY),,"vmem",":scatter_release",fSTORERELEASE(EA,0)) 293 294 295 296 MMVEC_COND_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN))) 297 298 299 /****************************************************************************************** 300 * 301 * MMVECTOR MEMORY OPERATIONS - NON TEMPORAL 302 * 303 *******************************************************************************************/ 304 305 #define ATTR_VMEM_NT A_EXTENSION,A_CVI,A_CVI_VM 306 307 MMVEC_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",fSTOREMMV(EA,VsV)) 308 MMVEC_COND_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",Pv,fSTOREMMV(EA,VsV)) 309 310 MMVEC_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN))) 311 MMVEC_COND_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN))) 312 313 314 MMVEC_STQ(vS32b_nt, "Aligned Vector Store - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt") 315 316 MMVEC_LD(vL32b_nt, "Aligned Vector Load - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_VA),":nt") 317 MMVEC_LDC(vL32b_nt, "Aligned Vector Load Cur - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_NEW,A_CVI_VA),":nt") 318 MMVEC_LDT(vL32b_nt, "Aligned Vector Load Tmp - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_TMP),":nt") 319 320 MMVEC_COND_EACH_EA(vL32b_nt,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA),,"Vd32=vmem",":nt",Pv,fLOADMMV(EA,VdV);) 321 MMVEC_COND_EACH_EA(vL32b_nt_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",":nt",Pv,fLOADMMV(EA,VdV);) 322 MMVEC_COND_EACH_EA(vL32b_nt_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM_NT,A_CVI_TMP),,"Vd32.tmp=vmem",":nt",Pv,fLOADMMV(EA,VdV);) 323 324 325 #undef VEC_SCALE 326 327 328 /*************************************************** 329 * Vector Alignment 330 ************************************************/ 331 332 #define VALIGNB(SHIFT) \ 333 fHIDE(int i;) \ 334 for(i = 0; i < fVBYTES(); i++) {\ 335 VdV.ub[i] = (i+SHIFT>=fVBYTES()) ? VuV.ub[i+SHIFT-fVBYTES()] : VvV.ub[i+SHIFT];\ 336 } 337 338 EXTINSN(V6_valignb, "Vd32=valign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control", 339 { 340 unsigned shift = RtV & (fVBYTES()-1); 341 VALIGNB(shift) 342 }) 343 EXTINSN(V6_vlalignb, "Vd32=vlalign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control", 344 { 345 unsigned shift = fVBYTES() - (RtV & (fVBYTES()-1)); 346 VALIGNB(shift) 347 }) 348 EXTINSN(V6_valignbi, "Vd32=valign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control", 349 { 350 VALIGNB(uiV) 351 }) 352 EXTINSN(V6_vlalignbi,"Vd32=vlalign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control", 353 { 354 unsigned shift = fVBYTES() - uiV; 355 VALIGNB(shift) 356 }) 357 358 EXTINSN(V6_vror, "Vd32=vror(Vu32,Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 359 "Align Two vectors by Rt32 as control", 360 { 361 fHIDE(int k;) 362 for (k=0;k<fVBYTES();k++) { 363 VdV.ub[k] = VuV.ub[(k+RtV)&(fVBYTES()-1)]; 364 } 365 }) 366 367 368 369 370 371 372 373 /************************************************************** 374 * Unpack elements with zero/sign extend and cross lane permute 375 ***************************************************************/ 376 377 ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackub, "Vdd32=vunpackub(Vu32)", "Vdd32.uh=vunpack(Vu32.ub)", "Unpack byte with zero-extend", fVARRAY_ELEMENT_ACCESS(VddV, uh, i) = fZE8_16( VuV.ub[i])) 378 ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackb, "Vdd32=vunpackb(Vu32)", "Vdd32.h=vunpack(Vu32.b)", "Unpack bytes with sign-extend", fVARRAY_ELEMENT_ACCESS(VddV, h, i) = fSE8_16( VuV.b[i] )) 379 ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackuh, "Vdd32=vunpackuh(Vu32)", "Vdd32.uw=vunpack(Vu32.uh)", "Unpack halves with zero-extend", fVARRAY_ELEMENT_ACCESS(VddV, uw, i) = fZE16_32(VuV.uh[i])) 380 ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackh, "Vdd32=vunpackh(Vu32)", "Vdd32.w=vunpack(Vu32.h)", "Unpack halves with sign-extend", fVARRAY_ELEMENT_ACCESS(VddV, w, i) = fSE16_32(VuV.h[i] )) 381 382 ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8, vunpackob, "Vxx32|=vunpackob(Vu32)", "Vxx32.h|=vunpacko(Vu32.b)", "Unpack byte to odd bytes ", fVARRAY_ELEMENT_ACCESS(VxxV, uh, i) |= fZE8_16( VuV.ub[i])<<8) 383 ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackoh, "Vxx32|=vunpackoh(Vu32)", "Vxx32.w|=vunpacko(Vu32.h)", "Unpack halves to odd halves", fVARRAY_ELEMENT_ACCESS(VxxV, uw, i) |= fZE16_32(VuV.uh[i])<<16) 384 385 386 /************************************************************** 387 * Pack elements and cross lane permute 388 ***************************************************************/ 389 390 ITERATOR_INSN2_PERMUTE_SLOT(16, vpackeb, "Vd32=vpackeb(Vu32,Vv32)", "Vd32.b=vpacke(Vu32.h,Vv32.h)", 391 "Pack bytes", 392 VdV.ub[i] = fGETUBYTE(0, VvV.uh[i]); 393 VdV.ub[i+fVELEM(16)] = fGETUBYTE(0, VuV.uh[i])) 394 395 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackeh, "Vd32=vpackeh(Vu32,Vv32)", "Vd32.h=vpacke(Vu32.w,Vv32.w)", 396 "Pack halfwords", 397 VdV.uh[i] = fGETUHALF(0, VvV.uw[i]); 398 VdV.uh[i+fVELEM(32)] = fGETUHALF(0, VuV.uw[i])) 399 400 ITERATOR_INSN2_PERMUTE_SLOT(16, vpackob, "Vd32=vpackob(Vu32,Vv32)", "Vd32.b=vpacko(Vu32.h,Vv32.h)", 401 "Pack bytes", 402 VdV.ub[i] = fGETUBYTE(1, VvV.uh[i]); 403 VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i])) 404 405 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackoh, "Vd32=vpackoh(Vu32,Vv32)", "Vd32.h=vpacko(Vu32.w,Vv32.w)", 406 "Pack halfwords", 407 VdV.uh[i] = fGETUHALF(1, VvV.uw[i]); 408 VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i])) 409 410 411 412 ITERATOR_INSN2_PERMUTE_SLOT(16, vpackhub_sat, "Vd32=vpackhub(Vu32,Vv32):sat", "Vd32.ub=vpack(Vu32.h,Vv32.h):sat", 413 "Pack ubytes with saturation", 414 VdV.ub[i] = fVSATUB(VvV.h[i]); 415 VdV.ub[i+fVELEM(16)] = fVSATUB(VuV.h[i])) 416 417 418 ITERATOR_INSN2_PERMUTE_SLOT(16, vpackhb_sat, "Vd32=vpackhb(Vu32,Vv32):sat", "Vd32.b=vpack(Vu32.h,Vv32.h):sat", 419 "Pack bytes with saturation", 420 VdV.b[i] = fVSATB(VvV.h[i]); 421 VdV.b[i+fVELEM(16)] = fVSATB(VuV.h[i])) 422 423 424 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackwuh_sat, "Vd32=vpackwuh(Vu32,Vv32):sat", "Vd32.uh=vpack(Vu32.w,Vv32.w):sat", 425 "Pack ubytes with saturation", 426 VdV.uh[i] = fVSATUH(VvV.w[i]); 427 VdV.uh[i+fVELEM(32)] = fVSATUH(VuV.w[i])) 428 429 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackwh_sat, "Vd32=vpackwh(Vu32,Vv32):sat", "Vd32.h=vpack(Vu32.w,Vv32.w):sat", 430 "Pack bytes with saturation", 431 VdV.h[i] = fVSATH(VvV.w[i]); 432 VdV.h[i+fVELEM(32)] = fVSATH(VuV.w[i])) 433 434 435 436 437 438 /************************************************************** 439 * Zero/Sign Extend with in-lane permute 440 ***************************************************************/ 441 442 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vzb,"Vdd32=vzxtb(Vu32)","Vdd32.uh=vzxt(Vu32.ub)", 443 "Vector Zero Extend Bytes", 444 VddV.v[0].uh[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])); 445 VddV.v[1].uh[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i]))) 446 447 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vsb,"Vdd32=vsxtb(Vu32)","Vdd32.h=vsxt(Vu32.b)", 448 "Vector Sign Extend Bytes", 449 VddV.v[0].h[i] = fSE8_16(fGETBYTE(0, VuV.h[i])); 450 VddV.v[1].h[i] = fSE8_16(fGETBYTE(1, VuV.h[i]))) 451 452 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vzh,"Vdd32=vzxth(Vu32)","Vdd32.uw=vzxt(Vu32.uh)", 453 "Vector Zero Extend halfwords", 454 VddV.v[0].uw[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])); 455 VddV.v[1].uw[i] = fZE16_32(fGETUHALF(1, VuV.uw[i]))) 456 457 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vsh,"Vdd32=vsxth(Vu32)","Vdd32.w=vsxt(Vu32.h)", 458 "Vector Sign Extend halfwords", 459 VddV.v[0].w[i] = fSE16_32(fGETHALF(0, VuV.w[i])); 460 VddV.v[1].w[i] = fSE16_32(fGETHALF(1, VuV.w[i]))) 461 462 463 /********************************************************************** 464 * 465 * 466 * 467 * MMVECTOR REDUCTION 468 * 469 * 470 * 471 **********************************************************************/ 472 473 /******************************************** 474 * 2-WAY REDUCTION - UNSIGNED BYTE BY BYTE 475 ********************************************/ 476 477 478 ITERATOR_INSN2_MPY_SLOT(16,vdmpybus,"Vd32=vdmpybus(Vu32,Rt32)","Vd32.h=vdmpy(Vu32.ub,Rt32.b)", 479 "Vector Dual Multiply-Accumulates unsigned bytes by bytes", 480 VdV.h[i] = fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV)); 481 VdV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 482 483 ITERATOR_INSN2_MPY_SLOT(16,vdmpybus_acc,"Vx32+=vdmpybus(Vu32,Rt32)","Vx32.h+=vdmpy(Vu32.ub,Rt32.b)", 484 "Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate", 485 VxV.h[i] += fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV)); 486 VxV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 487 488 489 490 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv,"Vdd32=vdmpybus(Vuu32,Rt32)","Vdd32.h=vdmpy(Vuu32.ub,Rt32.b)", 491 "Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate Sliding Window Reduction", 492 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV)); 493 VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV)); 494 495 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV)); 496 VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV))) 497 498 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv_acc,"Vxx32+=vdmpybus(Vuu32,Rt32)","Vxx32.h+=vdmpy(Vuu32.ub,Rt32.b)", 499 "Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate Sliding Window Reduction", 500 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV)); 501 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV)); 502 503 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV)); 504 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV))) 505 506 507 508 /******************************************** 509 * 2-WAY REDUCTION - HALF BY BYTE 510 ********************************************/ 511 ITERATOR_INSN2_MPY_SLOT(32,vdmpyhb,"Vd32=vdmpyhb(Vu32,Rt32)","Vd32.w=vdmpy(Vu32.h,Rt32.b)", 512 "Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap", 513 VdV.w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV)); 514 VdV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV))) 515 516 ITERATOR_INSN2_MPY_SLOT(32,vdmpyhb_acc,"Vx32+=vdmpyhb(Vu32,Rt32)","Vx32.w+=vdmpy(Vu32.h,Rt32.b)", 517 "Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap", 518 VxV.w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV)); 519 VxV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV))) 520 521 522 523 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv,"Vdd32=vdmpyhb(Vuu32,Rt32)","Vdd32.w=vdmpy(Vuu32.h,Rt32.b)", 524 "Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap", 525 VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV)); 526 VddV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV)); 527 528 VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV)); 529 VddV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV))) 530 531 532 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv_acc,"Vxx32+=vdmpyhb(Vuu32,Rt32)","Vxx32.w+=vdmpy(Vuu32.h,Rt32.b)", 533 "Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap", 534 VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV)); 535 VxxV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV)); 536 537 VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV)); 538 VxxV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV))) 539 540 541 542 543 544 /******************************************** 545 * 2-WAY REDUCTION - HALF BY HALF 546 ********************************************/ 547 548 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat,"Vd32=vdmpyh(Vu32,Vv32):sat","Vd32.w=vdmpy(Vu32.h,Vv32.h):sat", 549 "Vector halfword multiply, accumulate pairs, sat to word", 550 fHIDE(size8s_t accum;) 551 accum = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i])); 552 accum += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i])); 553 VdV.w[i] = fVSATW(accum)) 554 555 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat_acc,"Vx32+=vdmpyh(Vu32,Vv32):sat","Vx32.w+=vdmpy(Vu32.h,Vv32.h):sat", 556 "Vector halfword multiply, accumulate pairs, sat to word", 557 fHIDE(size8s_t accum;) 558 accum = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i])); 559 accum += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i])); 560 VxV.w[i] = fVSATW(VxV.w[i]+accum)) 561 562 563 /* VDMPYH */ 564 565 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat,"Vd32=vdmpyh(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.h):sat", 566 "Vector halfword multiply, accumulate pairs, saturate to word", 567 fHIDE(size8s_t accum;) 568 accum = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV)); 569 accum += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV)); 570 VdV.w[i] = fVSATW(accum)) 571 572 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat_acc,"Vx32+=vdmpyh(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.h):sat", 573 "Vector halfword multiply, accumulate pairs, saturate to word", 574 fHIDE(size8s_t) accum = VxV.w[i]; 575 accum += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV)); 576 accum += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV)); 577 VxV.w[i] = fVSATW(accum)) 578 579 580 581 582 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat,"Vd32=vdmpyh(Vuu32,Rt32):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.h):sat", 583 "Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation", 584 fHIDE(size8s_t accum;) 585 accum = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV)); 586 accum += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV)); 587 VdV.w[i] = fVSATW(accum)) 588 589 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat_acc,"Vx32+=vdmpyh(Vuu32,Rt32):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.h):sat", 590 "Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation", 591 fHIDE(size8s_t) accum = VxV.w[i]; 592 accum += fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV)); 593 accum += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV)); 594 VxV.w[i] = fVSATW(accum)) 595 596 597 598 599 600 601 602 /* VDMPYHSU */ 603 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat,"Vd32=vdmpyhsu(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.uh):sat", 604 "Vector halfword multiply, accumulate pairs, saturate to word", 605 fHIDE(size8s_t accum;) 606 accum = fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV)); 607 accum += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV)); 608 VdV.w[i] = fVSATW(accum)) 609 610 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat_acc,"Vx32+=vdmpyhsu(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.uh):sat", 611 "Vector halfword multiply, accumulate pairs, saturate to word", 612 fHIDE(size8s_t) accum=VxV.w[i]; 613 accum += fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV)); 614 accum += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV)); 615 VxV.w[i] = fVSATW(accum)) 616 617 618 619 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat,"Vd32=vdmpyhsu(Vuu32,Rt32,#1):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.uh,#1):sat", 620 "Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation", 621 fHIDE(size8s_t accum;) 622 accum = fMPY16SU(fGETHALF(1,VuuV.v[0].w[i]),fGETUHALF(0,RtV)); 623 accum += fMPY16SU(fGETHALF(0,VuuV.v[1].w[i]),fGETUHALF(1,RtV)); 624 VdV.w[i] = fVSATW(accum)) 625 626 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat_acc,"Vx32+=vdmpyhsu(Vuu32,Rt32,#1):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.uh,#1):sat", 627 "Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation", 628 fHIDE(size8s_t) accum=VxV.w[i]; 629 accum += fMPY16SU(fGETHALF(1, VuuV.v[0].w[i]),fGETUHALF(0,RtV)); 630 accum += fMPY16SU(fGETHALF(0, VuuV.v[1].w[i]),fGETUHALF(1,RtV)); 631 VxV.w[i] = fVSATW(accum)) 632 633 634 635 /******************************************** 636 * 3-WAY REDUCTION - UNSIGNED BYTE BY BYTE 637 ********************************************/ 638 639 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb, "Vdd32=vtmpyb(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.b,Rt32.b)", 640 "Dual Vector 3x1 Reduction", 641 VddV.v[0].h[i] = fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV)); 642 VddV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV)); 643 VddV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]); 644 645 VddV.v[1].h[i] = fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV)); 646 VddV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV)); 647 VddV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i])) 648 649 650 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb_acc, "Vxx32+=vtmpyb(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.b,Rt32.b)", 651 "Dual Vector 3x1 Reduction", 652 VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV)); 653 VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV)); 654 VxxV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]); 655 656 VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV)); 657 VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV)); 658 VxxV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i])) 659 660 661 662 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus, "Vdd32=vtmpybus(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.ub,Rt32.b)", 663 "Dual Vector 3x1 Reduction", 664 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV)); 665 VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV)); 666 VddV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]); 667 668 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV)); 669 VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV)); 670 VddV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i])) 671 672 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus_acc, "Vxx32+=vtmpybus(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.ub,Rt32.b)", 673 "Dual Vector 3x1 Reduction", 674 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV)); 675 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV)); 676 VxxV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]); 677 678 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV)); 679 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV)); 680 VxxV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i])) 681 682 683 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb, "Vdd32=vtmpyhb(Vuu32,Rt32)", "Vdd32.w=vtmpy(Vuu32.h,Rt32.b)", 684 "Dual Vector 3x1 Reduction", 685 VddV.v[0].w[i] = fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV))); 686 VddV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV))); 687 VddV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]); 688 689 VddV.v[1].w[i] = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV))); 690 VddV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV))); 691 VddV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i])) 692 693 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb_acc, "Vxx32+=vtmpyhb(Vuu32,Rt32)", "Vxx32.w+=vtmpy(Vuu32.h,Rt32.b)", 694 "Dual Vector 3x1 Reduction", 695 VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV))); 696 VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV))); 697 VxxV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]); 698 699 VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV))); 700 VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV))); 701 VxxV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i])) 702 703 704 /******************************************** 705 * 4-WAY REDUCTION - UNSIGNED BYTE BY UNSIGNED BYTE 706 ********************************************/ 707 708 709 710 ITERATOR_INSN2_MPY_SLOT(32,vrmpyub,"Vd32=vrmpyub(Vu32,Rt32)","Vd32.uw=vrmpy(Vu32.ub,Rt32.ub)", 711 "Vector Multiply-Accumulate Reduce with 4 byte coefficients", 712 VdV.uw[i] = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV)); 713 VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV)); 714 VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV)); 715 VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV))) 716 717 ITERATOR_INSN2_MPY_SLOT(32,vrmpyub_acc,"Vx32+=vrmpyub(Vu32,Rt32)","Vx32.uw+=vrmpy(Vu32.ub,Rt32.ub)", 718 "Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate", 719 VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV)); 720 VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV)); 721 VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV)); 722 VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV))) 723 724 725 ITERATOR_INSN2_MPY_SLOT(32,vrmpyubv,"Vd32=vrmpyub(Vu32,Vv32)","Vd32.uw=vrmpy(Vu32.ub,Vv32.ub)", 726 "Vector Multiply-Accumulate Reduce with 4 byte coefficients", 727 VdV.uw[i] = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i])); 728 VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i])); 729 VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i])); 730 VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i]))) 731 732 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubv_acc,"Vx32+=vrmpyub(Vu32,Vv32)","Vx32.uw+=vrmpy(Vu32.ub,Vv32.ub)", 733 "Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate", 734 VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i])); 735 VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i])); 736 VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i])); 737 VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i]))) 738 739 ITERATOR_INSN2_MPY_SLOT(32,vrmpybv,"Vd32=vrmpyb(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.b,Vv32.b)", 740 "Vector Multiply-Accumulate Reduce with 4 byte coefficients", 741 VdV.w[i] = fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i])); 742 VdV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i])); 743 VdV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i])); 744 VdV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i]))) 745 746 747 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybv_acc,"Vx32+=vrmpyb(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.b,Vv32.b)", 748 "Vector Multiply-Accumulate Reduce with 4 byte coefficients", 749 VxV.w[i] += fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i])); 750 VxV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i])); 751 VxV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i])); 752 VxV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i]))) 753 754 755 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi,"Vdd32=vrmpyub(Vuu32,Rt32,#u1)","Vdd32.uw=vrmpy(Vuu32.ub,Rt32.ub,#u1)", 756 "Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word", 757 VddV.v[0].uw[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 758 VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)); 759 VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 760 VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 761 762 VddV.v[1].uw[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 763 VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 764 VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 765 VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV))) 766 767 768 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi_acc,"Vxx32+=vrmpyub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrmpy(Vuu32.ub,Rt32.ub,#u1)", 769 "Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word", 770 VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 771 VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)); 772 VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 773 VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 774 775 VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 776 VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 777 VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 778 VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV))) 779 780 781 782 783 /******************************************** 784 * 4-WAY REDUCTION - UNSIGNED BYTE BY BYTE 785 ********************************************/ 786 787 ITERATOR_INSN2_MPY_SLOT(32,vrmpybus,"Vd32=vrmpybus(Vu32,Rt32)","Vd32.w=vrmpy(Vu32.ub,Rt32.b)", 788 "Vector Multiply-Accumulate Reduce with 4 byte coefficients", 789 VdV.w[i] = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV)); 790 VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV)); 791 VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV)); 792 VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV))) 793 794 795 ITERATOR_INSN2_MPY_SLOT(32,vrmpybus_acc,"Vx32+=vrmpybus(Vu32,Rt32)","Vx32.w+=vrmpy(Vu32.ub,Rt32.b)", 796 "Vector Multiply-Accumulate Reduce with 4 byte coefficients", 797 VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV)); 798 VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV)); 799 VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV)); 800 VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV))) 801 802 803 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi,"Vdd32=vrmpybus(Vuu32,Rt32,#u1)","Vdd32.w=vrmpy(Vuu32.ub,Rt32.b,#u1)", 804 "Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word", 805 VddV.v[0].w[i] = fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 806 VddV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)); 807 VddV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 808 VddV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 809 810 VddV.v[1].w[i] = fMPY8US(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 811 VddV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 812 VddV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 813 VddV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV))) 814 815 816 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi_acc,"Vxx32+=vrmpybus(Vuu32,Rt32,#u1)","Vxx32.w+=vrmpy(Vuu32.ub,Rt32.b,#u1)", 817 "Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word", 818 VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 819 VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)); 820 VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 821 VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 822 823 VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 824 VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 825 VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 826 VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV))) 827 828 829 830 831 ITERATOR_INSN2_MPY_SLOT(32,vrmpybusv,"Vd32=vrmpybus(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.ub,Vv32.b)", 832 "Vector Multiply-Accumulate Reduce with 4 byte coefficients", 833 VdV.w[i] = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i])); 834 VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i])); 835 VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i])); 836 VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i]))) 837 838 839 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusv_acc,"Vx32+=vrmpybus(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.ub,Vv32.b)", 840 "Vector Multiply-Accumulate Reduce with 4 byte coefficients", 841 VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i])); 842 VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i])); 843 VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i])); 844 VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i]))) 845 846 847 848 849 850 851 852 853 854 855 856 /******************************************** 857 * 2-WAY REDUCTION - SAD 858 ********************************************/ 859 860 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh,"Vdd32=vdsaduh(Vuu32,Rt32)","Vdd32.uw=vdsad(Vuu32.uh,Rt32.uh)", 861 "Dual Vector Halfword by Byte 4-Way Reduction to Word", 862 VddV.v[0].uw[i] = fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 863 VddV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV)); 864 VddV.v[1].uw[i] = fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 865 VddV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV))) 866 867 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh_acc,"Vxx32+=vdsaduh(Vuu32,Rt32)","Vxx32.uw+=vdsad(Vuu32.uh,Rt32.uh)", 868 "Dual Vector Halfword by Byte 4-Way Reduction to Word", 869 VxxV.v[0].uw[i] += fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 870 VxxV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV)); 871 VxxV.v[1].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 872 VxxV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV))) 873 874 875 876 877 /******************************************** 878 * 4-WAY REDUCTION - SAD 879 ********************************************/ 880 881 882 883 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi,"Vdd32=vrsadub(Vuu32,Rt32,#u1)","Vdd32.uw=vrsad(Vuu32.ub,Rt32.ub,#u1)", 884 "Dual Vector Halfword by Byte 4-Way Reduction to Word", 885 VddV.v[0].uw[i] = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 886 VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))); 887 VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 888 VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 889 890 VddV.v[1].uw[i] = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 891 VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 892 VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 893 VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)))) 894 895 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi_acc,"Vxx32+=vrsadub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrsad(Vuu32.ub,Rt32.ub,#u1)", 896 "Dual Vector Halfword by Byte 4-Way Reduction to Word", 897 VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 898 VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))); 899 VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 900 VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 901 902 VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 903 VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 904 VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 905 VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)))) 906 907 908 909 910 911 912 913 914 915 916 /********************************************************************* 917 * MMVECTOR SHIFTING 918 * ******************************************************************/ 919 // Macro to shift arithmetically left/right and by either RT or Vv 920 921 #define V_SHIFT(TYPE, DESC, SIZE, LOGSIZE, CASTTYPE) \ 922 ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE, "Vd32=vasr" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Rt32)", "Vector arithmetic shift right " DESC, VdV.TYPE[i] = (VuV.TYPE[i] >> (RtV & (SIZE-1)))) \ 923 ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE, "Vd32=vasl" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Rt32)", "Vector arithmetic shift left " DESC, VdV.TYPE[i] = (VuV.TYPE[i] << (RtV & (SIZE-1)))) \ 924 ITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE, "Vd32=vlsr" #TYPE "(Vu32,Rt32)","Vd32.u"#TYPE"=vlsr(Vu32.u"#TYPE",Rt32)", "Vector logical shift right " DESC, VdV.u##TYPE[i] = (VuV.u##TYPE[i] >> (RtV & (SIZE-1)))) \ 925 ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE##v,"Vd32=vasr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift right " DESC, VdV.TYPE[i] = fBIDIR_ASHIFTR(VuV.TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 926 ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE##v,"Vd32=vasl" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift left " DESC, VdV.TYPE[i] = fBIDIR_ASHIFTL(VuV.TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 927 ITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE##v,"Vd32=vlsr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vlsr(Vu32."#TYPE",Vv32."#TYPE")", "Vector logical shift right " DESC, VdV.u##TYPE[i] = fBIDIR_LSHIFTR(VuV.u##TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 928 929 V_SHIFT(w, "word", 32,5,4_4) 930 V_SHIFT(h, "halfword", 16,4,2_2) 931 932 ITERATOR_INSN_SHIFT_SLOT(8,vlsrb,"Vd32.ub=vlsr(Vu32.ub,Rt32)","vec log shift right bytes", VdV.b[i] = VuV.ub[i] >> (RtV & 0x7)) 933 934 ITERATOR_INSN2_SHIFT_SLOT(32,vrotr,"Vd32=vrotr(Vu32,Vv32)","Vd32.uw=vrotr(Vu32.uw,Vv32.uw)","Vector word rotate right", VdV.uw[i] = ((VuV.uw[i] >> (VvV.uw[i] & 0x1f)) | (VuV.uw[i] << (32 - (VvV.uw[i] & 0x1f))))) 935 936 /********************************************************************* 937 * MMVECTOR SHIFT AND PERMUTE 938 * ******************************************************************/ 939 940 ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(32,vasr_into,"Vxx32=vasrinto(Vu32,Vv32)","Vxx32.w=vasrinto(Vu32.w,Vv32.w)","ASR vector 1 elements and overlay dropping bits to MSB of vector 2 elements", 941 fHIDE(int64_t ) shift = (fSE32_64(VuV.w[i]) << 32); 942 fHIDE(int64_t ) mask = (((fSE32_64(VxxV.v[0].w[i])) << 32) | fZE32_64(VxxV.v[0].w[i])); 943 fHIDE(int64_t) lomask = (((fSE32_64(1)) << 32) - 1); 944 fHIDE(int ) count = -(0x40 & VvV.w[i]) + (VvV.w[i] & 0x3f); 945 fHIDE(int64_t ) result = (count == -0x40) ? 0 : (((count < 0) ? ((shift << -(count)) | (mask & (lomask << -(count)))) : ((shift >> count) | (mask & (lomask >> count))))); 946 VxxV.v[1].w[i] = ((result >> 32) & 0xffffffff); 947 VxxV.v[0].w[i] = (result & 0xffffffff)) 948 949 #define NEW_NARROWING_SHIFT 1 950 951 #if NEW_NARROWING_SHIFT 952 #define NARROWING_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \ 953 ITERATOR_INSN_SHIFT_SLOT(ITERSIZE,TAG, \ 954 "Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \ 955 "Vector shift right and shuffle", \ 956 fHIDE(int )shamt = RtV & SHAMTMASK; \ 957 DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \ 958 DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt))) 959 960 961 962 963 964 /* WORD TO HALF*/ 965 966 NARROWING_SHIFT(32,vasrwh,fSETHALF,h,w,,fECHO,fVNOROUND,0xF) 967 NARROWING_SHIFT(32,vasrwhsat,fSETHALF,h,w,:sat,fVSATH,fVNOROUND,0xF) 968 NARROWING_SHIFT(32,vasrwhrndsat,fSETHALF,h,w,:rnd:sat,fVSATH,fVROUND,0xF) 969 NARROWING_SHIFT(32,vasrwuhrndsat,fSETHALF,uh,w,:rnd:sat,fVSATUH,fVROUND,0xF) 970 NARROWING_SHIFT(32,vasrwuhsat,fSETHALF,uh,w,:sat,fVSATUH,fVNOROUND,0xF) 971 NARROWING_SHIFT(32,vasruwuhrndsat,fSETHALF,uh,uw,:rnd:sat,fVSATUH,fVROUND,0xF) 972 973 NARROWING_SHIFT_NOV1(32,vasruwuhsat,fSETHALF,uh,uw,:sat,fVSATUH,fVNOROUND,0xF) 974 NARROWING_SHIFT(16,vasrhubsat,fSETBYTE,ub,h,:sat,fVSATUB,fVNOROUND,0x7) 975 NARROWING_SHIFT(16,vasrhubrndsat,fSETBYTE,ub,h,:rnd:sat,fVSATUB,fVROUND,0x7) 976 NARROWING_SHIFT(16,vasrhbsat,fSETBYTE,b,h,:sat,fVSATB,fVNOROUND,0x7) 977 NARROWING_SHIFT(16,vasrhbrndsat,fSETBYTE,b,h,:rnd:sat,fVSATB,fVROUND,0x7) 978 979 NARROWING_SHIFT_NOV1(16,vasruhubsat,fSETBYTE,ub,uh,:sat,fVSATUB,fVNOROUND,0x7) 980 NARROWING_SHIFT_NOV1(16,vasruhubrndsat,fSETBYTE,ub,uh,:rnd:sat,fVSATUB,fVROUND,0x7) 981 982 #else 983 ITERATOR_INSN2_SHIFT_SLOT(32,vasrwh,"Vd32=vasrwh(Vu32,Vv32,Rt8)","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8)", 984 "Vector arithmetic shift right words, shuffle even halfwords", 985 fSETHALF(0,VdV.w[i], (VvV.w[i] >> (RtV & 0xF))); 986 fSETHALF(1,VdV.w[i], (VuV.w[i] >> (RtV & 0xF)))) 987 988 989 ITERATOR_INSN2_SHIFT_SLOT(32,vasrwhsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):sat", 990 "Vector arithmetic shift right words, shuffle even halfwords", 991 fSETHALF(0,VdV.w[i], fVSATH(VvV.w[i] >> (RtV & 0xF))); 992 fSETHALF(1,VdV.w[i], fVSATH(VuV.w[i] >> (RtV & 0xF)))) 993 994 ITERATOR_INSN2_SHIFT_SLOT(32,vasrwhrndsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):rnd:sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat", 995 "Vector arithmetic shift right words, shuffle even halfwords", 996 fHIDE(int ) shamt = RtV & 0xF; 997 fSETHALF(0,VdV.w[i], fVSATH( (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 998 fSETHALF(1,VdV.w[i], fVSATH( (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 999 1000 ITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhrndsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat", 1001 "Vector arithmetic shift right words, shuffle even halfwords", 1002 fHIDE(int ) shamt = RtV & 0xF; 1003 fSETHALF(0,VdV.w[i], fVSATUH( (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 1004 fSETHALF(1,VdV.w[i], fVSATUH( (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 1005 1006 ITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):sat", 1007 "Vector arithmetic shift right words, shuffle even halfwords", 1008 fSETHALF(0, VdV.uw[i], fVSATUH(VvV.w[i] >> (RtV & 0xF))); 1009 fSETHALF(1, VdV.uw[i], fVSATUH(VuV.w[i] >> (RtV & 0xF)))) 1010 1011 ITERATOR_INSN2_SHIFT_SLOT(32,vasruwuhrndsat,"Vd32=vasruwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.uw,Vv32.uw,Rt8):rnd:sat", 1012 "Vector arithmetic shift right words, shuffle even halfwords", 1013 fHIDE(int ) shamt = RtV & 0xF; 1014 fSETHALF(0,VdV.w[i], fVSATUH( (VvV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 1015 fSETHALF(1,VdV.w[i], fVSATUH( (VuV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 1016 #endif 1017 1018 1019 1020 ITERATOR_INSN2_SHIFT_SLOT(32,vroundwh,"Vd32=vroundwh(Vu32,Vv32):sat","Vd32.h=vround(Vu32.w,Vv32.w):sat", 1021 "Vector round words to halves, shuffle resultant halfwords", 1022 fSETHALF(0, VdV.uw[i], fVSATH((VvV.w[i] + fCONSTLL(0x8000)) >> 16)); 1023 fSETHALF(1, VdV.uw[i], fVSATH((VuV.w[i] + fCONSTLL(0x8000)) >> 16))) 1024 1025 ITERATOR_INSN2_SHIFT_SLOT(32,vroundwuh,"Vd32=vroundwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.w,Vv32.w):sat", 1026 "Vector round words to halves, shuffle resultant halfwords", 1027 fSETHALF(0, VdV.uw[i], fVSATUH((VvV.w[i] + fCONSTLL(0x8000)) >> 16)); 1028 fSETHALF(1, VdV.uw[i], fVSATUH((VuV.w[i] + fCONSTLL(0x8000)) >> 16))) 1029 1030 ITERATOR_INSN2_SHIFT_SLOT(32,vrounduwuh,"Vd32=vrounduwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.uw,Vv32.uw):sat", 1031 "Vector round words to halves, shuffle resultant halfwords", 1032 fSETHALF(0, VdV.uw[i], fVSATUH((VvV.uw[i] + fCONSTLL(0x8000)) >> 16)); 1033 fSETHALF(1, VdV.uw[i], fVSATUH((VuV.uw[i] + fCONSTLL(0x8000)) >> 16))) 1034 1035 1036 1037 1038 1039 /* HALF TO BYTE*/ 1040 1041 ITERATOR_INSN2_SHIFT_SLOT(16,vroundhb,"Vd32=vroundhb(Vu32,Vv32):sat","Vd32.b=vround(Vu32.h,Vv32.h):sat", 1042 "Vector round words to halves, shuffle resultant halfwords", 1043 fSETBYTE(0, VdV.uh[i], fVSATB((VvV.h[i] + 0x80) >> 8)); 1044 fSETBYTE(1, VdV.uh[i], fVSATB((VuV.h[i] + 0x80) >> 8))) 1045 1046 ITERATOR_INSN2_SHIFT_SLOT(16,vroundhub,"Vd32=vroundhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.h,Vv32.h):sat", 1047 "Vector round words to halves, shuffle resultant halfwords", 1048 fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.h[i] + 0x80) >> 8)); 1049 fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.h[i] + 0x80) >> 8))) 1050 1051 ITERATOR_INSN2_SHIFT_SLOT(16,vrounduhub,"Vd32=vrounduhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.uh,Vv32.uh):sat", 1052 "Vector round words to halves, shuffle resultant halfwords", 1053 fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.uh[i] + 0x80) >> 8)); 1054 fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.uh[i] + 0x80) >> 8))) 1055 1056 1057 ITERATOR_INSN2_SHIFT_SLOT(32,vaslw_acc,"Vx32+=vaslw(Vu32,Rt32)","Vx32.w+=vasl(Vu32.w,Rt32)", 1058 "Vector shift add word", 1059 VxV.w[i] += (VuV.w[i] << (RtV & (32-1)))) 1060 1061 ITERATOR_INSN2_SHIFT_SLOT(32,vasrw_acc,"Vx32+=vasrw(Vu32,Rt32)","Vx32.w+=vasr(Vu32.w,Rt32)", 1062 "Vector shift add word", 1063 VxV.w[i] += (VuV.w[i] >> (RtV & (32-1)))) 1064 1065 ITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vaslh_acc,"Vx32+=vaslh(Vu32,Rt32)","Vx32.h+=vasl(Vu32.h,Rt32)", 1066 "Vector shift add halfword", 1067 VxV.h[i] += (VuV.h[i] << (RtV & (16-1)))) 1068 1069 ITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vasrh_acc,"Vx32+=vasrh(Vu32,Rt32)","Vx32.h+=vasr(Vu32.h,Rt32)", 1070 "Vector shift add halfword", 1071 VxV.h[i] += (VuV.h[i] >> (RtV & (16-1)))) 1072 1073 /************************************************************************** 1074 * 1075 * MMVECTOR ELEMENT-WISE ARITHMETIC 1076 * 1077 **************************************************************************/ 1078 1079 /************************************************************************** 1080 * MACROS GO IN MACROS.DEF NOT HERE!!! 1081 **************************************************************************/ 1082 1083 1084 #define MMVEC_ABSDIFF(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1085 ITERATOR_INSN2_MPY_SLOT(WIDTH, vabsdiff##TYPE, "Vd32=vabsdiff"TYPE2"(Vu32,Vv32)" ,"Vd32."#DEST"=vabsdiff(Vu32."#SRC",Vv32."#SRC")" , "Vector Absolute of Difference "DESCR, VdV.DEST[i] = (VuV.SRC[i] > VvV.SRC[i]) ? (VuV.SRC[i] - VvV.SRC[i]) : (VvV.SRC[i] - VuV.SRC[i])) 1086 1087 #define MMVEC_ADDU_SAT(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1088 ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat, "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVUADDSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1089 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\ 1090 ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat, "Vd32=vsub"TYPE2"(Vu32,Vv32):sat", "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVUSUBSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1091 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\ 1092 1093 #define MMVEC_ADDS_SAT(TYPE,TYPE2,DESCR, WIDTH,DEST,SRC)\ 1094 ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat, "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVSADDSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1095 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\ 1096 ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat, "Vd32=vsub"TYPE2"(Vu32,Vv32):sat", "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVSSUBSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1097 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\ 1098 1099 #define MMVEC_AVGU(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1100 ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGU( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1101 ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGURND(WIDTH, VuV.SRC[i], VvV.SRC[i])) 1102 1103 1104 1105 #define MMVEC_AVGS(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1106 ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1107 ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGSRND( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1108 ITERATOR_INSN2_ANY_SLOT(WIDTH,vnavg##TYPE, "Vd32=vnavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")", "Vector Negative Average "DESCR, VdV.DEST[i] = fVNAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) 1109 1110 1111 1112 1113 1114 1115 1116 #define MMVEC_ADDWRAP(TYPE,TYPE2, DESCR, WIDTH , DEST,SRC)\ 1117 ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE, "Vd32=vadd"TYPE2"(Vu32,Vv32)" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC")", "Vector Add "DESCR, VdV.DEST[i] = VuV.SRC[i] + VvV.SRC[i])\ 1118 ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE, "Vd32=vsub"TYPE2"(Vu32,Vv32)" , "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC")", "Vector Sub "DESCR, VdV.DEST[i] = VuV.SRC[i] - VvV.SRC[i])\ 1119 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32)" , "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Add "DESCR, VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] + VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] + VvvV.v[1].SRC[i])\ 1120 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32)" , "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Sub "DESCR, VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] - VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] - VvvV.v[1].SRC[i]) \ 1121 1122 1123 1124 1125 1126 /* Wrapping Adds */ 1127 MMVEC_ADDWRAP(b, "b", "Byte", 8, b, b) 1128 MMVEC_ADDWRAP(h, "h", "Halfword", 16, h, h) 1129 MMVEC_ADDWRAP(w, "w", "Word", 32, w, w) 1130 1131 /* Saturating Adds */ 1132 MMVEC_ADDU_SAT(ub, "ub", "Unsigned Byte", 8, ub, ub) 1133 MMVEC_ADDU_SAT(uh, "uh", "Unsigned Halfword", 16, uh, uh) 1134 MMVEC_ADDU_SAT(uw, "uw", "Unsigned word", 32, uw, uw) 1135 MMVEC_ADDS_SAT(b, "b", "byte", 8, b, b) 1136 MMVEC_ADDS_SAT(h, "h", "Halfword", 16, h, h) 1137 MMVEC_ADDS_SAT(w, "w", "Word", 32, w, w) 1138 1139 1140 /* Averaging Instructions */ 1141 MMVEC_AVGU(ub,"ub", "Unsigned Byte", 8, ub, ub) 1142 MMVEC_AVGU(uh,"uh", "Unsigned Halfword", 16, uh, uh) 1143 MMVEC_AVGU_NOV1(uw,"uw", "Unsigned Word", 32, uw, uw) 1144 MMVEC_AVGS_NOV1(b, "b", "Byte", 8, b, b) 1145 MMVEC_AVGS(h, "h", "Halfword", 16, h, h) 1146 MMVEC_AVGS(w, "w", "Word", 32, w, w) 1147 1148 1149 /* Absolute Difference */ 1150 MMVEC_ABSDIFF(ub,"ub", "Unsigned Byte", 8, ub, ub) 1151 MMVEC_ABSDIFF(uh,"uh", "Unsigned Halfword", 16, uh, uh) 1152 MMVEC_ABSDIFF(h,"h", "Halfword", 16, uh, h) 1153 MMVEC_ABSDIFF(w,"w", "Word", 32, uw, w) 1154 1155 ITERATOR_INSN2_ANY_SLOT(8,vnavgub, "Vd32=vnavgub(Vu32,Vv32)", "Vd32.b=vnavg(Vu32.ub,Vv32.ub)", 1156 "Vector Negative Average Unsigned Byte", VdV.b[i] = fVNAVGU(8, VuV.ub[i], VvV.ub[i])) 1157 1158 ITERATOR_INSN_ANY_SLOT(32,vaddcarrysat,"Vd32.w=vadd(Vu32.w,Vv32.w,Qs4):carry:sat","add w/carry and saturate", 1159 VdV.w[i] = fVSATW(VuV.w[i]+VvV.w[i]+fGETQBIT(QsV,i*4))) 1160 1161 ITERATOR_INSN_ANY_SLOT(32,vaddcarry,"Vd32.w=vadd(Vu32.w,Vv32.w,Qx4):carry","add w/carry", 1162 VdV.w[i] = VuV.w[i]+VvV.w[i]+fGETQBIT(QxV,i*4); 1163 fSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],fGETQBIT(QxV,i*4)))) 1164 1165 ITERATOR_INSN_ANY_SLOT(32,vsubcarry,"Vd32.w=vsub(Vu32.w,Vv32.w,Qx4):carry","add w/carry", 1166 VdV.w[i] = VuV.w[i]+~VvV.w[i]+fGETQBIT(QxV,i*4); 1167 fSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],fGETQBIT(QxV,i*4)))) 1168 1169 ITERATOR_INSN_ANY_SLOT(32,vaddcarryo,"Vd32.w,Qe4=vadd(Vu32.w,Vv32.w):carry","add w/carry out-only", 1170 VdV.w[i] = VuV.w[i]+VvV.w[i]; 1171 fSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],0))) 1172 1173 ITERATOR_INSN_ANY_SLOT(32,vsubcarryo,"Vd32.w,Qe4=vsub(Vu32.w,Vv32.w):carry","subtract w/carry out-only", 1174 VdV.w[i] = VuV.w[i]+~VvV.w[i]+1; 1175 fSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],1))) 1176 1177 1178 ITERATOR_INSN_ANY_SLOT(32,vsatdw,"Vd32.w=vsatdw(Vu32.w,Vv32.w)","Saturate from 64-bits (higher 32-bits come from first vector) to 32-bits",VdV.w[i] = fVSATDW(VuV.w[i],VvV.w[i])) 1179 1180 1181 #define MMVEC_ADDSAT_MIX(TAGEND,SATF,WIDTH,DEST,SRC1,SRC2)\ 1182 ITERATOR_INSN_ANY_SLOT(WIDTH, vadd##TAGEND,"Vd32."#DEST"=vadd(Vu32."#SRC1",Vv32."#SRC2"):sat", "Vector Add mixed", VdV.DEST[i] = SATF(VuV.SRC1[i] + VvV.SRC2[i]))\ 1183 ITERATOR_INSN_ANY_SLOT(WIDTH, vsub##TAGEND,"Vd32."#DEST"=vsub(Vu32."#SRC1",Vv32."#SRC2"):sat", "Vector Sub mixed", VdV.DEST[i] = SATF(VuV.SRC1[i] - VvV.SRC2[i]))\ 1184 1185 MMVEC_ADDSAT_MIX(ububb_sat,fVSATUB,8,ub,ub,b) 1186 1187 /**************************** 1188 * WIDENING 1189 ****************************/ 1190 1191 1192 1193 1194 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh,"Vdd32=vaddub(Vu32,Vv32)","Vdd32.h=vadd(Vu32.ub,Vv32.ub)", 1195 "Vector addition with widen into two vectors", 1196 VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) + fZE8_16(fGETUBYTE(0, VvV.uh[i])); 1197 VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) + fZE8_16(fGETUBYTE(1, VvV.uh[i]))) 1198 1199 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vsububh,"Vdd32=vsubub(Vu32,Vv32)","Vdd32.h=vsub(Vu32.ub,Vv32.ub)", 1200 "Vector subtraction with widen into two vectors", 1201 VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) - fZE8_16(fGETUBYTE(0, VvV.uh[i])); 1202 VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) - fZE8_16(fGETUBYTE(1, VvV.uh[i]))) 1203 1204 1205 1206 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw,"Vdd32=vaddh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.h,Vv32.h)", 1207 "Vector addition with widen into two vectors", 1208 VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]); 1209 VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i])) 1210 1211 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubhw,"Vdd32=vsubh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.h,Vv32.h)", 1212 "Vector subtraction with widen into two vectors", 1213 VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) - fGETHALF(0, VvV.w[i]); 1214 VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) - fGETHALF(1, VvV.w[i])) 1215 1216 1217 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw,"Vdd32=vadduh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.uh,Vv32.uh)", 1218 "Vector addition with widen into two vectors", 1219 VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) + fZE16_32(fGETUHALF(0, VvV.uw[i])); 1220 VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) + fZE16_32(fGETUHALF(1, VvV.uw[i]))) 1221 1222 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubuhw,"Vdd32=vsubuh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.uh,Vv32.uh)", 1223 "Vector subtraction with widen into two vectors", 1224 VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) - fZE16_32(fGETUHALF(0, VvV.uw[i])); 1225 VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) - fZE16_32(fGETUHALF(1, VvV.uw[i]))) 1226 1227 1228 1229 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw_acc,"Vxx32+=vaddh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.h,Vv32.h)", 1230 "Vector addition with widen into two vectors", 1231 VxxV.v[0].w[i] += fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]); 1232 VxxV.v[1].w[i] += fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i])) 1233 1234 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw_acc,"Vxx32+=vadduh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.uh,Vv32.uh)", 1235 "Vector addition with widen into two vectors", 1236 VxxV.v[0].w[i] += fGETUHALF(0, VuV.w[i]) + fGETUHALF(0, VvV.w[i]); 1237 VxxV.v[1].w[i] += fGETUHALF(1, VuV.w[i]) + fGETUHALF(1, VvV.w[i])) 1238 1239 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh_acc,"Vxx32+=vaddub(Vu32,Vv32)","Vxx32.h+=vadd(Vu32.ub,Vv32.ub)", 1240 "Vector addition with widen into two vectors", 1241 VxxV.v[0].h[i] += fGETUBYTE(0, VuV.h[i]) + fGETUBYTE(0, VvV.h[i]); 1242 VxxV.v[1].h[i] += fGETUBYTE(1, VuV.h[i]) + fGETUBYTE(1, VvV.h[i])) 1243 1244 1245 /**************************** 1246 * Conditional 1247 ****************************/ 1248 1249 #define CONDADDSUB(WIDTH,TAGEND,LHSYN,RHSYN,DESCR,LHBEH,RHBEH) \ 1250 ITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH+RHBEH,LHBEH)) \ 1251 ITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH-RHBEH,LHBEH)) \ 1252 ITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (!Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH+RHBEH)) \ 1253 ITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (!Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH-RHBEH)) \ 1254 1255 CONDADDSUB(8,b,"Vx32.b","Vu32.b","Conditional add/sub Byte",VxV.ub[i],VuV.ub[i]) 1256 CONDADDSUB(16,h,"Vx32.h","Vu32.h","Conditional add/sub Half",VxV.h[i],VuV.h[i]) 1257 CONDADDSUB(32,w,"Vx32.w","Vu32.w","Conditional add/sub Word",VxV.w[i],VuV.w[i]) 1258 1259 /***************************************************** 1260 ABSOLUTE VALUES 1261 *****************************************************/ 1262 // V65 1263 ITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb, "Vd32=vabsb(Vu32)", "Vd32.b=vabs(Vu32.b)", "Vector absolute value of bytes", VdV.b[i] = fABS(VuV.b[i])) 1264 ITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb_sat, "Vd32=vabsb(Vu32):sat", "Vd32.b=vabs(Vu32.b):sat", "Vector absolute value of bytes", VdV.b[i] = fVSATB(fABS(fSE8_16(VuV.b[i])))) 1265 1266 1267 ITERATOR_INSN2_ANY_SLOT(16,vabsh, "Vd32=vabsh(Vu32)", "Vd32.h=vabs(Vu32.h)", "Vector absolute value of halfwords", VdV.h[i] = fABS(VuV.h[i])) 1268 ITERATOR_INSN2_ANY_SLOT(16,vabsh_sat, "Vd32=vabsh(Vu32):sat", "Vd32.h=vabs(Vu32.h):sat", "Vector absolute value of halfwords", VdV.h[i] = fVSATH(fABS(fSE16_32(VuV.h[i])))) 1269 ITERATOR_INSN2_ANY_SLOT(32,vabsw, "Vd32=vabsw(Vu32)", "Vd32.w=vabs(Vu32.w)", "Vector absolute value of words", VdV.w[i] = fABS(VuV.w[i])) 1270 ITERATOR_INSN2_ANY_SLOT(32,vabsw_sat, "Vd32=vabsw(Vu32):sat", "Vd32.w=vabs(Vu32.w):sat", "Vector absolute value of words", VdV.w[i] = fVSATW(fABS(fSE32_64(VuV.w[i])))) 1271 1272 1273 /************************************************************************** 1274 * MMVECTOR MULTIPLICATIONS 1275 * ************************************************************************/ 1276 1277 1278 /* Byte by Byte */ 1279 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv,"Vdd32=vmpyb(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.b,Vv32.b)", 1280 "Vector absolute value of words", 1281 VddV.v[0].h[i] = fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i])); 1282 VddV.v[1].h[i] = fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i]))) 1283 1284 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv_acc,"Vxx32+=vmpyb(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.b,Vv32.b)", 1285 "Vector absolute value of words", 1286 VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i])); 1287 VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i]))) 1288 1289 1290 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv,"Vdd32=vmpyub(Vu32,Vv32)","Vdd32.uh=vmpy(Vu32.ub,Vv32.ub)", 1291 "Vector absolute value of words", 1292 VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) ); 1293 VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) )) 1294 1295 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv_acc,"Vxx32+=vmpyub(Vu32,Vv32)","Vxx32.uh+=vmpy(Vu32.ub,Vv32.ub)", 1296 "Vector absolute value of words", 1297 VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) ); 1298 VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) )) 1299 1300 1301 1302 1303 1304 1305 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv,"Vdd32=vmpybus(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.ub,Vv32.b)", 1306 "Vector absolute value of words", 1307 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i])); 1308 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i]))) 1309 1310 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv_acc,"Vxx32+=vmpybus(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.ub,Vv32.b)", 1311 "Vector absolute value of words", 1312 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i])); 1313 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i]))) 1314 1315 1316 1317 1318 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabusv,"Vdd32=vmpabus(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.b)", 1319 "Vertical Byte Multiply", 1320 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(0, VvvV.v[1].uh[i])); 1321 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(1, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(1, VvvV.v[1].uh[i]))) 1322 1323 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabuuv,"Vdd32=vmpabuu(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.ub)", 1324 "Vertical Byte Multiply", 1325 VddV.v[0].h[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(0, VvvV.v[1].uh[i])); 1326 VddV.v[1].h[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(1, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(1, VvvV.v[1].uh[i]))) 1327 1328 1329 1330 1331 1332 1333 1334 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv,"Vdd32=vmpyh(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.h)", 1335 "Vector by Vector Halfword Multiply", 1336 VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i])); 1337 VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i]))) 1338 1339 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv_acc,"Vxx32+=vmpyh(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.h)", 1340 "Vector by Vector Halfword Multiply", 1341 VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i])); 1342 VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i]))) 1343 1344 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv,"Vdd32=vmpyuh(Vu32,Vv32)","Vdd32.uw=vmpy(Vu32.uh,Vv32.uh)", 1345 "Vector by Vector Unsigned Halfword Multiply", 1346 VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i])); 1347 VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i]))) 1348 1349 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv_acc,"Vxx32+=vmpyuh(Vu32,Vv32)","Vxx32.uw+=vmpy(Vu32.uh,Vv32.uh)", 1350 "Vector by Vector Unsigned Halfword Multiply", 1351 VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i])); 1352 VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i]))) 1353 1354 1355 1356 /* Vector by Vector */ 1357 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyhvsrs,"Vd32=vmpyh(Vu32,Vv32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Vv32.h):<<1:rnd:sat", 1358 "Vector halfword multiply with round, shift, and sat16", 1359 VdV.h[i] = fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(VuV.h[i],VvV.h[i] )<<1)))))) 1360 1361 1362 1363 1364 1365 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus, "Vdd32=vmpyhus(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.uh)", 1366 "Vector by Vector Halfword Multiply", 1367 VddV.v[0].w[i] = fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i])); 1368 VddV.v[1].w[i] = fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i]))) 1369 1370 1371 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus_acc, "Vxx32+=vmpyhus(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.uh)", 1372 "Vector by Vector Halfword Multiply", 1373 VxxV.v[0].w[i] += fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i])); 1374 VxxV.v[1].w[i] += fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i]))) 1375 1376 1377 1378 1379 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih,"Vd32=vmpyih(Vu32,Vv32)","Vd32.h=vmpyi(Vu32.h,Vv32.h)", 1380 "Vector by Vector Halfword Multiply", 1381 VdV.h[i] = fMPY16SS(VuV.h[i], VvV.h[i])) 1382 1383 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih_acc,"Vx32+=vmpyih(Vu32,Vv32)","Vx32.h+=vmpyi(Vu32.h,Vv32.h)", 1384 "Vector by Vector Halfword Multiply", 1385 VxV.h[i] += fMPY16SS(VuV.h[i], VvV.h[i])) 1386 1387 1388 1389 /* 32x32 high half / frac */ 1390 1391 1392 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh,"Vd32=vmpyewuh(Vu32,Vv32)","Vd32.w=vmpye(Vu32.w,Vv32.uh)", 1393 "Vector by Vector Halfword Multiply", 1394 VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) >> 16) 1395 1396 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh,"Vd32=vmpyowh(Vu32,Vv32):<<1:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:sat", 1397 "Vector by Vector Halfword Multiply", 1398 VdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 0) >> 1))) 1399 1400 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd,"Vd32=vmpyowh(Vu32,Vv32):<<1:rnd:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat", 1401 "Vector by Vector Halfword Multiply", 1402 VdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 1) >> 1))) 1403 1404 ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh_64,"Vdd32=vmpye(Vu32.w,Vv32.uh)", 1405 "Word times Halfword Multiply, 64-bit result", 1406 fHIDE(size8s_t prod;) 1407 prod = fMPY32SU(VuV.w[i],fGETUHALF(0,VvV.w[i])); 1408 VddV.v[1].w[i] = prod >> 16; 1409 VddV.v[0].w[i] = prod << 16) 1410 1411 ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_64_acc,"Vxx32+=vmpyo(Vu32.w,Vv32.h)", 1412 "Word times Halfword Multiply, 64-bit result", 1413 fHIDE(size8s_t prod;) 1414 prod = fMPY32SS(VuV.w[i],fGETHALF(1,VvV.w[i])) + fSE32_64(VxxV.v[1].w[i]); 1415 VxxV.v[1].w[i] = prod >> 16; 1416 fSETHALF(0, VxxV.v[0].w[i], VxxV.v[0].w[i] >> 16); 1417 fSETHALF(1, VxxV.v[0].w[i], prod & 0x0000ffff)) 1418 1419 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:sat:shift", 1420 "Vector by Vector Halfword Multiply", 1421 IV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 0) >> 1))) 1422 1423 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:rnd:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat:shift", 1424 "Vector by Vector Halfword Multiply", 1425 IV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 1) >> 1))) 1426 1427 /* For 32x32 integer / low half */ 1428 1429 ITERATOR_INSN_MPY_SLOT(32,vmpyieoh,"Vd32.w=vmpyieo(Vu32.h,Vv32.h)","Odd/Even multiply for 32x32 low half", 1430 VdV.w[i] = (fGETHALF(0,VuV.w[i])*fGETHALF(1,VvV.w[i])) << 16) 1431 1432 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh,"Vd32=vmpyiewuh(Vu32,Vv32)","Vd32.w=vmpyie(Vu32.w,Vv32.uh)", 1433 "Vector by Vector Word by Halfword Multiply", 1434 IV1DEAD() VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) ) 1435 1436 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiowh,"Vd32=vmpyiowh(Vu32,Vv32)","Vd32.w=vmpyio(Vu32.w,Vv32.h)", 1437 "Vector by Vector Word by Halfword Multiply", 1438 IV1DEAD() VdV.w[i] = fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) ) 1439 1440 /* Add back these... */ 1441 1442 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewh_acc,"Vx32+=vmpyiewh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.h)", 1443 "Vector by Vector Word by Halfword Multiply", 1444 VxV.w[i] = VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(0, VvV.w[i])) ) 1445 1446 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh_acc,"Vx32+=vmpyiewuh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.uh)", 1447 "Vector by Vector Word by Halfword Multiply", 1448 VxV.w[i] = VxV.w[i] + fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) ) 1449 1450 1451 1452 1453 1454 1455 1456 /* Vector by Scalar */ 1457 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub,"Vdd32=vmpyub(Vu32,Rt32)","Vdd32.uh=vmpy(Vu32.ub,Rt32.ub)", 1458 "Vector absolute value of words", 1459 VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV)); 1460 VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV))) 1461 1462 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub_acc,"Vxx32+=vmpyub(Vu32,Rt32)","Vxx32.uh+=vmpy(Vu32.ub,Rt32.ub)", 1463 "Vector absolute value of words", 1464 VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV)); 1465 VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV))) 1466 1467 1468 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus,"Vdd32=vmpybus(Vu32,Rt32)","Vdd32.h=vmpy(Vu32.ub,Rt32.b)", 1469 "Vector absolute value of words", 1470 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV)); 1471 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 1472 1473 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus_acc,"Vxx32+=vmpybus(Vu32,Rt32)","Vxx32.h+=vmpy(Vu32.ub,Rt32.b)", 1474 "Vector absolute value of words", 1475 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV)); 1476 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 1477 1478 1479 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus,"Vdd32=vmpabus(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.b)", 1480 "Vertical Byte Multiply", 1481 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV)); 1482 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV))) 1483 1484 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus_acc,"Vxx32+=vmpabus(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.b)", 1485 "Vertical Byte Multiply", 1486 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV)); 1487 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV))) 1488 1489 // V65 1490 1491 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu,"Vdd32=vmpabuu(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.ub)", 1492 "Vertical Byte Multiply", 1493 VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV)); 1494 VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV))) 1495 1496 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu_acc,"Vxx32+=vmpabuu(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.ub)", 1497 "Vertical Byte Multiply", 1498 VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV)); 1499 VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV))) 1500 1501 1502 1503 1504 /* Half by Byte */ 1505 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb,"Vdd32=vmpahb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.h,Rt32.b)", 1506 "Vertical Byte Multiply", 1507 VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1508 VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1509 1510 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb_acc,"Vxx32+=vmpahb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.h,Rt32.b)", 1511 "Vertical Byte Multiply", 1512 VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1513 VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1514 1515 /* Half by Byte */ 1516 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb,"Vdd32=vmpauhb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.uh,Rt32.b)", 1517 "Vertical Byte Multiply", 1518 VddV.v[0].w[i] = fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1519 VddV.v[1].w[i] = fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1520 1521 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb_acc,"Vxx32+=vmpauhb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.uh,Rt32.b)", 1522 "Vertical Byte Multiply", 1523 VxxV.v[0].w[i] += fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1524 VxxV.v[1].w[i] += fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1525 1526 1527 1528 1529 1530 1531 1532 /* Half by Half */ 1533 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyh,"Vdd32=vmpyh(Vu32,Rt32)","Vdd32.w=vmpy(Vu32.h,Rt32.h)", 1534 "Vector absolute value of words", 1535 VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)); 1536 VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))) 1537 1538 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(32,vmpyh_acc,"Vxx32+=vmpyh(Vu32,Rt32)","Vxx32.w+=vmpy(Vu32.h,Rt32.h)", 1539 "Vector even halfwords with scalar lower halfword multiply with shift and sat32", 1540 VxxV.v[0].w[i] = fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)); 1541 VxxV.v[1].w[i] = fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))) 1542 1543 1544 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsat_acc,"Vxx32+=vmpyh(Vu32,Rt32):sat","Vxx32.w+=vmpy(Vu32.h,Rt32.h):sat", 1545 "Vector even halfwords with scalar lower halfword multiply with shift and sat32", 1546 VxxV.v[0].w[i] = fVSATW(fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV))); 1547 VxxV.v[1].w[i] = fVSATW(fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV)))) 1548 1549 1550 1551 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhss,"Vd32=vmpyh(Vu32,Rt32):<<1:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:sat", 1552 "Vector halfword by halfword multiply, shift by 1, and take upper 16 msb", 1553 fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1))))); 1554 fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1))))); 1555 ) 1556 1557 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsrs,"Vd32=vmpyh(Vu32,Rt32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:rnd:sat", 1558 "Vector halfword with scalar halfword multiply with round, shift, and sat16", 1559 fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1)))))); 1560 fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1)))))); 1561 ) 1562 1563 1564 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh,"Vdd32=vmpyuh(Vu32,Rt32)","Vdd32.uw=vmpy(Vu32.uh,Rt32.uh)", 1565 "Vector even halfword unsigned multiply by scalar", 1566 VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)); 1567 VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV))) 1568 1569 1570 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh_acc,"Vxx32+=vmpyuh(Vu32,Rt32)","Vxx32.uw+=vmpy(Vu32.uh,Rt32.uh)", 1571 "Vector even halfword unsigned multiply by scalar", 1572 VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)); 1573 VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV))) 1574 1575 1576 1577 1578 /******************************************** 1579 * HALF BY BYTE 1580 ********************************************/ 1581 ITERATOR_INSN2_MPY_SLOT(16,vmpyihb,"Vd32=vmpyihb(Vu32,Rt32)","Vd32.h=vmpyi(Vu32.h,Rt32.b)", 1582 "Vector word by byte multiply, keep lower result", 1583 VdV.h[i] = fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) )) 1584 1585 ITERATOR_INSN2_MPY_SLOT(16,vmpyihb_acc,"Vx32+=vmpyihb(Vu32,Rt32)","Vx32.h+=vmpyi(Vu32.h,Rt32.b)", 1586 "Vector word by byte multiply, keep lower result", 1587 VxV.h[i] += fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) )) 1588 1589 1590 /******************************************** 1591 * WORD BY BYTE 1592 ********************************************/ 1593 ITERATOR_INSN2_MPY_SLOT(32,vmpyiwb,"Vd32=vmpyiwb(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.b)", 1594 "Vector word by byte multiply, keep lower result", 1595 VdV.w[i] = fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) )) 1596 1597 ITERATOR_INSN2_MPY_SLOT(32,vmpyiwb_acc,"Vx32+=vmpyiwb(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.b)", 1598 "Vector word by byte multiply, keep lower result", 1599 VxV.w[i] += fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) )) 1600 1601 ITERATOR_INSN2_MPY_SLOT(32,vmpyiwub,"Vd32=vmpyiwub(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.ub)", 1602 "Vector word by byte multiply, keep lower result", 1603 VdV.w[i] = fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) )) 1604 1605 ITERATOR_INSN2_MPY_SLOT(32,vmpyiwub_acc,"Vx32+=vmpyiwub(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.ub)", 1606 "Vector word by byte multiply, keep lower result", 1607 VxV.w[i] += fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) )) 1608 1609 1610 /******************************************** 1611 * WORD BY HALF 1612 ********************************************/ 1613 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh,"Vd32=vmpyiwh(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.h)", 1614 "Vector word by byte multiply, keep lower result", 1615 VdV.w[i] = fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV))) 1616 1617 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh_acc,"Vx32+=vmpyiwh(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.h)", 1618 "Vector word by byte multiply, keep lower result", 1619 VxV.w[i] += fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV))) 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 /************************************************************************** 1640 * MMVECTOR LOGICAL OPERATIONS 1641 * ************************************************************************/ 1642 ITERATOR_INSN_ANY_SLOT(16,vand,"Vd32=vand(Vu32,Vv32)", "Vector Logical And", VdV.uh[i] = VuV.uh[i] & VvV.h[i]) 1643 ITERATOR_INSN_ANY_SLOT(16,vor, "Vd32=vor(Vu32,Vv32)", "Vector Logical Or", VdV.uh[i] = VuV.uh[i] | VvV.h[i]) 1644 ITERATOR_INSN_ANY_SLOT(16,vxor,"Vd32=vxor(Vu32,Vv32)", "Vector Logical XOR", VdV.uh[i] = VuV.uh[i] ^ VvV.h[i]) 1645 ITERATOR_INSN_ANY_SLOT(16,vnot,"Vd32=vnot(Vu32)", "Vector Logical NOT", VdV.uh[i] = ~VuV.uh[i]) 1646 1647 1648 1649 1650 1651 ITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt, 1652 "Vd32.ub=vand(Qu4.ub,Rt32.ub)", "Vd32=vand(Qu4,Rt32)", "Insert Predicate into Vector", 1653 VdV.ub[i] = fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0) 1654 1655 ITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt_acc, 1656 "Vx32.ub|=vand(Qu4.ub,Rt32.ub)", "Vx32|=vand(Qu4,Rt32)", "Insert Predicate into Vector", 1657 VxV.ub[i] |= (fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0) 1658 1659 ITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt, 1660 "Vd32.ub=vand(!Qu4.ub,Rt32.ub)", "Vd32=vand(!Qu4,Rt32)", "Insert Predicate into Vector", 1661 VdV.ub[i] = !fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0) 1662 1663 ITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt_acc, 1664 "Vx32.ub|=vand(!Qu4.ub,Rt32.ub)", "Vx32|=vand(!Qu4,Rt32)", "Insert Predicate into Vector", 1665 VxV.ub[i] |= !(fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0) 1666 1667 1668 ITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt, 1669 "Qd4.ub=vand(Vu32.ub,Rt32.ub)", "Qd4=vand(Vu32,Rt32)", "Insert into Predicate", 1670 fSETQBIT(QdV,i,((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0)) 1671 1672 ITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt_acc, 1673 "Qx4.ub|=vand(Vu32.ub,Rt32.ub)", "Qx4|=vand(Vu32,Rt32)", "Insert into Predicate ", 1674 fSETQBIT(QxV,i,fGETQBIT(QxV,i)|(((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0))) 1675 1676 ITERATOR_INSN_ANY_SLOT(8,vandvqv,"Vd32=vand(Qv4,Vu32)","Mask off bytes", 1677 VdV.b[i] = fGETQBIT(QvV,i) ? VuV.b[i] : 0) 1678 ITERATOR_INSN_ANY_SLOT(8,vandvnqv,"Vd32=vand(!Qv4,Vu32)","Mask off bytes", 1679 VdV.b[i] = !fGETQBIT(QvV,i) ? VuV.b[i] : 0) 1680 1681 1682 /*************************************************** 1683 * Compare Vector with Vector 1684 ***************************************************/ 1685 #define VCMP(DEST, ASRC, ASRCOP, CMP, N, SRC, MASK, WIDTH) \ 1686 { \ 1687 for(fHIDE(int) i = 0; i < fVBYTES(); i += WIDTH) { \ 1688 fSETQBITS(DEST,WIDTH,MASK,i,ASRC ASRCOP ((VuV.SRC[i/WIDTH] CMP VvV.SRC[i/WIDTH]) ? MASK : 0)); \ 1689 } \ 1690 } 1691 1692 1693 #define MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \ 1694 EXTINSN(V6_vgt##TYPE, "Qd4=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than", \ 1695 VCMP(QdV, , , >, N, SRC, MASK, WIDTH)) \ 1696 EXTINSN(V6_vgt##TYPE##_and, "Qx4&=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-and", \ 1697 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, >, N, SRC, MASK, WIDTH)) \ 1698 EXTINSN(V6_vgt##TYPE##_or, "Qx4|=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-or", \ 1699 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, >, N, SRC, MASK, WIDTH)) \ 1700 EXTINSN(V6_vgt##TYPE##_xor, "Qx4^=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-xor", \ 1701 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, >, N, SRC, MASK, WIDTH)) 1702 1703 #define MMVEC_CMP(TYPE,TYPE2,TYPE3,DESCR,N,MASK, WIDTH, SRC)\ 1704 MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \ 1705 EXTINSN(V6_veq##TYPE, "Qd4=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to", \ 1706 VCMP(QdV, , , ==, N, SRC, MASK, WIDTH)) \ 1707 EXTINSN(V6_veq##TYPE##_and, "Qx4&=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-and", \ 1708 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, ==, N, SRC, MASK, WIDTH)) \ 1709 EXTINSN(V6_veq##TYPE##_or, "Qx4|=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-or", \ 1710 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, ==, N, SRC, MASK, WIDTH)) \ 1711 EXTINSN(V6_veq##TYPE##_xor, "Qx4^=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-xor", \ 1712 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, ==, N, SRC, MASK, WIDTH)) 1713 1714 1715 MMVEC_CMP(w,"w","","Vector Word Compare ", fVELEM(32), 0xF, 4, w) 1716 MMVEC_CMP(h,"h","","Vector Half Compare ", fVELEM(16), 0x3, 2, h) 1717 MMVEC_CMP(b,"b","","Vector Half Compare ", fVELEM(8), 0x1, 1, b) 1718 MMVEC_CMPGT(uw,"uw","","Vector Unsigned Half Compare ", fVELEM(32), 0xF, 4,uw) 1719 MMVEC_CMPGT(uh,"uh","","Vector Unsigned Half Compare ", fVELEM(16), 0x3, 2,uh) 1720 MMVEC_CMPGT(ub,"ub","","Vector Unsigned Byte Compare ", fVELEM(8), 0x1, 1,ub) 1721 1722 /*************************************************** 1723 * Predicate Operations 1724 ***************************************************/ 1725 1726 EXTINSN(V6_pred_scalar2, "Qd4=vsetq(Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), "Set Vector Predicate ", 1727 { 1728 fHIDE(int i;) 1729 for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i < (RtV & (fVBYTES()-1))) ? 1 : 0); 1730 }) 1731 1732 EXTINSN(V6_pred_scalar2v2, "Qd4=vsetq2(Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), "Set Vector Predicate ", 1733 { 1734 fHIDE(int i;) 1735 for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i <= ((RtV-1) & (fVBYTES()-1))) ? 1 : 0); 1736 }) 1737 1738 1739 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqw, "Qd4.h=vshuffe(Qs4.w,Qt4.w)","Shrink Predicate", fSETQBIT(QdV,i, (i & 2) ? fGETQBIT(QsV,i-2) : fGETQBIT(QtV,i) ) ) 1740 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqh, "Qd4.b=vshuffe(Qs4.h,Qt4.h)","Shrink Predicate", fSETQBIT(QdV,i, (i & 1) ? fGETQBIT(QsV,i-1) : fGETQBIT(QtV,i) ) ) 1741 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or, "Qd4=or(Qs4,Qt4)","Vector Predicate Or", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || fGETQBIT(QtV,i) ) ) 1742 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and, "Qd4=and(Qs4,Qt4)","Vector Predicate And", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && fGETQBIT(QtV,i) ) ) 1743 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_xor, "Qd4=xor(Qs4,Qt4)","Vector Predicate Xor", fSETQBIT(QdV,i,fGETQBIT(QsV,i) ^ fGETQBIT(QtV,i) ) ) 1744 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or_n, "Qd4=or(Qs4,!Qt4)","Vector Predicate Or with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || !fGETQBIT(QtV,i) ) ) 1745 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and_n, "Qd4=and(Qs4,!Qt4)","Vector Predicate And with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && !fGETQBIT(QtV,i) ) ) 1746 ITERATOR_INSN_ANY_SLOT(8, pred_not, "Qd4=not(Qs4)","Vector Predicate Not", fSETQBIT(QdV,i,!fGETQBIT(QsV,i) ) ) 1747 1748 1749 1750 EXTINSN(V6_vcmov, "if (Ps4) Vd32=Vu32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), "Conditional Mov", 1751 { 1752 if (fLSBOLD(PsV)) { 1753 fHIDE(int i;) 1754 fVFOREACH(8, i) { 1755 VdV.ub[i] = VuV.ub[i]; 1756 } 1757 } else {CANCEL;} 1758 }) 1759 1760 EXTINSN(V6_vncmov, "if (!Ps4) Vd32=Vu32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), "Conditional Mov", 1761 { 1762 if (fLSBOLDNOT(PsV)) { 1763 fHIDE(int i;) 1764 fVFOREACH(8, i) { 1765 VdV.ub[i] = VuV.ub[i]; 1766 } 1767 } else {CANCEL;} 1768 }) 1769 1770 EXTINSN(V6_vccombine, "if (Ps4) Vdd32=vcombine(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), "Conditional Combine", 1771 { 1772 if (fLSBOLD(PsV)) { 1773 fHIDE(int i;) 1774 fVFOREACH(8, i) { 1775 VddV.v[0].ub[i] = VvV.ub[i]; 1776 VddV.v[1].ub[i] = VuV.ub[i]; 1777 } 1778 } else {CANCEL;} 1779 }) 1780 1781 EXTINSN(V6_vnccombine, "if (!Ps4) Vdd32=vcombine(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), "Conditional Combine", 1782 { 1783 if (fLSBOLDNOT(PsV)) { 1784 fHIDE(int i;) 1785 fVFOREACH(8, i) { 1786 VddV.v[0].ub[i] = VvV.ub[i]; 1787 VddV.v[1].ub[i] = VuV.ub[i]; 1788 } 1789 } else {CANCEL;} 1790 }) 1791 1792 1793 1794 ITERATOR_INSN_ANY_SLOT(8,vmux,"Vd32=vmux(Qt4,Vu32,Vv32)", 1795 "Vector Select Element 8-bit", 1796 VdV.ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]) 1797 1798 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vswap,"Vdd32=vswap(Qt4,Vu32,Vv32)", 1799 "Vector Swap Element 8-bit", 1800 VddV.v[0].ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]; 1801 VddV.v[1].ub[i] = !fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]) 1802 1803 1804 /*************************************************************************** 1805 * 1806 * MMVECTOR SORTING 1807 * 1808 ****************************************************************************/ 1809 1810 #define MMVEC_SORT(TYPE,TYPE2,DESCR,ELEMENTSIZE,SRC)\ 1811 ITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmax##TYPE, "Vd32=vmax" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmax(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " max", VdV.SRC[i] = (VuV.SRC[i] > VvV.SRC[i]) ? VuV.SRC[i] : VvV.SRC[i]) \ 1812 ITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmin##TYPE, "Vd32=vmin" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmin(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " min", VdV.SRC[i] = (VuV.SRC[i] < VvV.SRC[i]) ? VuV.SRC[i] : VvV.SRC[i]) 1813 1814 MMVEC_SORT(b,"b", "signed byte", 8, b) 1815 MMVEC_SORT(ub,"ub", "unsigned byte", 8, ub) 1816 MMVEC_SORT(uh,"uh", "unsigned halfword",16, uh) 1817 MMVEC_SORT(h, "h", "halfword", 16, h) 1818 MMVEC_SORT(w, "w", "word", 32, w) 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 /************************************************************* 1829 * SHUFFLES 1830 ****************************************************************/ 1831 1832 ITERATOR_INSN2_ANY_SLOT(16,vsathub,"Vd32=vsathub(Vu32,Vv32)","Vd32.ub=vsat(Vu32.h,Vv32.h)", 1833 "Saturate and pack 32 halfwords to 32 unsigned bytes, and interleave them", 1834 fSETBYTE(0, VdV.uh[i], fVSATUB(VvV.h[i])); 1835 fSETBYTE(1, VdV.uh[i], fVSATUB(VuV.h[i]))) 1836 1837 ITERATOR_INSN2_ANY_SLOT(32,vsatwh,"Vd32=vsatwh(Vu32,Vv32)","Vd32.h=vsat(Vu32.w,Vv32.w)", 1838 "Saturate and pack 16 words to 16 halfwords, and interleave them", 1839 fSETHALF(0, VdV.w[i], fVSATH(VvV.w[i])); 1840 fSETHALF(1, VdV.w[i], fVSATH(VuV.w[i]))) 1841 1842 ITERATOR_INSN2_ANY_SLOT(32,vsatuwuh,"Vd32=vsatuwuh(Vu32,Vv32)","Vd32.uh=vsat(Vu32.uw,Vv32.uw)", 1843 "Saturate and pack 16 words to 16 halfwords, and interleave them", 1844 fSETHALF(0, VdV.w[i], fVSATUH(VvV.uw[i])); 1845 fSETHALF(1, VdV.w[i], fVSATUH(VuV.uw[i]))) 1846 1847 ITERATOR_INSN2_ANY_SLOT(16,vshuffeb,"Vd32=vshuffeb(Vu32,Vv32)","Vd32.b=vshuffe(Vu32.b,Vv32.b)", 1848 "Shuffle half words with in a lane", 1849 fSETBYTE(0, VdV.uh[i], fGETUBYTE(0, VvV.uh[i])); 1850 fSETBYTE(1, VdV.uh[i], fGETUBYTE(0, VuV.uh[i]))) 1851 1852 ITERATOR_INSN2_ANY_SLOT(16,vshuffob,"Vd32=vshuffob(Vu32,Vv32)","Vd32.b=vshuffo(Vu32.b,Vv32.b)", 1853 "Shuffle half words with in a lane", 1854 fSETBYTE(0, VdV.uh[i], fGETUBYTE(1, VvV.uh[i])); 1855 fSETBYTE(1, VdV.uh[i], fGETUBYTE(1, VuV.uh[i]))) 1856 1857 ITERATOR_INSN2_ANY_SLOT(32,vshufeh,"Vd32=vshuffeh(Vu32,Vv32)","Vd32.h=vshuffe(Vu32.h,Vv32.h)", 1858 "Shuffle half words with in a lane", 1859 fSETHALF(0, VdV.uw[i], fGETUHALF(0, VvV.uw[i])); 1860 fSETHALF(1, VdV.uw[i], fGETUHALF(0, VuV.uw[i]))) 1861 1862 ITERATOR_INSN2_ANY_SLOT(32,vshufoh,"Vd32=vshuffoh(Vu32,Vv32)","Vd32.h=vshuffo(Vu32.h,Vv32.h)", 1863 "Shuffle half words with in a lane", 1864 fSETHALF(0, VdV.uw[i], fGETUHALF(1, VvV.uw[i])); 1865 fSETHALF(1, VdV.uw[i], fGETUHALF(1, VuV.uw[i]))) 1866 1867 1868 1869 1870 /************************************************************************** 1871 * Double Vector Shuffles 1872 **************************************************************************/ 1873 1874 EXTINSN(V6_vshuff, "vshuff(Vy32,Vx32,Rt32)", 1875 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1876 "2x2->2x2 transpose, for multiple data sizes, inplace", 1877 { 1878 fHIDE(int offset;) 1879 for (offset=1; offset<fVBYTES(); offset<<=1) { 1880 if ( RtV & offset) { 1881 fHIDE(int k;) \ 1882 fVFOREACH(8, k) {\ 1883 if (!( k & offset)) { 1884 fSWAPB(VyV.ub[k], VxV.ub[k+offset]); 1885 } 1886 } 1887 } 1888 } 1889 }) 1890 1891 EXTINSN(V6_vshuffvdd, "Vdd32=vshuff(Vu32,Vv32,Rt8)", 1892 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1893 "2x2->2x2 transpose for multiple data sizes", 1894 { 1895 fHIDE(int offset;) 1896 VddV.v[0] = VvV; 1897 VddV.v[1] = VuV; 1898 for (offset=1; offset<fVBYTES(); offset<<=1) { 1899 if ( RtV & offset) { 1900 fHIDE(int k;) \ 1901 fVFOREACH(8, k) {\ 1902 if (!( k & offset)) { 1903 fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]); 1904 } 1905 } 1906 } 1907 } 1908 }) 1909 1910 EXTINSN(V6_vdeal, "vdeal(Vy32,Vx32,Rt32)", 1911 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1912 " vector - vector deal - or deinterleave, for multiple data sizes, inplace", 1913 { 1914 fHIDE(int offset;) 1915 for (offset=fVBYTES()>>1; offset>0; offset>>=1) { 1916 if ( RtV & offset) { 1917 fHIDE(int k;) \ 1918 fVFOREACH(8, k) {\ 1919 if (!( k & offset)) { 1920 fSWAPB(VyV.ub[k], VxV.ub[k+offset]); 1921 } 1922 } 1923 } 1924 } 1925 }) 1926 1927 EXTINSN(V6_vdealvdd, "Vdd32=vdeal(Vu32,Vv32,Rt8)", 1928 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1929 " vector - vector deal - or deinterleave, for multiple data sizes", 1930 { 1931 fHIDE(int offset;) 1932 VddV.v[0] = VvV; 1933 VddV.v[1] = VuV; 1934 for (offset=fVBYTES()>>1; offset>0; offset>>=1) { 1935 if ( RtV & offset) { 1936 fHIDE(int k;) \ 1937 fVFOREACH(8, k) {\ 1938 if (!( k & offset)) { 1939 fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]); 1940 } 1941 } 1942 } 1943 } 1944 }) 1945 1946 /**************************************************************************/ 1947 1948 1949 1950 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vshufoeh,"Vdd32=vshuffoeh(Vu32,Vv32)","Vdd32.h=vshuffoe(Vu32.h,Vv32.h)", 1951 "Vector Shuffle half words", 1952 fSETHALF(0, VddV.v[0].uw[i], fGETUHALF(0, VvV.uw[i])); 1953 fSETHALF(1, VddV.v[0].uw[i], fGETUHALF(0, VuV.uw[i])); 1954 fSETHALF(0, VddV.v[1].uw[i], fGETUHALF(1, VvV.uw[i])); 1955 fSETHALF(1, VddV.v[1].uw[i], fGETUHALF(1, VuV.uw[i]))) 1956 1957 ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vshufoeb,"Vdd32=vshuffoeb(Vu32,Vv32)","Vdd32.b=vshuffoe(Vu32.b,Vv32.b)", 1958 "Vector Shuffle bytes", 1959 fSETBYTE(0, VddV.v[0].uh[i], fGETUBYTE(0, VvV.uh[i])); 1960 fSETBYTE(1, VddV.v[0].uh[i], fGETUBYTE(0, VuV.uh[i])); 1961 fSETBYTE(0, VddV.v[1].uh[i], fGETUBYTE(1, VvV.uh[i])); 1962 fSETBYTE(1, VddV.v[1].uh[i], fGETUBYTE(1, VuV.uh[i]))) 1963 1964 1965 /*************************************************************** 1966 * Deal 1967 ***************************************************************/ 1968 1969 ITERATOR_INSN2_PERMUTE_SLOT(32, vdealh, "Vd32=vdealh(Vu32)", "Vd32.h=vdeal(Vu32.h)", 1970 "Deal Halfwords", 1971 VdV.uh[i ] = fGETUHALF(0, VuV.uw[i]); 1972 VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i])) 1973 1974 ITERATOR_INSN2_PERMUTE_SLOT(16, vdealb, "Vd32=vdealb(Vu32)", "Vd32.b=vdeal(Vu32.b)", 1975 "Deal Halfwords", 1976 VdV.ub[i ] = fGETUBYTE(0, VuV.uh[i]); 1977 VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i])) 1978 1979 ITERATOR_INSN2_PERMUTE_SLOT(32, vdealb4w, "Vd32=vdealb4w(Vu32,Vv32)", "Vd32.b=vdeale(Vu32.b,Vv32.b)", 1980 "Deal Two Vectors Bytes", 1981 VdV.ub[0+i ] = fGETUBYTE(0, VvV.uw[i]); 1982 VdV.ub[fVELEM(32)+i ] = fGETUBYTE(2, VvV.uw[i]); 1983 VdV.ub[2*fVELEM(32)+i] = fGETUBYTE(0, VuV.uw[i]); 1984 VdV.ub[3*fVELEM(32)+i] = fGETUBYTE(2, VuV.uw[i])) 1985 1986 /*************************************************************** 1987 * shuffle 1988 ***************************************************************/ 1989 1990 ITERATOR_INSN2_PERMUTE_SLOT(32, vshuffh, "Vd32=vshuffh(Vu32)", "Vd32.h=vshuff(Vu32.h)", 1991 "Deal Halfwords", 1992 fSETHALF(0, VdV.uw[i], VuV.uh[i]); 1993 fSETHALF(1, VdV.uw[i], VuV.uh[i+fVELEM(32)])) 1994 1995 ITERATOR_INSN2_PERMUTE_SLOT(16, vshuffb, "Vd32=vshuffb(Vu32)", "Vd32.b=vshuff(Vu32.b)", 1996 "Deal Halfwords", 1997 fSETBYTE(0, VdV.uh[i], VuV.ub[i]); 1998 fSETBYTE(1, VdV.uh[i], VuV.ub[i+fVELEM(16)])) 1999 2000 2001 2002 2003 2004 /*********************************************************** 2005 * INSERT AND EXTRACT 2006 *********************************************************/ 2007 EXTINSN(V6_extractw, "Rd32=vextract(Vu32,Rs32)", 2008 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_MEMLIKE,A_RESTRICT_SLOT0ONLY), 2009 "Extract an element from a vector to scalar", 2010 fHIDE(warn("RdN=%d VuN=%d RsN=%d RsV=0x%08x widx=%d",RdN,VuN,RsN,RsV,((RsV & (fVBYTES()-1)) >> 2));) 2011 RdV = VuV.uw[ (RsV & (fVBYTES()-1)) >> 2]; 2012 fHIDE(warn("RdV=0x%08x",RdV);)) 2013 2014 EXTINSN(V6_vinsertwr, "Vx32.w=vinsert(Rt32)", 2015 ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), 2016 "Insert Word Scalar into Vector", 2017 VxV.uw[0] = RtV;) 2018 2019 2020 2021 2022 ITERATOR_INSN_MPY_SLOT_LATE(32,lvsplatw, "Vd32=vsplat(Rt32)", "Replicates scalar accross words in vector", VdV.uw[i] = RtV) 2023 2024 ITERATOR_INSN_MPY_SLOT_LATE(16,lvsplath, "Vd32.h=vsplat(Rt32)", "Replicates scalar accross halves in vector", VdV.uh[i] = RtV) 2025 2026 ITERATOR_INSN_MPY_SLOT_LATE(8,lvsplatb, "Vd32.b=vsplat(Rt32)", "Replicates scalar accross bytes in vector", VdV.ub[i] = RtV) 2027 2028 2029 ITERATOR_INSN_ANY_SLOT(32,vassign,"Vd32=Vu32","Copy a vector",VdV.w[i]=VuV.w[i]) 2030 2031 2032 ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vcombine,"Vdd32=vcombine(Vu32,Vv32)", 2033 "Vector assign, Any two to Vector Pair", 2034 VddV.v[0].ub[i] = VvV.ub[i]; 2035 VddV.v[1].ub[i] = VuV.ub[i]) 2036 2037 2038 2039 /////////////////////////////////////////////////////////////////////////// 2040 2041 2042 /********************************************************* 2043 * GENERAL PERMUTE NETWORKS 2044 *********************************************************/ 2045 2046 2047 EXTINSN(V6_vdelta, "Vd32=vdelta(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 2048 "Reverse Benes Butterfly network ", 2049 { 2050 fHIDE(int offset;) 2051 fHIDE(int k;) 2052 fHIDE(mmvector_t tmp;) 2053 tmp = VuV; 2054 for (offset=fVBYTES(); (offset>>=1)>0; ) { 2055 for (k = 0; k<fVBYTES(); k++) { 2056 VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k]; 2057 } 2058 for (k = 0; k<fVBYTES(); k++) { 2059 tmp.ub[k] = VdV.ub[k]; 2060 } 2061 } 2062 }) 2063 2064 2065 EXTINSN(V6_vrdelta, "Vd32=vrdelta(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 2066 "Forward Benes Butterfly network ", 2067 { 2068 fHIDE(int offset;) 2069 fHIDE(int k;) 2070 fHIDE(mmvector_t tmp;) 2071 tmp = VuV; 2072 for (offset=1; offset<fVBYTES(); offset<<=1){ 2073 for (k = 0; k<fVBYTES(); k++) { 2074 VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k]; 2075 } 2076 for (k = 0; k<fVBYTES(); k++) { 2077 tmp.ub[k] = VdV.ub[k]; 2078 } 2079 } 2080 }) 2081 2082 2083 2084 2085 2086 ITERATOR_INSN2_SHIFT_SLOT(32,vcl0w,"Vd32=vcl0w(Vu32)","Vd32.uw=vcl0(Vu32.uw)", "Count Leading Zeros in Word", VdV.uw[i]=fCL1_4(~VuV.uw[i])) 2087 ITERATOR_INSN2_SHIFT_SLOT(16,vcl0h,"Vd32=vcl0h(Vu32)","Vd32.uh=vcl0(Vu32.uh)", "Count Leading Zeros in Word", VdV.uh[i]=fCL1_2(~VuV.uh[i])) 2088 2089 ITERATOR_INSN2_SHIFT_SLOT(32,vnormamtw,"Vd32=vnormamtw(Vu32)","Vd32.w=vnormamt(Vu32.w)","Norm Amount Word", 2090 VdV.w[i]=fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i]))-1; fHIDE(IV1DEAD();)) 2091 ITERATOR_INSN2_SHIFT_SLOT(16,vnormamth,"Vd32=vnormamth(Vu32)","Vd32.h=vnormamt(Vu32.h)","Norm Amount Halfword", 2092 VdV.h[i]=fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i]))-1; fHIDE(IV1DEAD();)) 2093 2094 ITERATOR_INSN_SHIFT_SLOT_VV_LATE(32,vaddclbw,"Vd32.w=vadd(vclb(Vu32.w),Vv32.w)", 2095 "Count leading bits and add", 2096 VdV.w[i] = fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i])) + VvV.w[i]) 2097 2098 ITERATOR_INSN_SHIFT_SLOT_VV_LATE(16,vaddclbh,"Vd32.h=vadd(vclb(Vu32.h),Vv32.h)", 2099 "Count leading bits and add", 2100 VdV.h[i] = fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i])) + VvV.h[i]) 2101 2102 2103 ITERATOR_INSN2_SHIFT_SLOT(16,vpopcounth,"Vd32=vpopcounth(Vu32)","Vd32.h=vpopcount(Vu32.h)", "Count Leading Zeros in Word", VdV.uh[i]=fCOUNTONES_2(VuV.uh[i])) 2104 2105 2106 #define fHIST(INPUTVEC) \ 2107 fUARCH_NOTE_PUMP_4X(); \ 2108 fHIDE(int lane;) \ 2109 fHIDE(mmvector_t tmp;) \ 2110 fVFOREACH(128, lane) { \ 2111 for (fHIDE(int )i=0; i<128/8; ++i) { \ 2112 unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \ 2113 unsigned char regno = value>>3; \ 2114 unsigned char element = value & 7; \ 2115 READ_EXT_VREG(regno,tmp,0); \ 2116 tmp.uh[(128/16)*lane+(element)]++; \ 2117 WRITE_EXT_VREG(regno,tmp,EXT_NEW); \ 2118 } \ 2119 } 2120 2121 #define fHISTQ(INPUTVEC,QVAL) \ 2122 fUARCH_NOTE_PUMP_4X(); \ 2123 fHIDE(int lane;) \ 2124 fHIDE(mmvector_t tmp;) \ 2125 fVFOREACH(128, lane) { \ 2126 for (fHIDE(int )i=0; i<128/8; ++i) { \ 2127 unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \ 2128 unsigned char regno = value>>3; \ 2129 unsigned char element = value & 7; \ 2130 READ_EXT_VREG(regno,tmp,0); \ 2131 if (fGETQBIT(QVAL,128/8*lane+i)) tmp.uh[(128/16)*lane+(element)]++; \ 2132 WRITE_EXT_VREG(regno,tmp,EXT_NEW); \ 2133 } \ 2134 } 2135 2136 2137 2138 EXTINSN(V6_vhist, "vhist",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHIST(inputVec); }) 2139 EXTINSN(V6_vhistq, "vhist(Qv4)",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHISTQ(inputVec,QvV); }) 2140 2141 #undef fHIST 2142 #undef fHISTQ 2143 2144 2145 /* **** WEIGHTED HISTOGRAM **** */ 2146 2147 2148 #if 1 2149 #define WHIST(EL,MASK,BSHIFT,COND,SATF) \ 2150 fHIDE(unsigned int) bucket = fGETUBYTE(0,input.h[i]); \ 2151 fHIDE(unsigned int) weight = fGETUBYTE(1,input.h[i]); \ 2152 fHIDE(unsigned int) vindex = (bucket >> 3) & 0x1F; \ 2153 fHIDE(unsigned int) elindex = ((i>>BSHIFT) & (~MASK)) | ((bucket>>BSHIFT) & MASK); \ 2154 fHIDE(mmvector_t tmp;) \ 2155 READ_EXT_VREG(vindex,tmp,0); \ 2156 COND tmp.EL[elindex] = SATF(tmp.EL[elindex] + weight); \ 2157 WRITE_EXT_VREG(vindex,tmp,EXT_NEW); \ 2158 fUARCH_NOTE_PUMP_2X(); 2159 2160 ITERATOR_INSN_VHISTLIKE(16,vwhist256,"vwhist256","vector weighted histogram halfword counters", WHIST(uh,7,0,,)) 2161 ITERATOR_INSN_VHISTLIKE(16,vwhist256q,"vwhist256(Qv4)","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),)) 2162 ITERATOR_INSN_VHISTLIKE(16,vwhist256_sat,"vwhist256:sat","vector weighted histogram halfword counters", WHIST(uh,7,0,,fVSATUH)) 2163 ITERATOR_INSN_VHISTLIKE(16,vwhist256q_sat,"vwhist256(Qv4):sat","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),fVSATUH)) 2164 ITERATOR_INSN_VHISTLIKE(16,vwhist128,"vwhist128","vector weighted histogram word counters", WHIST(uw,3,1,,)) 2165 ITERATOR_INSN_VHISTLIKE(16,vwhist128q,"vwhist128(Qv4)","vector weighted histogram word counters", WHIST(uw,3,1,if (fGETQBIT(QvV,2*i)),)) 2166 ITERATOR_INSN_VHISTLIKE(16,vwhist128m,"vwhist128(#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if ((bucket & 1) == uiV),)) 2167 ITERATOR_INSN_VHISTLIKE(16,vwhist128qm,"vwhist128(Qv4,#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if (((bucket & 1) == uiV) && fGETQBIT(QvV,2*i)),)) 2168 2169 2170 #endif 2171 2172 2173 2174 /* ****** lookup table instructions *********** */ 2175 2176 /* Use low bits from idx to choose next-bigger elements from vector, then use LSB from idx to choose odd or even element */ 2177 2178 ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup", 2179 fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2180 matchval = RtV & 0x7; 2181 oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2182 idx = VuV.ub[i]; 2183 VdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2184 2185 2186 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracc,"Vx32.b|=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup", 2187 fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2188 matchval = RtV & 0x7; 2189 oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2190 idx = VuV.ub[i]; 2191 VxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2192 2193 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup", 2194 fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2195 matchval = RtV & 0xF; 2196 oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2197 idx = fGETUBYTE(0,VuV.uh[i]); 2198 VddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2199 idx = fGETUBYTE(1,VuV.uh[i]); 2200 VddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2201 2202 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracc,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup", 2203 fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2204 matchval = fGETUBYTE(0,RtV) & 0xF; 2205 oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2206 idx = fGETUBYTE(0,VuV.uh[i]); 2207 VxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2208 idx = fGETUBYTE(1,VuV.uh[i]); 2209 VxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2210 2211 ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvbi,"Vd32.b=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup", 2212 fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2213 matchval = uiV & 0x7; 2214 oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2215 idx = VuV.ub[i]; 2216 VdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2217 2218 2219 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracci,"Vx32.b|=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup", 2220 fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2221 matchval = uiV & 0x7; 2222 oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2223 idx = VuV.ub[i]; 2224 VxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2225 2226 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwhi,"Vdd32.h=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup", 2227 fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2228 matchval = uiV & 0xF; 2229 oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2230 idx = fGETUBYTE(0,VuV.uh[i]); 2231 VddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2232 idx = fGETUBYTE(1,VuV.uh[i]); 2233 VddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2234 2235 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracci,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup", 2236 fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2237 matchval = uiV & 0xF; 2238 oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2239 idx = fGETUBYTE(0,VuV.uh[i]); 2240 VxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2241 idx = fGETUBYTE(1,VuV.uh[i]); 2242 VxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2243 2244 ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb_nm,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8):nomatch","vector-vector table lookup", 2245 fHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;) 2246 matchval = RtV & 0x7; 2247 oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2248 idx = VuV.ub[i]; 2249 idx = (idx&0x1F) | (matchval<<5); 2250 VdV.b[i] = fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)])) 2251 2252 ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_nm,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8):nomatch","vector-vector table lookup", 2253 fHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;) 2254 matchval = RtV & 0xF; 2255 oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2256 idx = fGETUBYTE(0,VuV.uh[i]); 2257 idx = (idx&0x0F) | (matchval<<4); 2258 VddV.v[0].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]); 2259 idx = fGETUBYTE(1,VuV.uh[i]); 2260 idx = (idx&0x0F) | (matchval<<4); 2261 VddV.v[1].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)])) 2262 2263 2264 2265 2266 /****************************************************************************** 2267 NON LINEAR - V65 2268 ******************************************************************************/ 2269 2270 ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpahhsat,"Vx32.h=vmpa(Vx32.h,Vu32.h,Rtt32.h):sat","piecewise linear approximation", 2271 VxV.h[i]= fVSATH( ( ( fMPY16SS(VxV.h[i],VuV.h[i])<<1) + (fGETHALF(( (VuV.h[i]>>14)&0x3), RttV )<<15))>>16)) 2272 2273 2274 ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpauhuhsat,"Vx32.h=vmpa(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation", 2275 VxV.h[i]= fVSATH( ( fMPY16SU(VxV.h[i],VuV.uh[i]) + (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16)) 2276 2277 ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpsuhuhsat,"Vx32.h=vmps(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation", 2278 VxV.h[i]= fVSATH( ( fMPY16SU(VxV.h[i],VuV.uh[i]) - (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16)) 2279 2280 2281 ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vlut4,"Vd32.h=vlut4(Vu32.uh,Rtt32.h)","4 entry lookup table", 2282 VdV.h[i]= fGETHALF( ((VuV.h[i]>>14)&0x3), RttV )) 2283 2284 2285 2286 /****************************************************************************** 2287 V65 2288 ******************************************************************************/ 2289 2290 ITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe,"Vd32.uw=vmpye(Vu32.uh,Rt32.uh)", 2291 "Vector even halfword unsigned multiply by scalar", 2292 VdV.uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV))) 2293 2294 2295 ITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe_acc,"Vx32.uw+=vmpye(Vu32.uh,Rt32.uh)", 2296 "Vector even halfword unsigned multiply by scalar", 2297 VxV.uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV))) 2298 2299 2300 2301 2302 EXTINSN(V6_vgathermw, "vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words", 2303 { 2304 fHIDE(int i;) 2305 fHIDE(int element_size = 4;) 2306 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2307 fVLASTBYTE(MuV, element_size); 2308 fVALIGN(RtV, element_size); 2309 fVFOREACH(32, i) { 2310 EA = RtV+VvV.uw[i]; 2311 fVLOG_VTCM_GATHER_WORD(EA, VvV.uw[i], i,MuV); 2312 } 2313 fGATHER_FINISH() 2314 }) 2315 EXTINSN(V6_vgathermh, "vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2316 { 2317 fHIDE(int i;) 2318 fHIDE(int element_size = 2;) 2319 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2320 fVLASTBYTE(MuV, element_size); 2321 fVALIGN(RtV, element_size); 2322 fVFOREACH(16, i) { 2323 EA = RtV+VvV.uh[i]; 2324 fVLOG_VTCM_GATHER_HALFWORD(EA, VvV.uh[i], i,MuV); 2325 } 2326 fGATHER_FINISH() 2327 }) 2328 2329 2330 2331 EXTINSN(V6_vgathermhw, "vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2332 { 2333 fHIDE(int i;) 2334 fHIDE(int j;) 2335 fHIDE(int element_size = 2;) 2336 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2337 fVLASTBYTE(MuV, element_size); 2338 fVALIGN(RtV, element_size); 2339 fVFOREACH(32, i) { 2340 for(j = 0; j < 2; j++) { 2341 EA = RtV+VvvV.v[j].uw[i]; 2342 fVLOG_VTCM_GATHER_HALFWORD_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,MuV); 2343 } 2344 } 2345 fGATHER_FINISH() 2346 }) 2347 2348 2349 EXTINSN(V6_vgathermwq, "if (Qs4) vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words", 2350 { 2351 fHIDE(int i;) 2352 fHIDE(int element_size = 4;) 2353 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2354 fVLASTBYTE(MuV, element_size); 2355 fVALIGN(RtV, element_size); 2356 fVFOREACH(32, i) { 2357 EA = RtV+VvV.uw[i]; 2358 fVLOG_VTCM_GATHER_WORDQ(EA, VvV.uw[i], i,QsV,MuV); 2359 } 2360 fGATHER_FINISH() 2361 }) 2362 EXTINSN(V6_vgathermhq, "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2363 { 2364 fHIDE(int i;) 2365 fHIDE(int element_size = 2;) 2366 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2367 fVLASTBYTE(MuV, element_size); 2368 fVALIGN(RtV, element_size); 2369 fVFOREACH(16, i) { 2370 EA = RtV+VvV.uh[i]; 2371 fVLOG_VTCM_GATHER_HALFWORDQ(EA, VvV.uh[i], i,QsV,MuV); 2372 } 2373 fGATHER_FINISH() 2374 }) 2375 2376 2377 2378 EXTINSN(V6_vgathermhwq, "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2379 { 2380 fHIDE(int i;) 2381 fHIDE(int j;) 2382 fHIDE(int element_size = 2;) 2383 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2384 fVLASTBYTE(MuV, element_size); 2385 fVALIGN(RtV, element_size); 2386 fVFOREACH(32, i) { 2387 for(j = 0; j < 2; j++) { 2388 EA = RtV+VvvV.v[j].uw[i]; 2389 fVLOG_VTCM_GATHER_HALFWORDQ_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,QsV,MuV); 2390 } 2391 } 2392 fGATHER_FINISH() 2393 }) 2394 2395 2396 2397 EXTINSN(V6_vscattermw , "vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words", 2398 { 2399 fHIDE(int i;) 2400 fHIDE(int element_size = 4;) 2401 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2402 fVLASTBYTE(MuV, element_size); 2403 fVALIGN(RtV, element_size); 2404 fVFOREACH(32, i) { 2405 EA = RtV+VvV.uw[i]; 2406 fVLOG_VTCM_WORD(EA, VvV.uw[i], VwV,i,MuV); 2407 } 2408 fSCATTER_FINISH(0) 2409 }) 2410 2411 2412 2413 EXTINSN(V6_vscattermh , "vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfWords", 2414 { 2415 fHIDE(int i;) 2416 fHIDE(int element_size = 2;) 2417 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2418 fVLASTBYTE(MuV, element_size); 2419 fVALIGN(RtV, element_size); 2420 fVFOREACH(16, i) { 2421 EA = RtV+VvV.uh[i]; 2422 fVLOG_VTCM_HALFWORD(EA,VvV.uh[i],VwV,i,MuV); 2423 } 2424 fSCATTER_FINISH(0) 2425 }) 2426 2427 2428 EXTINSN(V6_vscattermw_add, "vscatter(Rt32,Mu2,Vv32.w).w+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words-Add", 2429 { 2430 fHIDE(int i;) 2431 fHIDE(int ALIGNMENT=4;) 2432 fHIDE(int element_size = 4;) 2433 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2434 fVLASTBYTE(MuV, element_size); 2435 fVALIGN(RtV, element_size); 2436 fVFOREACH(32, i) { 2437 EA = (RtV+fVALIGN(VvV.uw[i],ALIGNMENT)); 2438 fVLOG_VTCM_WORD_INCREMENT(EA,VvV.uw[i],VwV,i,ALIGNMENT,MuV); 2439 } 2440 fHIDE(fLOG_SCATTER_OP(4);) 2441 fSCATTER_FINISH(1) 2442 }) 2443 2444 EXTINSN(V6_vscattermh_add, "vscatter(Rt32,Mu2,Vv32.h).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfword-Add", 2445 { 2446 fHIDE(int i;) 2447 fHIDE(int ALIGNMENT=2;) 2448 fHIDE(int element_size = 2;) 2449 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2450 fVLASTBYTE(MuV, element_size); 2451 fVALIGN(RtV, element_size); 2452 fVFOREACH(16, i) { 2453 EA = (RtV+fVALIGN(VvV.uh[i],ALIGNMENT)); 2454 fVLOG_VTCM_HALFWORD_INCREMENT(EA,VvV.uh[i],VwV,i,ALIGNMENT,MuV); 2455 } 2456 fHIDE(fLOG_SCATTER_OP(2);) 2457 fSCATTER_FINISH(1) 2458 }) 2459 2460 2461 EXTINSN(V6_vscattermwq, "if (Qs4) vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words conditional", 2462 { 2463 fHIDE(int i;) 2464 fHIDE(int element_size = 4;) 2465 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2466 fVLASTBYTE(MuV, element_size); 2467 fVALIGN(RtV, element_size); 2468 fVFOREACH(32, i) { 2469 EA = RtV+VvV.uw[i]; 2470 fVLOG_VTCM_WORDQ(EA,VvV.uw[i], VwV,i,QsV,MuV); 2471 } 2472 fSCATTER_FINISH(0) 2473 }) 2474 2475 EXTINSN(V6_vscattermhq, "if (Qs4) vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter HalfWords conditional", 2476 { 2477 fHIDE(int i;) 2478 fHIDE(int element_size = 2;) 2479 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2480 fVLASTBYTE(MuV, element_size); 2481 fVALIGN(RtV, element_size); 2482 fVFOREACH(16, i) { 2483 EA = RtV+VvV.uh[i]; 2484 fVLOG_VTCM_HALFWORDQ(EA,VvV.uh[i],VwV,i,QsV,MuV); 2485 } 2486 fSCATTER_FINISH(0) 2487 }) 2488 2489 2490 2491 2492 EXTINSN(V6_vscattermhw , "vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter Words", 2493 { 2494 fHIDE(int i;) 2495 fHIDE(int j;) 2496 fHIDE(int element_size = 2;) 2497 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2498 fVLASTBYTE(MuV, element_size); 2499 fVALIGN(RtV, element_size); 2500 fVFOREACH(32, i) { 2501 for(j = 0; j < 2; j++) { 2502 EA = RtV+VvvV.v[j].uw[i]; 2503 fVLOG_VTCM_HALFWORD_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,MuV); 2504 } 2505 } 2506 fSCATTER_FINISH(0) 2507 }) 2508 2509 2510 2511 EXTINSN(V6_vscattermhwq, "if (Qs4) vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords conditional", 2512 { 2513 fHIDE(int i;) 2514 fHIDE(int j;) 2515 fHIDE(int element_size = 2;) 2516 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2517 fVLASTBYTE(MuV, element_size); 2518 fVALIGN(RtV, element_size); 2519 fVFOREACH(32, i) { 2520 for(j = 0; j < 2; j++) { 2521 EA = RtV+VvvV.v[j].uw[i]; 2522 fVLOG_VTCM_HALFWORDQ_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),QsV,i,j,MuV); 2523 } 2524 } 2525 fSCATTER_FINISH(0) 2526 }) 2527 2528 EXTINSN(V6_vscattermhw_add, "vscatter(Rt32,Mu2,Vvv32.w).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords-add", 2529 { 2530 fHIDE(int i;) 2531 fHIDE(int j;) 2532 fHIDE(int ALIGNMENT=2;) 2533 fHIDE(int element_size = 2;) 2534 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2535 fVLASTBYTE(MuV, element_size); 2536 fVALIGN(RtV, element_size); 2537 fVFOREACH(32, i) { 2538 for(j = 0; j < 2; j++) { 2539 EA = RtV + fVALIGN(VvvV.v[j].uw[i],ALIGNMENT);; 2540 fVLOG_VTCM_HALFWORD_INCREMENT_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,ALIGNMENT,MuV); 2541 } 2542 } 2543 fHIDE(fLOG_SCATTER_OP(2);) 2544 fSCATTER_FINISH(1) 2545 }) 2546 2547 EXTINSN(V6_vprefixqb,"Vd32.b=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum of Q into byte", 2548 { 2549 fHIDE(int i;) 2550 fHIDE(size1u_t acc = 0;) 2551 fVFOREACH(8, i) { 2552 acc += fGETQBIT(QvV,i); 2553 VdV.ub[i] = acc; 2554 } 2555 } ) 2556 EXTINSN(V6_vprefixqh,"Vd32.h=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum of Q into halfwords", 2557 { 2558 fHIDE(int i;) 2559 fHIDE(size2u_t acc = 0;) 2560 fVFOREACH(16, i) { 2561 acc += fGETQBIT(QvV,i*2+0); 2562 acc += fGETQBIT(QvV,i*2+1); 2563 VdV.uh[i] = acc; 2564 } 2565 } ) 2566 EXTINSN(V6_vprefixqw,"Vd32.w=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum of Q into words", 2567 { 2568 fHIDE(int i;) 2569 fHIDE(size4u_t acc = 0;) 2570 fVFOREACH(32, i) { 2571 acc += fGETQBIT(QvV,i*4+0); 2572 acc += fGETQBIT(QvV,i*4+1); 2573 acc += fGETQBIT(QvV,i*4+2); 2574 acc += fGETQBIT(QvV,i*4+3); 2575 VdV.uw[i] = acc; 2576 } 2577 } ) 2578 2579 2580 2581 2582 2583 /****************************************************************************** 2584 DEBUG Vector/Register Printing 2585 ******************************************************************************/ 2586 2587 #define PRINT_VU(TYPE, TYPE2, COUNT)\ 2588 int i; \ 2589 size4u_t vec_len = fVBYTES();\ 2590 fprintf(stdout,"V%2d: ",VuN); \ 2591 for (i=0;i<vec_len>>COUNT;i++) { \ 2592 fprintf(stdout,TYPE2 " ", VuV.TYPE[i]); \ 2593 }; \ 2594 fprintf(stdout,"\\n"); \ 2595 fflush(stdout);\ 2596 2597 #undef ATTR_VMEM 2598 #undef ATTR_VMEMU 2599 #undef ATTR_VMEM_NT 2600 2601 #endif /* NO_MMVEC */ 2602 2603 #ifdef __SELF_DEF_EXTINSN 2604 #undef EXTINSN 2605 #undef __SELF_DEF_EXTINSN 2606 #endif