I'm trying to optimise an inner loop of the form
{ sum0 += x0*y0;
sum1 += x1*y0;
x0 = *xP++;
y1 = *yP++;
sum0 += x1* y1;
sum1 += x0*y1;
x1 = *xP++;
y0 = *yP++;
}
which ought to map directly onto two Hexagon packets.
The multiply and accumulate has an intrinsic Q6_R_mpyiacc_RR, but I have not found an intrinsic for r0 = memw(r1++#4).
Without it, the compiler wants to generate memory access code of the form: xP+=2; x0 = xP[-2]; x1 = xP[-1], which stops the code fitting into two packets.
I have succeeded in forcing post increment addressing mode by inserting asm(" " : "+r"(xP), "+r"(yP)) before the sum0 accumulations to keep the compiler honest by pretending to use the pointer values, but this has thrown up what I believe to be a compiler bug. It seems to ignore the "+" modifier, treating it as an "=" and so never initialising the register for the pointer.
I am using SDK 3.2 with hexagon tools 8.0.10 on linux.
Illustrative source code (with only one asm, so it doesn't get the addressing right, but it still shows the bug):
#include <hexagon_protos.h>
#include <stdio.h>
// Control version of the kernel: same loop as brokenAsm() but with no inline asm.
//
// With x = xs and y = ys, for n >= 0 it returns sum0 + sum1 where
//   sum0 = x[0]*y[0] + x[1]*y[1] + ... + x[2n]*y[2n]
//   sum1 = x[1]*y[0] + x[2]*y[1] + ... + x[2n]*y[2n-1]
// It reads xs[0..2n+1] and ys[0..2n], so the caller must supply at least
// 2n+2 readable ints in xs and 2n+1 in ys. For n <= 0 the loop is skipped
// and the result is x[0]*y[0].
//
// noinline keeps the function out of line so its code can be inspected
// separately in the disassembly.
int __attribute__ ((noinline)) works(int *xs, int *ys, int n)
{
int *xP = xs, *yP = ys;
int sum0 = 0, sum1 = 0;
// Preload the operands the first half-iteration consumes.
int x0 = *xP++;
int y0 = *yP++;
int x1 = *xP++;
int y1;
// Ask the compiler not to unroll, so the loop body stays as written.
#pragma nounroll
for (int i=0; i<n; i++)
{
// First half: both accumulators share y0; Q6_R_mpyiacc_RR(acc, a, b) is the
// multiply-accumulate intrinsic (acc + a*b).
sum0 = Q6_R_mpyiacc_RR(sum0, x0, y0);
sum1 = Q6_R_mpyiacc_RR(sum1, x1, y0);
// Load the operands for the second half while x1 is still live.
x0 = *xP++;
y1 = *yP++;
// Second half: both accumulators share y1.
sum0 = Q6_R_mpyiacc_RR(sum0, x1, y1);
sum1 = Q6_R_mpyiacc_RR(sum1, x0, y1);
// Load the operands for the next iteration's first half.
x1 = *xP++;
y0 = *yP++;
}
// One trailing sum0 term from the x0/y0 pair loaded last (plain C, no intrinsic).
sum0 += x0*y0;
return sum0+sum1;
}
// Bug reproducer: identical to works() except for the empty inline asm at the
// top of each loop iteration.
//
// At source level it computes the same value as works(): with x = xs and
// y = ys, for n >= 0 it returns
//   (x[0]*y[0] + ... + x[2n]*y[2n]) + (x[1]*y[0] + ... + x[2n]*y[2n-1])
// and reads xs[0..2n+1] and ys[0..2n].
//
// NOTE(review): as reported with Hexagon tools 8.0.10 (hexagon-clang++ -O3 -mv5),
// the generated code addresses xs through a register that is never initialised,
// so the result can differ from works().
int __attribute__ ((noinline)) brokenAsm(int *xs, int *ys, int n)
{
int *xP = xs, *yP = ys;
int sum0 = 0, sum1 = 0;
// Preload the operands the first half-iteration consumes.
int x0 = *xP++;
int y0 = *yP++;
int x1 = *xP++;
int y1;
// Ask the compiler not to unroll, so the loop body stays as written.
#pragma nounroll
for (int i=0; i<n; i++)
{
// Empty asm with "+r" (read-write) operands: emits no instructions but tells
// the compiler xP and yP are both read and modified here. The intent is to
// force the pointers to be materialised so loads use post-increment addressing.
asm("" : "+r"(xP), "+r"(yP));
// First half: both accumulators share y0; Q6_R_mpyiacc_RR(acc, a, b) is the
// multiply-accumulate intrinsic (acc + a*b).
sum0 = Q6_R_mpyiacc_RR(sum0, x0, y0);
sum1 = Q6_R_mpyiacc_RR(sum1, x1, y0);
// Load the operands for the second half while x1 is still live.
x0 = *xP++;
y1 = *yP++;
// Second half: both accumulators share y1.
sum0 = Q6_R_mpyiacc_RR(sum0, x1, y1);
sum1 = Q6_R_mpyiacc_RR(sum1, x0, y1);
// Load the operands for the next iteration's first half.
x1 = *xP++;
y0 = *yP++;
}
// One trailing sum0 term from the x0/y0 pair loaded last (plain C, no intrinsic).
sum0 += x0*y0;
return sum0+sum1;
}
// Test driver: fills xs and ys with 0..99, then for every n in 10..39 prints
// the result of works() next to the result of brokenAsm() on one line.
// The two columns match when brokenAsm() is compiled correctly.
int main(int argc, const char *argv[])
{
    int xs[100], ys[100];

    // Both inputs hold the ramp 0, 1, 2, ..., 99.
    int idx = 0;
    while (idx < 100) {
        xs[idx] = idx;
        ys[idx] = idx;
        ++idx;
    }

    // The largest n (39) reads up to xs[79], well inside the 100-element arrays.
    for (int n = 10; n != 40; ++n) {
        const int reference = works(xs, ys, n);
        const int candidate = brokenAsm(xs, ys, n);
        printf ("%d %d\n", reference, candidate);
    }
}
Compilation command line
hexagon-clang++ -O3 -mv5 -G 0 -std=c++98 -g hexAsmBug.cpp -o hexAsmBug
Offending disassembly of brokenAsm()
_Z9brokenAsmPiS_i:
5140: 03 40 00 78 78004003 { r3 = #0
5144: 01 40 42 75 75424001 p1 = cmp.gt(r2, #0)
5148: 0c 40 81 91 9181400c r12 = memw(r1 + #0)
514c: 04 c0 80 91 9180c004 r4 = memw(r0 + #0) }
5150: 86 40 21 74 74214086 { if (p1) r6 = add(r1, #4)
5154: 05 41 20 74 74204105 if (p1) r5 = add(r0, #8)
5158: 05 40 a0 7e 7ea04005 if (!p1) r5 = #0
515c: 03 c0 20 7e 7e20c003 if (p1) r3 = #0 }
5160: 36 41 20 5c 5c204136 { if (!p1) jump:nt 0x51cc <_Z9brokenAsmPiS_i+0x8C>
5164: 28 c8 80 41 4180c828 if (p1) r8 = memw(r0 + #4) }
5168: ed 7f e2 bf bfe27fed { r13 = add(r2, #-1)
516c: 20 40 82 75 75824020 p0 = cmp.gtu(r2, #1)
5170: 72 00 05 48 48050072 r5 = #0; r2 = memw(r7 + #0) }
5174: 05 4c 04 ef ef044c05 { r5 += mpyi(r4, r12)
5178: 03 4c 08 ef ef084c03 r3 += mpyi(r8, r12)
517c: 89 40 07 b0 b0074089 r9 = add(r7, #4)
5180: 04 c0 86 91 9186c004 r4 = memw(r6 + #0) }
5184: 18 40 0d 60 600d4018 { loop0(0x5190, r13)
5188: 1e 40 20 5c 5c20401e if (!p0) jump:nt 0x51c0 <_Z9brokenAsmPiS_i+0x80>
518c: 00 c0 00 7f 7f00c000 nop }
5190: 05 42 08 ef ef084205 { r5 += mpyi(r8, r2)
5194: 03 42 04 ef ef044203 r3 += mpyi(r4, r2)
5198: 2c 40 87 91 9187402c r12 = memw(r7 + #4)
519c: 28 c0 86 91 9186c028 r8 = memw(r6 + #4) }
51a0: 03 4c 08 ef ef084c03 { r3 += mpyi(r8, r12)
51a4: 05 4c 04 ef ef044c05 r5 += mpyi(r4, r12)
51a8: 06 41 06 b0 b0064106 r6 = add(r6, #8)
51ac: 87 c0 09 b0 b009c087 r7 = add(r9, #4) }
51b0: 89 80 07 b0 b0078089 { r9 = add(r7, #4)
51b4: 00 40 00 7f 7f004000 nop
51b8: 72 00 64 00 00640072 r4 = memw(r6 + #0); r2 = memw(r7 + #0) } :endloop0
51bc: 00 c0 00 7f 7f00c000 { nop }
51c0: 03 42 04 ef ef044203 { r3 += mpyi(r4, r2)
51c4: 05 42 08 ef ef084205 r5 += mpyi(r8, r2)
51c8: 2c c0 87 91 9187c02c r12 = memw(r7 + #4) }
51cc: 00 c4 0c ed ed0cc400 { r0 = mpyi(r12, r4) }
51d0: 20 45 03 ef ef034520 { r0 += add(r3, r5)
51d4: 00 40 9f 52 529f4000 jumpr r31
51d8: 00 40 00 7f 7f004000 nop
51dc: 00 c0 00 7f 7f00c000 nop }
r5 and r6 are initialised from the parameters xs and ys respectively, but r6 and r7 are the registers actually used to access memory, and r7 is never initialised.