Forums - hexagon-clang++ inline assembly bug

1 post / 0 new
hexagon-clang++ inline assembly bug
malcolm
Join Date: 17 Mar 17
Posts: 1
Posted: Wed, 2017-04-26 10:12

I'm trying to optimise an inner loop of the form

{       sum0 += x0*y0;
        sum1 += x1*y0;
        x0 = *xP++;
        y1 = *yP++;
        sum0 += x1* y1;
        sum1 += x0*y1;
        x1 = *xP++;
        y0 = *yP++;

}
which ought to map directly onto two Hexagon packets.

The multiply and accumulate has an intrinsic Q6_R_mpyiacc_RR, but I have not found an intrinsic for r0 = memw(r1++#4).

Without it, the compiler wants to generate memory access code of the form: xP+=2; x0 = xP[-2]; x1 = xP[-1], which stops the code fitting into two packets.

I have succeeded in forcing post increment addressing mode by inserting asm(" " : "+r"(xP), "+r"(yP)) before the sum0 accumulations to keep the compiler honest by pretending to use the pointer values, but this has thrown up what I believe to be a compiler bug. It seems to ignore the "+" modifier, treating it as an "=" and so never initialising the register for the pointer.

I am using SDK 3.2 with hexagon tools 8.0.10 on linux.

Illustrative source code (with only one asm, so it doesn't get the addressing right, but it does show the bug):

#include <hexagon_protos.h>
#include <stdio.h>

// Reference version of the two-accumulator multiply-accumulate loop, written
// with plain post-increment loads and no inline asm.
//
// xs, ys: input arrays. Two x values and one y value are read before the
//         loop, and two of each are read in every iteration, so for n >= 0
//         the function reads xs[0 .. 2n+1] and ys[0 .. 2n].
// n:      number of loop iterations; the loop body does not run for n <= 0.
// Returns sum0 + sum1, after one final x0*y0 term has been added to sum0.
//
// NOTE(review): noinline presumably keeps this as a stand-alone function so
// its generated code can be inspected separately - confirm.
int __attribute__ ((noinline)) works(int *xs, int *ys, int n)
{
    int *xP = xs, *yP = ys;
    int sum0 = 0, sum1 = 0;
    // Prime the software pipeline: x0, x1 and y0 are loaded before the loop.
    int x0 = *xP++;
    int y0 = *yP++;
    int x1 = *xP++;
    int y1;     // first assigned inside the loop, before it is read
#pragma nounroll
    for (int i=0; i<n; i++)
    {
        // First half: accumulate the current x0/x1 against y0.
        // Q6_R_mpyiacc_RR is used here as a multiply-accumulate,
        // i.e. sum0 += x0*y0 and sum1 += x1*y0.
        sum0 = Q6_R_mpyiacc_RR(sum0, x0, y0);
        sum1 = Q6_R_mpyiacc_RR(sum1, x1, y0);
        // Fetch the next x and y; x1 is still the value loaded earlier.
        x0 = *xP++;
        y1 = *yP++;
        // Second half: the roles of x0 and x1 are swapped against y1.
        sum0 = Q6_R_mpyiacc_RR(sum0, x1, y1);
        sum1 = Q6_R_mpyiacc_RR(sum1, x0, y1);
        // Reload x1 and y0 for the next iteration (or for the tail below).
        x1 = *xP++;
        y0 = *yP++;
    }
    // Tail term using the last x0 and y0 that were loaded.
    sum0 += x0*y0;
    return sum0+sum1;
}

// Same computation as a plain multiply-accumulate loop over xs and ys, but
// with an empty inline asm at the top of each iteration that names the two
// running pointers as read-write register operands.
//
// xs, ys: input arrays. Two x values and one y value are read before the
//         loop, and two of each are read in every iteration, so for n >= 0
//         the function reads xs[0 .. 2n+1] and ys[0 .. 2n].
// n:      number of loop iterations; the loop body does not run for n <= 0.
// Returns sum0 + sum1, after one final x0*y0 term has been added to sum0.
//
// NOTE(review): this function is the reproducer for a suspected compiler
// bug (hexagon tools 8.0.10, -O3): the "+r" operands appear to be treated
// as write-only, leaving a pointer register uninitialised in the generated
// code. Do not reorder or "clean up" the statements below.
int __attribute__ ((noinline)) brokenAsm(int *xs, int *ys, int n)
{
    int *xP = xs, *yP = ys;
    int sum0 = 0, sum1 = 0;
    // Prime the software pipeline: x0, x1 and y0 are loaded before the loop.
    int x0 = *xP++;
    int y0 = *yP++;
    int x1 = *xP++;
    int y1;     // first assigned inside the loop, before it is read
#pragma nounroll
    for (int i=0; i<n; i++)
    {
        // Empty asm that claims to both read and modify xP and yP ("+r" is a
        // read-write register operand). It emits no instructions; its only
        // purpose is to make the compiler treat the pointer values as used
        // here, to encourage post-increment addressing for the loads below.
        asm("" : "+r"(xP), "+r"(yP));
        // First half: sum0 += x0*y0 and sum1 += x1*y0 via the
        // multiply-accumulate intrinsic.
        sum0 = Q6_R_mpyiacc_RR(sum0, x0, y0);
        sum1 = Q6_R_mpyiacc_RR(sum1, x1, y0);
        // Fetch the next x and y; x1 is still the value loaded earlier.
        x0 = *xP++;
        y1 = *yP++;
        // Second half: the roles of x0 and x1 are swapped against y1.
        sum0 = Q6_R_mpyiacc_RR(sum0, x1, y1);
        sum1 = Q6_R_mpyiacc_RR(sum1, x0, y1);
        // Reload x1 and y0 for the next iteration (or for the tail below).
        x1 = *xP++;
        y0 = *yP++;
    }
    // Tail term using the last x0 and y0 that were loaded.
    sum0 += x0*y0;
    return sum0+sum1;
}


int main(int argc, const char *argv[])
{
    int xs[100], ys[100];
    for (int i=0; i<100; i++)
        xs[i] = ys[i] = i;
    for (int i=10; i<40; i++)
    {
        int t1 = works(xs, ys, i);
        int t2 = brokenAsm(xs, ys, i);
        printf ("%d %d\n", t1, t2);
    }
}

Compilation command line

hexagon-clang++ -O3 -mv5 -G 0 -std=c++98 -g hexAsmBug.cpp -o hexAsmBug

Offending disassembly of brokenAsm()

_Z9brokenAsmPiS_i:
    5140:    03 40 00 78 78004003 { r3 = #0
    5144:    01 40 42 75 75424001   p1 = cmp.gt(r2, #0)
    5148:    0c 40 81 91 9181400c   r12 = memw(r1 + #0)
    514c:    04 c0 80 91 9180c004   r4 = memw(r0 + #0) }
    5150:    86 40 21 74 74214086 { if (p1) r6 = add(r1, #4)
    5154:    05 41 20 74 74204105   if (p1) r5 = add(r0, #8)
    5158:    05 40 a0 7e 7ea04005   if (!p1) r5 = #0
    515c:    03 c0 20 7e 7e20c003   if (p1) r3 = #0 }
    5160:    36 41 20 5c 5c204136 { if (!p1) jump:nt 0x51cc <_Z9brokenAsmPiS_i+0x8C>
    5164:    28 c8 80 41 4180c828   if (p1) r8 = memw(r0 + #4) }
    5168:    ed 7f e2 bf bfe27fed { r13 = add(r2, #-1)
    516c:    20 40 82 75 75824020   p0 = cmp.gtu(r2, #1)
    5170:    72 00 05 48 48050072   r5 = #0; r2 = memw(r7 + #0) }
    5174:    05 4c 04 ef ef044c05 { r5 += mpyi(r4, r12)
    5178:    03 4c 08 ef ef084c03   r3 += mpyi(r8, r12)
    517c:    89 40 07 b0 b0074089   r9 = add(r7, #4)
    5180:    04 c0 86 91 9186c004   r4 = memw(r6 + #0) }
    5184:    18 40 0d 60 600d4018 { loop0(0x5190, r13)
    5188:    1e 40 20 5c 5c20401e   if (!p0) jump:nt 0x51c0 <_Z9brokenAsmPiS_i+0x80>
    518c:    00 c0 00 7f 7f00c000   nop }
    5190:    05 42 08 ef ef084205 { r5 += mpyi(r8, r2)
    5194:    03 42 04 ef ef044203   r3 += mpyi(r4, r2)
    5198:    2c 40 87 91 9187402c   r12 = memw(r7 + #4)
    519c:    28 c0 86 91 9186c028   r8 = memw(r6 + #4) }
    51a0:    03 4c 08 ef ef084c03 { r3 += mpyi(r8, r12)
    51a4:    05 4c 04 ef ef044c05   r5 += mpyi(r4, r12)
    51a8:    06 41 06 b0 b0064106   r6 = add(r6, #8)
    51ac:    87 c0 09 b0 b009c087   r7 = add(r9, #4) }
    51b0:    89 80 07 b0 b0078089 { r9 = add(r7, #4)
    51b4:    00 40 00 7f 7f004000   nop
    51b8:    72 00 64 00 00640072   r4 = memw(r6 + #0); r2 = memw(r7 + #0) } :endloop0
    51bc:    00 c0 00 7f 7f00c000 { nop }
    51c0:    03 42 04 ef ef044203 { r3 += mpyi(r4, r2)
    51c4:    05 42 08 ef ef084205   r5 += mpyi(r8, r2)
    51c8:    2c c0 87 91 9187c02c   r12 = memw(r7 + #4) }
    51cc:    00 c4 0c ed ed0cc400 { r0 = mpyi(r12, r4) }
    51d0:    20 45 03 ef ef034520 { r0 += add(r3, r5)
    51d4:    00 40 9f 52 529f4000   jumpr r31
    51d8:    00 40 00 7f 7f004000   nop
    51dc:    00 c0 00 7f 7f00c000   nop }
 

r5 and r6 are initialised from the parameters xs and ys (r5 = xs + 8, r6 = ys + 4), but the registers actually used to access memory in the loop are r6 and r7, and r7 is never initialised.

 

  • Up0
  • Down0

Opinions expressed in the content posted here are the personal opinions of the original authors, and do not necessarily reflect those of Qualcomm Incorporated or its subsidiaries (“Qualcomm”). The content is provided for informational purposes only and is not meant to be an endorsement or representation by Qualcomm or any other party. This site may also provide links or references to non-Qualcomm sites and resources. Qualcomm makes no representations, warranties, or other commitments whatsoever about any non-Qualcomm sites or third-party resources that may be referenced, accessible from, or linked to this site.