#elif defined(ATL_GAS_PPC) && !defined(ATL_ARCH_POWER4)
#if defined(__GNUC__) || defined(__IBM_GCC_ASM)
/*
 * Inline-asm prefetch macros for compilers that accept GNU-style __asm__.
 * NOTE(review): this text is in diff form -- a line starting with '-' is the
 * previous version of an instruction and the '+' line that follows it is the
 * replacement.  Every replacement here drops the third operand of dcbt.
 */
/* ATL_pfl1R(mem): issue dcbt (data cache block touch) on the address mem. */
#define ATL_pfl1R(mem) \
- __asm__ __volatile__ ("dcbt 0, %0, 0" : : "r" ((mem)))
+ __asm__ __volatile__ ("dcbt 0, %0" : : "r" ((mem)))
/* ATL_pfl1W(mem): issue dcbtst (touch for store) on the address mem. */
#define ATL_pfl1W(mem) \
__asm__ __volatile__ ("dcbtst 0, %0" : : "r" ((mem)))
/*
 * ATL_pfST(mem): previously dcbt with third operand 1; the replacement emits
 * the same instruction text as ATL_pfl1R.
 * NOTE(review): confirm that losing the distinct hint value is intended.
 */
#define ATL_pfST(mem) \
- __asm__ __volatile__ ("dcbt 0, %0, 1" : : "r" ((mem)))
+ __asm__ __volatile__ ("dcbt 0, %0" : : "r" ((mem)))
/*
 * ATL_pfl1STi(mem, str): rewrites mem in place (output "=r" tied to input
 * "0"): rlwinm keeps mask bits 0-24, i.e. clears the low 7 bits, then ori
 * sets the low bits to 96+str (str must be a compile-time immediate, "i").
 * The resulting value is handed to dcbt.
 * NOTE(review): the previous form passed 8 as dcbt's third operand; without
 * it the modified value is presumably treated as a plain address -- confirm.
 */
#define ATL_pfl1STi(mem, str) \
__asm__ __volatile__ ("rlwinm %0, %0, 0, 0, 24\n\t" \
"ori %0, %0, 96+%2\n\t" \
- "dcbt 0, %0, 8" \
+ "dcbt 0, %0" \
: "=r" (mem) \
: "0" (mem), "i" (str))
#endif
/*
 * AltiVec kernel fragments in diff form ('-' = previous line, '+' = its
 * replacement).  Each fragment places one dcbt on a prefetch pointer (pfA or
 * pfB) between vmaddfp multiply-adds and then advances that pointer by 64.
 * The only change is dropping dcbt's third operand.
 * NOTE(review): the stray #endif lines close conditionals opened outside this
 * view; the fragments appear to come from separate hunks.
 */
vmaddfp vC01, vA0, vB1, vC33
vmaddfp vC11, vA1, vB1, vC33
- dcbt 0, pfA, 0
+ dcbt 0, pfA
vmaddfp vC21, vA2, vB1, vC33
addi pfA, pfA, 64
vmaddfp vC31, vA3, vB1, vC33
#endif
vmaddfp vC02, va0, vb2, vC02
vmaddfp vC12, va1, vb2, vC12
- dcbt 0, pfB, 0
+ dcbt 0, pfB
vmaddfp vC22, va2, vb2, vC22
addi pfB, pfB, 64
vmaddfp vC32, va3, vb2, vC32
#endif
vmaddfp vC01, vA0, vB1, vC33
vmaddfp vC11, vA1, vB1, vC33
- dcbt 0, pfA, 0
+ dcbt 0, pfA
vmaddfp vC21, vA2, vB1, vC33
addi pfA, pfA, 64
vmaddfp vC31, vA3, vB1, vC33
#endif
vmaddfp vC02, va0, vb2, vC02
vmaddfp vC12, va1, vb2, vC12
- dcbt 0, pfB, 0
+ dcbt 0, pfB
vmaddfp vC22, va2, vb2, vC22
addi pfB, pfB, 64
vmaddfp vC32, va3, vb2, vC32
/*
 * Scalar floating-point kernel fragments in diff form ('-' = previous line,
 * '+' = its replacement).  dcbt touches are issued on pfA and/or pfB among
 * the fmadd/fmul instructions, and each touched pointer is then advanced by
 * 128.  The only change is dropping dcbt's third operand.
 * NOTE(review): fragments appear to come from separate hunks of a larger
 * routine, so adjacent lines are not necessarily contiguous in the real file.
 */
fmadd rC00, rA0, rB0, rC00
lfd rb3, 8+KB3*8(pB0)
fmadd rC10, rA1, rB0, rC10
- dcbt 0, pfB, 0
+ dcbt 0, pfB
addi pfB, pfB, 128
fmadd rC20, rA2, rB0, rC20
fmadd rC30, rA3, rB0, rC30
#if KB > 1
fmadd rC00, ra0, rb0, rC00
fmadd rC10, ra1, rb0, rC10
- dcbt 0, pfA, 0
+ dcbt 0, pfA
addi pfA, pfA, 128
fmadd rC20, ra2, rb0, rC20
fmadd rC30, ra3, rb0, rC30
fmul rC11, rA1, rB1
fmul rC21, rA2, rB1
fmul rC31, rA3, rB1
- dcbt 0, pfA, 0
+ dcbt 0, pfA
addi pfA, pfA, 128
- dcbt 0, pfB, 0
+ dcbt 0, pfB
addi pfB, pfB, 128
fmul rC02, rA0, rB2
fmul rC12, rA1, rB2
fmadd rC12, rA1, rB2, rC12
fmadd rC22, rA2, rB2, rC22
fmadd rC32, rA3, rB2, rC32
- dcbt 0, pfA, 0
+ dcbt 0, pfA
addi pfA, pfA, 128
- dcbt 0, pfB, 0
+ dcbt 0, pfB
addi pfB, pfB, 128
fmadd rC03, rA0, rB3, rC03
fmadd rC13, rA1, rB3, rC13
fmadd rC10, rA1, rB0, rC10
fmadd rC20, rA2, rB0, rC20
fmadd rC30, rA3, rB0, rC30
- dcbt 0, pfA, 0
- dcbt 0, pfB, 0
+ dcbt 0, pfA
+ dcbt 0, pfB
addi pfA, pfA, 128
addi pfB, pfB, 128
fmadd rC01, rA0, rB1, rC01
fmul rC11, rA1, rB1
fmul rC21, rA2, rB1
fmul rC31, rA3, rB1
- dcbt 0, pfA, 0
+ dcbt 0, pfA
addi pfA, pfA, 128
- dcbt 0, pfB, 0
+ dcbt 0, pfB
addi pfB, pfB, 128
fmul rC02, rA0, rB2
fmul rC12, rA1, rB2
fmadd rC12, rA1, rB2, rC12
fmadd rC22, rA2, rB2, rC22
fmadd rC32, rA3, rB2, rC32
- dcbt 0, pfA, 0
+ dcbt 0, pfA
addi pfA, pfA, 128
- dcbt 0, pfB, 0
+ dcbt 0, pfB
addi pfB, pfB, 128
fmadd rC03, rA0, rB3, rC03
fmadd rC13, rA1, rB3, rC13
fmadd rC10, rA1, rB0, rC10
fmadd rC20, rA2, rB0, rC20
fmadd rC30, rA3, rB0, rC30
- dcbt 0, pfA, 0
- dcbt 0, pfB, 0
+ dcbt 0, pfA
+ dcbt 0, pfB
addi pfA, pfA, 128
addi pfB, pfB, 128
fmadd rC01, rA0, rB1, rC01
/*
 * Fallback used only when ATL_GOT_L1PREFETCH is not defined and the compiler
 * defines _ARCH_PPC: discard any earlier ATL_pfl1R and redefine it as a
 * braced inline-asm dcbt on mem.  Diff form: '-' is the previous definition,
 * '+' is the replacement without dcbt's third operand.
 */
#ifndef ATL_GOT_L1PREFETCH
#ifdef _ARCH_PPC
#undef ATL_pfl1R
-#define ATL_pfl1R(mem) { __asm__ volatile ("dcbt 0, %0, 0" : : "r" ((mem))); }
+#define ATL_pfl1R(mem) { __asm__ volatile ("dcbt 0, %0" : : "r" ((mem))); }
#endif
#endif
#endif
/*
 * Further AltiVec kernel fragments in diff form ('-' = previous line, '+' =
 * its replacement).  Same pattern throughout: one dcbt on pfA or pfB placed
 * between vmaddfp multiply-adds, followed by a 64-byte advance of that
 * pointer.  Two fragments instead touch pBETA, and only when BETAX is
 * defined; pBETA is not advanced in the lines shown.
 * NOTE(review): the stray #endif lines close conditionals opened outside this
 * view; the fragments appear to come from separate hunks.
 */
vmaddfp vC01, vA0, vB1, vC33
vmaddfp vC11, vA1, vB1, vC33
- dcbt 0, pfA, 0
+ dcbt 0, pfA
vmaddfp vC21, vA2, vB1, vC33
addi pfA, pfA, 64
vmaddfp vC31, vA3, vB1, vC33
#endif
vmaddfp vC02, va0, vb2, vC02
vmaddfp vC12, va1, vb2, vC12
- dcbt 0, pfB, 0
+ dcbt 0, pfB
vmaddfp vC22, va2, vb2, vC22
addi pfB, pfB, 64
vmaddfp vC32, va3, vb2, vC32
#endif
vmaddfp vC01, vA0, vB1, vC01
#ifdef BETAX
- dcbt 0, pBETA, 0
+ dcbt 0, pBETA
#endif
vmaddfp vC11, vA1, vB1, vC11
vmaddfp vC21, vA2, vB1, vC21
#endif
vmaddfp vC01, vA0, vB1, vC33
vmaddfp vC11, vA1, vB1, vC33
- dcbt 0, pfA, 0
+ dcbt 0, pfA
vmaddfp vC21, vA2, vB1, vC33
addi pfA, pfA, 64
vmaddfp vC31, vA3, vB1, vC33
#endif
vmaddfp vC02, va0, vb2, vC02
vmaddfp vC12, va1, vb2, vC12
- dcbt 0, pfB, 0
+ dcbt 0, pfB
vmaddfp vC22, va2, vb2, vC22
addi pfB, pfB, 64
vmaddfp vC32, va3, vb2, vC32
#endif
vmaddfp vC01, vA0, vB1, vC33
vmaddfp vC11, vA1, vB1, vC33
- dcbt 0, pfA, 0
+ dcbt 0, pfA
vmaddfp vC21, vA2, vB1, vC33
addi pfA, pfA, 64
vmaddfp vC31, vA3, vB1, vC33
#endif
vmaddfp vC02, va0, vb2, vC02
vmaddfp vC12, va1, vb2, vC12
- dcbt 0, pfB, 0
+ dcbt 0, pfB
vmaddfp vC22, va2, vb2, vC22
addi pfB, pfB, 64
vmaddfp vC32, va3, vb2, vC32
#endif
vmaddfp vC01, vA0, vB1, vC01
#ifdef BETAX
- dcbt 0, pBETA, 0
+ dcbt 0, pBETA
#endif
vmaddfp vC11, vA1, vB1, vC11
vmaddfp vC21, vA2, vB1, vC21
#endif
vmaddfp vC01, vA0, vB1, vC33
vmaddfp vC11, vA1, vB1, vC33
- dcbt 0, pfA, 0
+ dcbt 0, pfA
vmaddfp vC21, vA2, vB1, vC33
addi pfA, pfA, 64
vmaddfp vC31, vA3, vB1, vC33
#endif
vmaddfp vC02, va0, vb2, vC02
vmaddfp vC12, va1, vb2, vC12
- dcbt 0, pfB, 0
+ dcbt 0, pfB
vmaddfp vC22, va2, vb2, vC22
addi pfB, pfB, 64
vmaddfp vC32, va3, vb2, vC32