Commit 8909c060 authored by bRong Njam

Fix *SubScalar and *SubByScalar

parent 43558cf8
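
Reviewer note (not part of the commit): in the Go assembler the operand order of VEX instructions is reversed relative to Intel syntax, so VPSUBB (CX), Y0, Y1 computes Y1 = Y0 - mem(CX), while the two-instruction form VMOVDQU (CX), Y1 followed by VPSUBB Y0, Y1, Y1 computes Y1 = Y1 - Y0. The *AddScalar kernels below are only restructured to fold the unaligned load into the VPADD/VADD instruction, which cannot change their result; the *SubScalar and *SubByScalar kernels, however, swap the two forms between them, which reverses the direction of each subtraction. A minimal plain-Go sketch of the two directions, with hypothetical names and assuming (as the kernels suggest) that Y0 holds the broadcast scalar:

// Plain-Go reference for the two subtraction directions involved in this fix.
// The names are illustrative only; which exported kernel corresponds to which
// direction is exactly what the commit corrects.
package ref

// Load-then-subtract form (VMOVDQU (CX), Y1; VPSUBB Y0, Y1, Y1):
// element minus scalar, assuming Y0 is the broadcast scalar.
func elemMinusScalar(dst, v []int8, s int8) {
	for i := range v {
		dst[i] = v[i] - s
	}
}

// Memory-operand form (VPSUBB (CX), Y0, Y1):
// scalar minus element, under the same assumption about Y0.
func scalarMinusElem(dst, v []int8, s int8) {
	for i := range v {
		dst[i] = s - v[i]
	}
}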
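For orientation, every kernel in this diff shares the same block/tail shape: the block loop handles six YMM registers per iteration on AVX2 (6 x 32 = 192 bytes, so the thresholds 0xC0, 0x60, 0x30 and 0x18 are element counts for 1-, 2-, 4- and 8-byte elements) and twelve ZMM registers per iteration on AVX-512 (12 x 64 = 768 bytes, thresholds 0x300, 0x180, 0xC0 and 0x60), while the tail loop handles one register at a time; any leftover elements are presumably dealt with outside these hunks. A rough plain-Go sketch of that structure for the int8 AVX2 case, with a hypothetical name and no claim to match the generated code:

// Rough plain-Go skeleton of the block/tail loop shape used by the AVX2 int8
// kernels in this diff; purely illustrative.
package sketch

func addScalarInt8(dst, src []int8, s int8) {
	const vec = 32        // one YMM register holds 32 int8 elements
	const block = 6 * vec // 0xC0: six registers per block-loop iteration
	i := 0
	for len(src)-i >= block { // mirrors int8AddScalarBlockLoop
		for j := 0; j < block; j++ {
			dst[i+j] = src[i+j] + s
		}
		i += block
	}
	for len(src)-i >= vec { // mirrors int8AddScalarTailLoop
		for j := 0; j < vec; j++ {
			dst[i+j] = src[i+j] + s
		}
		i += vec
	}
	for ; i < len(src); i++ { // leftovers, handled elsewhere in the real code
		dst[i] = src[i] + s
	}
}
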
@@ -65,18 +65,12 @@ TEXT ·int8AddScalarAvx2Asm(SB), NOSPLIT, $0-56
int8AddScalarBlockLoop:
CMPQ BX, $0x000000c0
JL int8AddScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPADDB Y0, Y1, Y1
VPADDB Y0, Y2, Y2
VPADDB Y0, Y3, Y3
VPADDB Y0, Y4, Y4
VPADDB Y0, Y5, Y5
VPADDB Y0, Y6, Y6
VPADDB (CX), Y0, Y1
VPADDB 32(CX), Y0, Y2
VPADDB 64(CX), Y0, Y3
VPADDB 96(CX), Y0, Y4
VPADDB 128(CX), Y0, Y5
VPADDB 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -91,8 +85,7 @@ int8AddScalarBlockLoop:
int8AddScalarTailLoop:
CMPQ BX, $0x00000020
JL int8AddScalarDone
VMOVDQU (CX), Y1
VPADDB Y0, Y1, Y1
VPADDB (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -165,18 +158,12 @@ TEXT ·int16AddScalarAvx2Asm(SB), NOSPLIT, $0-56
int16AddScalarBlockLoop:
CMPQ BX, $0x00000060
JL int16AddScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPADDW Y0, Y1, Y1
VPADDW Y0, Y2, Y2
VPADDW Y0, Y3, Y3
VPADDW Y0, Y4, Y4
VPADDW Y0, Y5, Y5
VPADDW Y0, Y6, Y6
VPADDW (CX), Y0, Y1
VPADDW 32(CX), Y0, Y2
VPADDW 64(CX), Y0, Y3
VPADDW 96(CX), Y0, Y4
VPADDW 128(CX), Y0, Y5
VPADDW 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -191,8 +178,7 @@ int16AddScalarBlockLoop:
int16AddScalarTailLoop:
CMPQ BX, $0x00000010
JL int16AddScalarDone
VMOVDQU (CX), Y1
VPADDW Y0, Y1, Y1
VPADDW (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -265,18 +251,12 @@ TEXT ·int32AddScalarAvx2Asm(SB), NOSPLIT, $0-56
int32AddScalarBlockLoop:
CMPQ BX, $0x00000030
JL int32AddScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPADDD Y0, Y1, Y1
VPADDD Y0, Y2, Y2
VPADDD Y0, Y3, Y3
VPADDD Y0, Y4, Y4
VPADDD Y0, Y5, Y5
VPADDD Y0, Y6, Y6
VPADDD (CX), Y0, Y1
VPADDD 32(CX), Y0, Y2
VPADDD 64(CX), Y0, Y3
VPADDD 96(CX), Y0, Y4
VPADDD 128(CX), Y0, Y5
VPADDD 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -291,8 +271,7 @@ int32AddScalarBlockLoop:
int32AddScalarTailLoop:
CMPQ BX, $0x00000008
JL int32AddScalarDone
VMOVDQU (CX), Y1
VPADDD Y0, Y1, Y1
VPADDD (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -365,18 +344,12 @@ TEXT ·int64AddScalarAvx2Asm(SB), NOSPLIT, $0-56
int64AddScalarBlockLoop:
CMPQ BX, $0x00000018
JL int64AddScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPADDQ Y0, Y1, Y1
VPADDQ Y0, Y2, Y2
VPADDQ Y0, Y3, Y3
VPADDQ Y0, Y4, Y4
VPADDQ Y0, Y5, Y5
VPADDQ Y0, Y6, Y6
VPADDQ (CX), Y0, Y1
VPADDQ 32(CX), Y0, Y2
VPADDQ 64(CX), Y0, Y3
VPADDQ 96(CX), Y0, Y4
VPADDQ 128(CX), Y0, Y5
VPADDQ 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -391,8 +364,7 @@ int64AddScalarBlockLoop:
int64AddScalarTailLoop:
CMPQ BX, $0x00000004
JL int64AddScalarDone
VMOVDQU (CX), Y1
VPADDQ Y0, Y1, Y1
VPADDQ (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -465,18 +437,12 @@ TEXT ·uint8AddScalarAvx2Asm(SB), NOSPLIT, $0-56
uint8AddScalarBlockLoop:
CMPQ BX, $0x000000c0
JL uint8AddScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPADDB Y0, Y1, Y1
VPADDB Y0, Y2, Y2
VPADDB Y0, Y3, Y3
VPADDB Y0, Y4, Y4
VPADDB Y0, Y5, Y5
VPADDB Y0, Y6, Y6
VPADDB (CX), Y0, Y1
VPADDB 32(CX), Y0, Y2
VPADDB 64(CX), Y0, Y3
VPADDB 96(CX), Y0, Y4
VPADDB 128(CX), Y0, Y5
VPADDB 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -491,8 +457,7 @@ uint8AddScalarBlockLoop:
uint8AddScalarTailLoop:
CMPQ BX, $0x00000020
JL uint8AddScalarDone
VMOVDQU (CX), Y1
VPADDB Y0, Y1, Y1
VPADDB (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -565,18 +530,12 @@ TEXT ·uint16AddScalarAvx2Asm(SB), NOSPLIT, $0-56
uint16AddScalarBlockLoop:
CMPQ BX, $0x00000060
JL uint16AddScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPADDW Y0, Y1, Y1
VPADDW Y0, Y2, Y2
VPADDW Y0, Y3, Y3
VPADDW Y0, Y4, Y4
VPADDW Y0, Y5, Y5
VPADDW Y0, Y6, Y6
VPADDW (CX), Y0, Y1
VPADDW 32(CX), Y0, Y2
VPADDW 64(CX), Y0, Y3
VPADDW 96(CX), Y0, Y4
VPADDW 128(CX), Y0, Y5
VPADDW 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -591,8 +550,7 @@ uint16AddScalarBlockLoop:
uint16AddScalarTailLoop:
CMPQ BX, $0x00000010
JL uint16AddScalarDone
VMOVDQU (CX), Y1
VPADDW Y0, Y1, Y1
VPADDW (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -665,18 +623,12 @@ TEXT ·uint32AddScalarAvx2Asm(SB), NOSPLIT, $0-56
uint32AddScalarBlockLoop:
CMPQ BX, $0x00000030
JL uint32AddScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPADDD Y0, Y1, Y1
VPADDD Y0, Y2, Y2
VPADDD Y0, Y3, Y3
VPADDD Y0, Y4, Y4
VPADDD Y0, Y5, Y5
VPADDD Y0, Y6, Y6
VPADDD (CX), Y0, Y1
VPADDD 32(CX), Y0, Y2
VPADDD 64(CX), Y0, Y3
VPADDD 96(CX), Y0, Y4
VPADDD 128(CX), Y0, Y5
VPADDD 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -691,8 +643,7 @@ uint32AddScalarBlockLoop:
uint32AddScalarTailLoop:
CMPQ BX, $0x00000008
JL uint32AddScalarDone
VMOVDQU (CX), Y1
VPADDD Y0, Y1, Y1
VPADDD (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -765,18 +716,12 @@ TEXT ·uint64AddScalarAvx2Asm(SB), NOSPLIT, $0-56
uint64AddScalarBlockLoop:
CMPQ BX, $0x00000018
JL uint64AddScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPADDQ Y0, Y1, Y1
VPADDQ Y0, Y2, Y2
VPADDQ Y0, Y3, Y3
VPADDQ Y0, Y4, Y4
VPADDQ Y0, Y5, Y5
VPADDQ Y0, Y6, Y6
VPADDQ (CX), Y0, Y1
VPADDQ 32(CX), Y0, Y2
VPADDQ 64(CX), Y0, Y3
VPADDQ 96(CX), Y0, Y4
VPADDQ 128(CX), Y0, Y5
VPADDQ 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -791,8 +736,7 @@ uint64AddScalarBlockLoop:
uint64AddScalarTailLoop:
CMPQ BX, $0x00000004
JL uint64AddScalarDone
VMOVDQU (CX), Y1
VPADDQ Y0, Y1, Y1
VPADDQ (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -864,18 +808,12 @@ TEXT ·float32AddScalarAvx2Asm(SB), NOSPLIT, $0-56
float32AddScalarBlockLoop:
CMPQ DX, $0x00000030
JL float32AddScalarTailLoop
VMOVUPS (AX), Y1
VMOVUPS 32(AX), Y2
VMOVUPS 64(AX), Y3
VMOVUPS 96(AX), Y4
VMOVUPS 128(AX), Y5
VMOVUPS 160(AX), Y6
VADDPS Y0, Y1, Y1
VADDPS Y0, Y2, Y2
VADDPS Y0, Y3, Y3
VADDPS Y0, Y4, Y4
VADDPS Y0, Y5, Y5
VADDPS Y0, Y6, Y6
VADDPS (AX), Y0, Y1
VADDPS 32(AX), Y0, Y2
VADDPS 64(AX), Y0, Y3
VADDPS 96(AX), Y0, Y4
VADDPS 128(AX), Y0, Y5
VADDPS 160(AX), Y0, Y6
VMOVUPS Y1, (CX)
VMOVUPS Y2, 32(CX)
VMOVUPS Y3, 64(CX)
@@ -890,8 +828,7 @@ float32AddScalarBlockLoop:
float32AddScalarTailLoop:
CMPQ DX, $0x00000008
JL float32AddScalarDone
VMOVUPS (AX), Y1
VADDPS Y0, Y1, Y1
VADDPS (AX), Y0, Y1
VMOVUPS Y1, (CX)
ADDQ $0x00000020, AX
ADDQ $0x00000020, CX
@@ -963,18 +900,12 @@ TEXT ·float64AddScalarAvx2Asm(SB), NOSPLIT, $0-56
float64AddScalarBlockLoop:
CMPQ DX, $0x00000018
JL float64AddScalarTailLoop
VMOVUPD (AX), Y1
VMOVUPD 32(AX), Y2
VMOVUPD 64(AX), Y3
VMOVUPD 96(AX), Y4
VMOVUPD 128(AX), Y5
VMOVUPD 160(AX), Y6
VADDPD Y0, Y1, Y1
VADDPD Y0, Y2, Y2
VADDPD Y0, Y3, Y3
VADDPD Y0, Y4, Y4
VADDPD Y0, Y5, Y5
VADDPD Y0, Y6, Y6
VADDPD (AX), Y0, Y1
VADDPD 32(AX), Y0, Y2
VADDPD 64(AX), Y0, Y3
VADDPD 96(AX), Y0, Y4
VADDPD 128(AX), Y0, Y5
VADDPD 160(AX), Y0, Y6
VMOVUPD Y1, (CX)
VMOVUPD Y2, 32(CX)
VMOVUPD Y3, 64(CX)
@@ -989,8 +920,7 @@ float64AddScalarBlockLoop:
float64AddScalarTailLoop:
CMPQ DX, $0x00000004
JL float64AddScalarDone
VMOVUPD (AX), Y1
VADDPD Y0, Y1, Y1
VADDPD (AX), Y0, Y1
VMOVUPD Y1, (CX)
ADDQ $0x00000020, AX
ADDQ $0x00000020, CX
@@ -83,30 +83,18 @@ TEXT ·int8AddScalarAvx512Asm(SB), NOSPLIT, $0-56
int8AddScalarBlockLoop:
CMPQ BX, $0x00000300
JL int8AddScalarTailLoop
VMOVDQU32 (CX), Z1
VMOVDQU32 64(CX), Z2
VMOVDQU32 128(CX), Z3
VMOVDQU32 192(CX), Z4
VMOVDQU32 256(CX), Z5
VMOVDQU32 320(CX), Z6
VMOVDQU32 384(CX), Z7
VMOVDQU32 448(CX), Z8
VMOVDQU32 512(CX), Z9
VMOVDQU32 576(CX), Z10
VMOVDQU32 640(CX), Z11
VMOVDQU32 704(CX), Z12
VPADDB Z0, Z1, Z1
VPADDB Z0, Z2, Z2
VPADDB Z0, Z3, Z3
VPADDB Z0, Z4, Z4
VPADDB Z0, Z5, Z5
VPADDB Z0, Z6, Z6
VPADDB Z0, Z7, Z7
VPADDB Z0, Z8, Z8
VPADDB Z0, Z9, Z9
VPADDB Z0, Z10, Z10
VPADDB Z0, Z11, Z11
VPADDB Z0, Z12, Z12
VPADDB (CX), Z0, Z1
VPADDB 64(CX), Z0, Z2
VPADDB 128(CX), Z0, Z3
VPADDB 192(CX), Z0, Z4
VPADDB 256(CX), Z0, Z5
VPADDB 320(CX), Z0, Z6
VPADDB 384(CX), Z0, Z7
VPADDB 448(CX), Z0, Z8
VPADDB 512(CX), Z0, Z9
VPADDB 576(CX), Z0, Z10
VPADDB 640(CX), Z0, Z11
VPADDB 704(CX), Z0, Z12
VMOVDQU32 Z1, (DX)
VMOVDQU32 Z2, 64(DX)
VMOVDQU32 Z3, 128(DX)
@@ -127,8 +115,7 @@ int8AddScalarBlockLoop:
int8AddScalarTailLoop:
CMPQ BX, $0x00000040
JL int8AddScalarDone
VMOVDQU32 (CX), Z1
VPADDB Z0, Z1, Z1
VPADDB (CX), Z0, Z1
VMOVDQU32 Z1, (DX)
ADDQ $0x00000040, CX
ADDQ $0x00000040, DX
@@ -219,30 +206,18 @@ TEXT ·int16AddScalarAvx512Asm(SB), NOSPLIT, $0-56
int16AddScalarBlockLoop:
CMPQ BX, $0x00000180
JL int16AddScalarTailLoop
VMOVDQU32 (CX), Z1
VMOVDQU32 64(CX), Z2
VMOVDQU32 128(CX), Z3
VMOVDQU32 192(CX), Z4
VMOVDQU32 256(CX), Z5
VMOVDQU32 320(CX), Z6
VMOVDQU32 384(CX), Z7
VMOVDQU32 448(CX), Z8
VMOVDQU32 512(CX), Z9
VMOVDQU32 576(CX), Z10
VMOVDQU32 640(CX), Z11
VMOVDQU32 704(CX), Z12
VPADDW Z0, Z1, Z1
VPADDW Z0, Z2, Z2
VPADDW Z0, Z3, Z3
VPADDW Z0, Z4, Z4
VPADDW Z0, Z5, Z5
VPADDW Z0, Z6, Z6
VPADDW Z0, Z7, Z7
VPADDW Z0, Z8, Z8
VPADDW Z0, Z9, Z9
VPADDW Z0, Z10, Z10
VPADDW Z0, Z11, Z11
VPADDW Z0, Z12, Z12
VPADDW (CX), Z0, Z1
VPADDW 64(CX), Z0, Z2
VPADDW 128(CX), Z0, Z3
VPADDW 192(CX), Z0, Z4
VPADDW 256(CX), Z0, Z5
VPADDW 320(CX), Z0, Z6
VPADDW 384(CX), Z0, Z7
VPADDW 448(CX), Z0, Z8
VPADDW 512(CX), Z0, Z9
VPADDW 576(CX), Z0, Z10
VPADDW 640(CX), Z0, Z11
VPADDW 704(CX), Z0, Z12
VMOVDQU32 Z1, (DX)
VMOVDQU32 Z2, 64(DX)
VMOVDQU32 Z3, 128(DX)
@@ -263,8 +238,7 @@ int16AddScalarBlockLoop:
int16AddScalarTailLoop:
CMPQ BX, $0x00000020
JL int16AddScalarDone
VMOVDQU32 (CX), Z1
VPADDW Z0, Z1, Z1
VPADDW (CX), Z0, Z1
VMOVDQU32 Z1, (DX)
ADDQ $0x00000040, CX
ADDQ $0x00000040, DX
@@ -355,30 +329,18 @@ TEXT ·int32AddScalarAvx512Asm(SB), NOSPLIT, $0-56
int32AddScalarBlockLoop:
CMPQ BX, $0x000000c0
JL int32AddScalarTailLoop
VMOVDQU32 (CX), Z1
VMOVDQU32 64(CX), Z2
VMOVDQU32 128(CX), Z3
VMOVDQU32 192(CX), Z4
VMOVDQU32 256(CX), Z5
VMOVDQU32 320(CX), Z6
VMOVDQU32 384(CX), Z7
VMOVDQU32 448(CX), Z8
VMOVDQU32 512(CX), Z9
VMOVDQU32 576(CX), Z10
VMOVDQU32 640(CX), Z11
VMOVDQU32 704(CX), Z12
VPADDD Z0, Z1, Z1
VPADDD Z0, Z2, Z2
VPADDD Z0, Z3, Z3
VPADDD Z0, Z4, Z4
VPADDD Z0, Z5, Z5
VPADDD Z0, Z6, Z6
VPADDD Z0, Z7, Z7
VPADDD Z0, Z8, Z8
VPADDD Z0, Z9, Z9
VPADDD Z0, Z10, Z10
VPADDD Z0, Z11, Z11
VPADDD Z0, Z12, Z12
VPADDD (CX), Z0, Z1
VPADDD 64(CX), Z0, Z2
VPADDD 128(CX), Z0, Z3
VPADDD 192(CX), Z0, Z4
VPADDD 256(CX), Z0, Z5
VPADDD 320(CX), Z0, Z6
VPADDD 384(CX), Z0, Z7
VPADDD 448(CX), Z0, Z8
VPADDD 512(CX), Z0, Z9
VPADDD 576(CX), Z0, Z10
VPADDD 640(CX), Z0, Z11
VPADDD 704(CX), Z0, Z12
VMOVDQU32 Z1, (DX)
VMOVDQU32 Z2, 64(DX)
VMOVDQU32 Z3, 128(DX)
@@ -399,8 +361,7 @@ int32AddScalarBlockLoop:
int32AddScalarTailLoop:
CMPQ BX, $0x00000010
JL int32AddScalarDone
VMOVDQU32 (CX), Z1
VPADDD Z0, Z1, Z1
VPADDD (CX), Z0, Z1
VMOVDQU32 Z1, (DX)
ADDQ $0x00000040, CX
ADDQ $0x00000040, DX
@@ -491,30 +452,18 @@ TEXT ·int64AddScalarAvx512Asm(SB), NOSPLIT, $0-56
int64AddScalarBlockLoop:
CMPQ BX, $0x00000060
JL int64AddScalarTailLoop
VMOVDQU32 (CX), Z1
VMOVDQU32 64(CX), Z2
VMOVDQU32 128(CX), Z3
VMOVDQU32 192(CX), Z4
VMOVDQU32 256(CX), Z5
VMOVDQU32 320(CX), Z6
VMOVDQU32 384(CX), Z7
VMOVDQU32 448(CX), Z8
VMOVDQU32 512(CX), Z9
VMOVDQU32 576(CX), Z10
VMOVDQU32 640(CX), Z11
VMOVDQU32 704(CX), Z12
VPADDQ Z0, Z1, Z1
VPADDQ Z0, Z2, Z2
VPADDQ Z0, Z3, Z3
VPADDQ Z0, Z4, Z4
VPADDQ Z0, Z5, Z5
VPADDQ Z0, Z6, Z6
VPADDQ Z0, Z7, Z7
VPADDQ Z0, Z8, Z8
VPADDQ Z0, Z9, Z9
VPADDQ Z0, Z10, Z10
VPADDQ Z0, Z11, Z11
VPADDQ Z0, Z12, Z12
VPADDQ (CX), Z0, Z1
VPADDQ 64(CX), Z0, Z2
VPADDQ 128(CX), Z0, Z3
VPADDQ 192(CX), Z0, Z4
VPADDQ 256(CX), Z0, Z5
VPADDQ 320(CX), Z0, Z6
VPADDQ 384(CX), Z0, Z7
VPADDQ 448(CX), Z0, Z8
VPADDQ 512(CX), Z0, Z9
VPADDQ 576(CX), Z0, Z10
VPADDQ 640(CX), Z0, Z11
VPADDQ 704(CX), Z0, Z12
VMOVDQU32 Z1, (DX)
VMOVDQU32 Z2, 64(DX)
VMOVDQU32 Z3, 128(DX)
@@ -535,8 +484,7 @@ int64AddScalarBlockLoop:
int64AddScalarTailLoop:
CMPQ BX, $0x00000008
JL int64AddScalarDone
VMOVDQU32 (CX), Z1
VPADDQ Z0, Z1, Z1
VPADDQ (CX), Z0, Z1
VMOVDQU32 Z1, (DX)
ADDQ $0x00000040, CX
ADDQ $0x00000040, DX
@@ -627,30 +575,18 @@ TEXT ·uint8AddScalarAvx512Asm(SB), NOSPLIT, $0-56
uint8AddScalarBlockLoop:
CMPQ BX, $0x00000300
JL uint8AddScalarTailLoop
VMOVDQU32 (CX), Z1
VMOVDQU32 64(CX), Z2
VMOVDQU32 128(CX), Z3
VMOVDQU32 192(CX), Z4
VMOVDQU32 256(CX), Z5
VMOVDQU32 320(CX), Z6
VMOVDQU32 384(CX), Z7
VMOVDQU32 448(CX), Z8
VMOVDQU32 512(CX), Z9
VMOVDQU32 576(CX), Z10
VMOVDQU32 640(CX), Z11
VMOVDQU32 704(CX), Z12
VPADDB Z0, Z1, Z1
VPADDB Z0, Z2, Z2
VPADDB Z0, Z3, Z3
VPADDB Z0, Z4, Z4
VPADDB Z0, Z5, Z5
VPADDB Z0, Z6, Z6
VPADDB Z0, Z7, Z7
VPADDB Z0, Z8, Z8
VPADDB Z0, Z9, Z9
VPADDB Z0, Z10, Z10
VPADDB Z0, Z11, Z11
VPADDB Z0, Z12, Z12
VPADDB (CX), Z0, Z1
VPADDB 64(CX), Z0, Z2
VPADDB 128(CX), Z0, Z3
VPADDB 192(CX), Z0, Z4
VPADDB 256(CX), Z0, Z5
VPADDB 320(CX), Z0, Z6
VPADDB 384(CX), Z0, Z7
VPADDB 448(CX), Z0, Z8
VPADDB 512(CX), Z0, Z9
VPADDB 576(CX), Z0, Z10
VPADDB 640(CX), Z0, Z11
VPADDB 704(CX), Z0, Z12
VMOVDQU32 Z1, (DX)
VMOVDQU32 Z2, 64(DX)
VMOVDQU32 Z3, 128(DX)
@@ -671,8 +607,7 @@ uint8AddScalarBlockLoop:
uint8AddScalarTailLoop:
CMPQ BX, $0x00000040
JL uint8AddScalarDone
VMOVDQU32 (CX), Z1
VPADDB Z0, Z1, Z1
VPADDB (CX), Z0, Z1
VMOVDQU32 Z1, (DX)
ADDQ $0x00000040, CX
ADDQ $0x00000040, DX
@@ -763,30 +698,18 @@ TEXT ·uint16AddScalarAvx512Asm(SB), NOSPLIT, $0-56
uint16AddScalarBlockLoop:
CMPQ BX, $0x00000180
JL uint16AddScalarTailLoop
VMOVDQU32 (CX), Z1
VMOVDQU32 64(CX), Z2
VMOVDQU32 128(CX), Z3
VMOVDQU32 192(CX), Z4
VMOVDQU32 256(CX), Z5
VMOVDQU32 320(CX), Z6
VMOVDQU32 384(CX), Z7
VMOVDQU32 448(CX), Z8
VMOVDQU32 512(CX), Z9
VMOVDQU32 576(CX), Z10
VMOVDQU32 640(CX), Z11
VMOVDQU32 704(CX), Z12
VPADDW Z0, Z1, Z1
VPADDW Z0, Z2, Z2
VPADDW Z0, Z3, Z3
VPADDW Z0, Z4, Z4
VPADDW Z0, Z5, Z5
VPADDW Z0, Z6, Z6
VPADDW Z0, Z7, Z7
VPADDW Z0, Z8, Z8
VPADDW Z0, Z9, Z9
VPADDW Z0, Z10, Z10
VPADDW Z0, Z11, Z11
VPADDW Z0, Z12, Z12
VPADDW (CX), Z0, Z1
VPADDW 64(CX), Z0, Z2
VPADDW 128(CX), Z0, Z3
VPADDW 192(CX), Z0, Z4
VPADDW 256(CX), Z0, Z5
VPADDW 320(CX), Z0, Z6
VPADDW 384(CX), Z0, Z7
VPADDW 448(CX), Z0, Z8
VPADDW 512(CX), Z0, Z9
VPADDW 576(CX), Z0, Z10
VPADDW 640(CX), Z0, Z11
VPADDW 704(CX), Z0, Z12
VMOVDQU32 Z1, (DX)
VMOVDQU32 Z2, 64(DX)
VMOVDQU32 Z3, 128(DX)
@@ -807,8 +730,7 @@ uint16AddScalarBlockLoop:
uint16AddScalarTailLoop:
CMPQ BX, $0x00000020
JL uint16AddScalarDone
VMOVDQU32 (CX), Z1
VPADDW Z0, Z1, Z1
VPADDW (CX), Z0, Z1
VMOVDQU32 Z1, (DX)
ADDQ $0x00000040, CX
ADDQ $0x00000040, DX
@@ -899,30 +821,18 @@ TEXT ·uint32AddScalarAvx512Asm(SB), NOSPLIT, $0-56
uint32AddScalarBlockLoop:
CMPQ BX, $0x000000c0
JL uint32AddScalarTailLoop
VMOVDQU32 (CX), Z1
VMOVDQU32 64(CX), Z2
VMOVDQU32 128(CX), Z3
VMOVDQU32 192(CX), Z4
VMOVDQU32 256(CX), Z5
VMOVDQU32 320(CX), Z6
VMOVDQU32 384(CX), Z7
VMOVDQU32 448(CX), Z8
VMOVDQU32 512(CX), Z9
VMOVDQU32 576(CX), Z10
VMOVDQU32 640(CX), Z11
VMOVDQU32 704(CX), Z12
VPADDD Z0, Z1, Z1
VPADDD Z0, Z2, Z2
VPADDD Z0, Z3, Z3
VPADDD Z0, Z4, Z4
VPADDD Z0, Z5, Z5
VPADDD Z0, Z6, Z6
VPADDD Z0, Z7, Z7
VPADDD Z0, Z8, Z8
VPADDD Z0, Z9, Z9
VPADDD Z0, Z10, Z10
VPADDD Z0, Z11, Z11
VPADDD Z0, Z12, Z12
VPADDD (CX), Z0, Z1
VPADDD 64(CX), Z0, Z2
VPADDD 128(CX), Z0, Z3
VPADDD 192(CX), Z0, Z4
VPADDD 256(CX), Z0, Z5
VPADDD 320(CX), Z0, Z6
VPADDD 384(CX), Z0, Z7
VPADDD 448(CX), Z0, Z8
VPADDD 512(CX), Z0, Z9
VPADDD 576(CX), Z0, Z10
VPADDD 640(CX), Z0, Z11
VPADDD 704(CX), Z0, Z12
VMOVDQU32 Z1, (DX)
VMOVDQU32 Z2, 64(DX)
VMOVDQU32 Z3, 128(DX)
@@ -943,8 +853,7 @@ uint32AddScalarBlockLoop:
uint32AddScalarTailLoop:
CMPQ BX, $0x00000010
JL uint32AddScalarDone
VMOVDQU32 (CX), Z1
VPADDD Z0, Z1, Z1
VPADDD (CX), Z0, Z1
VMOVDQU32 Z1, (DX)
ADDQ $0x00000040, CX
ADDQ $0x00000040, DX
@@ -1035,30 +944,18 @@ TEXT ·uint64AddScalarAvx512Asm(SB), NOSPLIT, $0-56
uint64AddScalarBlockLoop:
CMPQ BX, $0x00000060
JL uint64AddScalarTailLoop
VMOVDQU32 (CX), Z1
VMOVDQU32 64(CX), Z2
VMOVDQU32 128(CX), Z3
VMOVDQU32 192(CX), Z4
VMOVDQU32 256(CX), Z5
VMOVDQU32 320(CX), Z6
VMOVDQU32 384(CX), Z7
VMOVDQU32 448(CX), Z8
VMOVDQU32 512(CX), Z9
VMOVDQU32 576(CX), Z10
VMOVDQU32 640(CX), Z11
VMOVDQU32 704(CX), Z12
VPADDQ Z0, Z1, Z1
VPADDQ Z0, Z2, Z2
VPADDQ Z0, Z3, Z3
VPADDQ Z0, Z4, Z4
VPADDQ Z0, Z5, Z5
VPADDQ Z0, Z6, Z6
VPADDQ Z0, Z7, Z7
VPADDQ Z0, Z8, Z8
VPADDQ Z0, Z9, Z9
VPADDQ Z0, Z10, Z10
VPADDQ Z0, Z11, Z11
VPADDQ Z0, Z12, Z12
VPADDQ (CX), Z0, Z1
VPADDQ 64(CX), Z0, Z2
VPADDQ 128(CX), Z0, Z3
VPADDQ 192(CX), Z0, Z4
VPADDQ 256(CX), Z0, Z5
VPADDQ 320(CX), Z0, Z6
VPADDQ 384(CX), Z0, Z7
VPADDQ 448(CX), Z0, Z8
VPADDQ 512(CX), Z0, Z9
VPADDQ 576(CX), Z0, Z10
VPADDQ 640(CX), Z0, Z11
VPADDQ 704(CX), Z0, Z12
VMOVDQU32 Z1, (DX)
VMOVDQU32 Z2, 64(DX)
VMOVDQU32 Z3, 128(DX)
@@ -1079,8 +976,7 @@ uint64AddScalarBlockLoop:
uint64AddScalarTailLoop:
CMPQ BX, $0x00000008
JL uint64AddScalarDone
VMOVDQU32 (CX), Z1
VPADDQ Z0, Z1, Z1
VPADDQ (CX), Z0, Z1
VMOVDQU32 Z1, (DX)
ADDQ $0x00000040, CX
ADDQ $0x00000040, DX
@@ -1170,30 +1066,18 @@ TEXT ·float32AddScalarAvx512Asm(SB), NOSPLIT, $0-56
float32AddScalarBlockLoop:
CMPQ DX, $0x000000c0
JL float32AddScalarTailLoop
VMOVUPS (AX), Z1
VMOVUPS 64(AX), Z2
VMOVUPS 128(AX), Z3
VMOVUPS 192(AX), Z4
VMOVUPS 256(AX), Z5
VMOVUPS 320(AX), Z6
VMOVUPS 384(AX), Z7
VMOVUPS 448(AX), Z8
VMOVUPS 512(AX), Z9
VMOVUPS 576(AX), Z10
VMOVUPS 640(AX), Z11
VMOVUPS 704(AX), Z12
VADDPS Z0, Z1, Z1
VADDPS Z0, Z2, Z2
VADDPS Z0, Z3, Z3
VADDPS Z0, Z4, Z4
VADDPS Z0, Z5, Z5
VADDPS Z0, Z6, Z6
VADDPS Z0, Z7, Z7
VADDPS Z0, Z8, Z8
VADDPS Z0, Z9, Z9
VADDPS Z0, Z10, Z10
VADDPS Z0, Z11, Z11
VADDPS Z0, Z12, Z12
VADDPS (AX), Z0, Z1
VADDPS 64(AX), Z0, Z2
VADDPS 128(AX), Z0, Z3
VADDPS 192(AX), Z0, Z4
VADDPS 256(AX), Z0, Z5
VADDPS 320(AX), Z0, Z6
VADDPS 384(AX), Z0, Z7
VADDPS 448(AX), Z0, Z8
VADDPS 512(AX), Z0, Z9
VADDPS 576(AX), Z0, Z10
VADDPS 640(AX), Z0, Z11
VADDPS 704(AX), Z0, Z12
VMOVUPS Z1, (CX)
VMOVUPS Z2, 64(CX)
VMOVUPS Z3, 128(CX)
@@ -1214,8 +1098,7 @@ float32AddScalarBlockLoop:
float32AddScalarTailLoop:
CMPQ DX, $0x00000010
JL float32AddScalarDone
VMOVUPS (AX), Z1
VADDPS Z0, Z1, Z1
VADDPS (AX), Z0, Z1
VMOVUPS Z1, (CX)
ADDQ $0x00000040, AX
ADDQ $0x00000040, CX
@@ -1305,30 +1188,18 @@ TEXT ·float64AddScalarAvx512Asm(SB), NOSPLIT, $0-56
float64AddScalarBlockLoop:
CMPQ DX, $0x00000060
JL float64AddScalarTailLoop
VMOVUPD (AX), Z1
VMOVUPD 64(AX), Z2
VMOVUPD 128(AX), Z3
VMOVUPD 192(AX), Z4
VMOVUPD 256(AX), Z5
VMOVUPD 320(AX), Z6
VMOVUPD 384(AX), Z7
VMOVUPD 448(AX), Z8
VMOVUPD 512(AX), Z9
VMOVUPD 576(AX), Z10
VMOVUPD 640(AX), Z11
VMOVUPD 704(AX), Z12
VADDPD Z0, Z1, Z1
VADDPD Z0, Z2, Z2
VADDPD Z0, Z3, Z3
VADDPD Z0, Z4, Z4
VADDPD Z0, Z5, Z5
VADDPD Z0, Z6, Z6
VADDPD Z0, Z7, Z7
VADDPD Z0, Z8, Z8
VADDPD Z0, Z9, Z9
VADDPD Z0, Z10, Z10
VADDPD Z0, Z11, Z11
VADDPD Z0, Z12, Z12
VADDPD (AX), Z0, Z1
VADDPD 64(AX), Z0, Z2
VADDPD 128(AX), Z0, Z3
VADDPD 192(AX), Z0, Z4
VADDPD 256(AX), Z0, Z5
VADDPD 320(AX), Z0, Z6
VADDPD 384(AX), Z0, Z7
VADDPD 448(AX), Z0, Z8
VADDPD 512(AX), Z0, Z9
VADDPD 576(AX), Z0, Z10
VADDPD 640(AX), Z0, Z11
VADDPD 704(AX), Z0, Z12
VMOVUPD Z1, (CX)
VMOVUPD Z2, 64(CX)
VMOVUPD Z3, 128(CX)
@@ -1349,8 +1220,7 @@ float64AddScalarBlockLoop:
float64AddScalarTailLoop:
CMPQ DX, $0x00000008
JL float64AddScalarDone
VMOVUPD (AX), Z1
VADDPD Z0, Z1, Z1
VADDPD (AX), Z0, Z1
VMOVUPD Z1, (CX)
ADDQ $0x00000040, AX
ADDQ $0x00000040, CX
@@ -65,18 +65,12 @@ TEXT ·int8SubScalarAvx2Asm(SB), NOSPLIT, $0-56
int8SubScalarBlockLoop:
CMPQ BX, $0x000000c0
JL int8SubScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBB Y0, Y1, Y1
VPSUBB Y0, Y2, Y2
VPSUBB Y0, Y3, Y3
VPSUBB Y0, Y4, Y4
VPSUBB Y0, Y5, Y5
VPSUBB Y0, Y6, Y6
VPSUBB (CX), Y0, Y1
VPSUBB 32(CX), Y0, Y2
VPSUBB 64(CX), Y0, Y3
VPSUBB 96(CX), Y0, Y4
VPSUBB 128(CX), Y0, Y5
VPSUBB 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -91,8 +85,7 @@ int8SubScalarBlockLoop:
int8SubScalarTailLoop:
CMPQ BX, $0x00000020
JL int8SubScalarDone
VMOVDQU (CX), Y1
VPSUBB Y0, Y1, Y1
VPSUBB (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -115,12 +108,18 @@ TEXT ·int8SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
int8SubByScalarBlockLoop:
CMPQ BX, $0x000000c0
JL int8SubByScalarTailLoop
VPSUBB (CX), Y0, Y1
VPSUBB 32(CX), Y0, Y2
VPSUBB 64(CX), Y0, Y3
VPSUBB 96(CX), Y0, Y4
VPSUBB 128(CX), Y0, Y5
VPSUBB 160(CX), Y0, Y6
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBB Y0, Y1, Y1
VPSUBB Y0, Y2, Y2
VPSUBB Y0, Y3, Y3
VPSUBB Y0, Y4, Y4
VPSUBB Y0, Y5, Y5
VPSUBB Y0, Y6, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -135,7 +134,8 @@ int8SubByScalarBlockLoop:
int8SubByScalarTailLoop:
CMPQ BX, $0x00000020
JL int8SubByScalarDone
VPSUBB (CX), Y0, Y1
VMOVDQU (CX), Y1
VPSUBB Y0, Y1, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -208,18 +208,12 @@ TEXT ·int16SubScalarAvx2Asm(SB), NOSPLIT, $0-56
int16SubScalarBlockLoop:
CMPQ BX, $0x00000060
JL int16SubScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBW Y0, Y1, Y1
VPSUBW Y0, Y2, Y2
VPSUBW Y0, Y3, Y3
VPSUBW Y0, Y4, Y4
VPSUBW Y0, Y5, Y5
VPSUBW Y0, Y6, Y6
VPSUBW (CX), Y0, Y1
VPSUBW 32(CX), Y0, Y2
VPSUBW 64(CX), Y0, Y3
VPSUBW 96(CX), Y0, Y4
VPSUBW 128(CX), Y0, Y5
VPSUBW 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -234,8 +228,7 @@ int16SubScalarBlockLoop:
int16SubScalarTailLoop:
CMPQ BX, $0x00000010
JL int16SubScalarDone
VMOVDQU (CX), Y1
VPSUBW Y0, Y1, Y1
VPSUBW (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -258,12 +251,18 @@ TEXT ·int16SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
int16SubByScalarBlockLoop:
CMPQ BX, $0x00000060
JL int16SubByScalarTailLoop
VPSUBW (CX), Y0, Y1
VPSUBW 32(CX), Y0, Y2
VPSUBW 64(CX), Y0, Y3
VPSUBW 96(CX), Y0, Y4
VPSUBW 128(CX), Y0, Y5
VPSUBW 160(CX), Y0, Y6
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBW Y0, Y1, Y1
VPSUBW Y0, Y2, Y2
VPSUBW Y0, Y3, Y3
VPSUBW Y0, Y4, Y4
VPSUBW Y0, Y5, Y5
VPSUBW Y0, Y6, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -278,7 +277,8 @@ int16SubByScalarBlockLoop:
int16SubByScalarTailLoop:
CMPQ BX, $0x00000010
JL int16SubByScalarDone
VPSUBW (CX), Y0, Y1
VMOVDQU (CX), Y1
VPSUBW Y0, Y1, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -351,18 +351,12 @@ TEXT ·int32SubScalarAvx2Asm(SB), NOSPLIT, $0-56
int32SubScalarBlockLoop:
CMPQ BX, $0x00000030
JL int32SubScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBD Y0, Y1, Y1
VPSUBD Y0, Y2, Y2
VPSUBD Y0, Y3, Y3
VPSUBD Y0, Y4, Y4
VPSUBD Y0, Y5, Y5
VPSUBD Y0, Y6, Y6
VPSUBD (CX), Y0, Y1
VPSUBD 32(CX), Y0, Y2
VPSUBD 64(CX), Y0, Y3
VPSUBD 96(CX), Y0, Y4
VPSUBD 128(CX), Y0, Y5
VPSUBD 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -377,8 +371,7 @@ int32SubScalarBlockLoop:
int32SubScalarTailLoop:
CMPQ BX, $0x00000008
JL int32SubScalarDone
VMOVDQU (CX), Y1
VPSUBD Y0, Y1, Y1
VPSUBD (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -401,12 +394,18 @@ TEXT ·int32SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
int32SubByScalarBlockLoop:
CMPQ BX, $0x00000030
JL int32SubByScalarTailLoop
VPSUBD (CX), Y0, Y1
VPSUBD 32(CX), Y0, Y2
VPSUBD 64(CX), Y0, Y3
VPSUBD 96(CX), Y0, Y4
VPSUBD 128(CX), Y0, Y5
VPSUBD 160(CX), Y0, Y6
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBD Y0, Y1, Y1
VPSUBD Y0, Y2, Y2
VPSUBD Y0, Y3, Y3
VPSUBD Y0, Y4, Y4
VPSUBD Y0, Y5, Y5
VPSUBD Y0, Y6, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -421,7 +420,8 @@ int32SubByScalarBlockLoop:
int32SubByScalarTailLoop:
CMPQ BX, $0x00000008
JL int32SubByScalarDone
VPSUBD (CX), Y0, Y1
VMOVDQU (CX), Y1
VPSUBD Y0, Y1, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -494,18 +494,12 @@ TEXT ·int64SubScalarAvx2Asm(SB), NOSPLIT, $0-56
int64SubScalarBlockLoop:
CMPQ BX, $0x00000018
JL int64SubScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBQ Y0, Y1, Y1
VPSUBQ Y0, Y2, Y2
VPSUBQ Y0, Y3, Y3
VPSUBQ Y0, Y4, Y4
VPSUBQ Y0, Y5, Y5
VPSUBQ Y0, Y6, Y6
VPSUBQ (CX), Y0, Y1
VPSUBQ 32(CX), Y0, Y2
VPSUBQ 64(CX), Y0, Y3
VPSUBQ 96(CX), Y0, Y4
VPSUBQ 128(CX), Y0, Y5
VPSUBQ 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -520,8 +514,7 @@ int64SubScalarBlockLoop:
int64SubScalarTailLoop:
CMPQ BX, $0x00000004
JL int64SubScalarDone
VMOVDQU (CX), Y1
VPSUBQ Y0, Y1, Y1
VPSUBQ (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -544,12 +537,18 @@ TEXT ·int64SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
int64SubByScalarBlockLoop:
CMPQ BX, $0x00000018
JL int64SubByScalarTailLoop
VPSUBQ (CX), Y0, Y1
VPSUBQ 32(CX), Y0, Y2
VPSUBQ 64(CX), Y0, Y3
VPSUBQ 96(CX), Y0, Y4
VPSUBQ 128(CX), Y0, Y5
VPSUBQ 160(CX), Y0, Y6
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBQ Y0, Y1, Y1
VPSUBQ Y0, Y2, Y2
VPSUBQ Y0, Y3, Y3
VPSUBQ Y0, Y4, Y4
VPSUBQ Y0, Y5, Y5
VPSUBQ Y0, Y6, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -564,7 +563,8 @@ int64SubByScalarBlockLoop:
int64SubByScalarTailLoop:
CMPQ BX, $0x00000004
JL int64SubByScalarDone
VPSUBQ (CX), Y0, Y1
VMOVDQU (CX), Y1
VPSUBQ Y0, Y1, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -637,18 +637,12 @@ TEXT ·uint8SubScalarAvx2Asm(SB), NOSPLIT, $0-56
uint8SubScalarBlockLoop:
CMPQ BX, $0x000000c0
JL uint8SubScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBB Y0, Y1, Y1
VPSUBB Y0, Y2, Y2
VPSUBB Y0, Y3, Y3
VPSUBB Y0, Y4, Y4
VPSUBB Y0, Y5, Y5
VPSUBB Y0, Y6, Y6
VPSUBB (CX), Y0, Y1
VPSUBB 32(CX), Y0, Y2
VPSUBB 64(CX), Y0, Y3
VPSUBB 96(CX), Y0, Y4
VPSUBB 128(CX), Y0, Y5
VPSUBB 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -663,8 +657,7 @@ uint8SubScalarBlockLoop:
uint8SubScalarTailLoop:
CMPQ BX, $0x00000020
JL uint8SubScalarDone
VMOVDQU (CX), Y1
VPSUBB Y0, Y1, Y1
VPSUBB (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -687,12 +680,18 @@ TEXT ·uint8SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
uint8SubByScalarBlockLoop:
CMPQ BX, $0x000000c0
JL uint8SubByScalarTailLoop
VPSUBB (CX), Y0, Y1
VPSUBB 32(CX), Y0, Y2
VPSUBB 64(CX), Y0, Y3
VPSUBB 96(CX), Y0, Y4
VPSUBB 128(CX), Y0, Y5
VPSUBB 160(CX), Y0, Y6
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBB Y0, Y1, Y1
VPSUBB Y0, Y2, Y2
VPSUBB Y0, Y3, Y3
VPSUBB Y0, Y4, Y4
VPSUBB Y0, Y5, Y5
VPSUBB Y0, Y6, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -707,7 +706,8 @@ uint8SubByScalarBlockLoop:
uint8SubByScalarTailLoop:
CMPQ BX, $0x00000020
JL uint8SubByScalarDone
VPSUBB (CX), Y0, Y1
VMOVDQU (CX), Y1
VPSUBB Y0, Y1, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -780,18 +780,12 @@ TEXT ·uint16SubScalarAvx2Asm(SB), NOSPLIT, $0-56
uint16SubScalarBlockLoop:
CMPQ BX, $0x00000060
JL uint16SubScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBW Y0, Y1, Y1
VPSUBW Y0, Y2, Y2
VPSUBW Y0, Y3, Y3
VPSUBW Y0, Y4, Y4
VPSUBW Y0, Y5, Y5
VPSUBW Y0, Y6, Y6
VPSUBW (CX), Y0, Y1
VPSUBW 32(CX), Y0, Y2
VPSUBW 64(CX), Y0, Y3
VPSUBW 96(CX), Y0, Y4
VPSUBW 128(CX), Y0, Y5
VPSUBW 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -806,8 +800,7 @@ uint16SubScalarBlockLoop:
uint16SubScalarTailLoop:
CMPQ BX, $0x00000010
JL uint16SubScalarDone
VMOVDQU (CX), Y1
VPSUBW Y0, Y1, Y1
VPSUBW (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -830,12 +823,18 @@ TEXT ·uint16SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
uint16SubByScalarBlockLoop:
CMPQ BX, $0x00000060
JL uint16SubByScalarTailLoop
VPSUBW (CX), Y0, Y1
VPSUBW 32(CX), Y0, Y2
VPSUBW 64(CX), Y0, Y3
VPSUBW 96(CX), Y0, Y4
VPSUBW 128(CX), Y0, Y5
VPSUBW 160(CX), Y0, Y6
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBW Y0, Y1, Y1
VPSUBW Y0, Y2, Y2
VPSUBW Y0, Y3, Y3
VPSUBW Y0, Y4, Y4
VPSUBW Y0, Y5, Y5
VPSUBW Y0, Y6, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -850,7 +849,8 @@ uint16SubByScalarBlockLoop:
uint16SubByScalarTailLoop:
CMPQ BX, $0x00000010
JL uint16SubByScalarDone
VPSUBW (CX), Y0, Y1
VMOVDQU (CX), Y1
VPSUBW Y0, Y1, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -923,18 +923,12 @@ TEXT ·uint32SubScalarAvx2Asm(SB), NOSPLIT, $0-56
uint32SubScalarBlockLoop:
CMPQ BX, $0x00000030
JL uint32SubScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBD Y0, Y1, Y1
VPSUBD Y0, Y2, Y2
VPSUBD Y0, Y3, Y3
VPSUBD Y0, Y4, Y4
VPSUBD Y0, Y5, Y5
VPSUBD Y0, Y6, Y6
VPSUBD (CX), Y0, Y1
VPSUBD 32(CX), Y0, Y2
VPSUBD 64(CX), Y0, Y3
VPSUBD 96(CX), Y0, Y4
VPSUBD 128(CX), Y0, Y5
VPSUBD 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -949,8 +943,7 @@ uint32SubScalarBlockLoop:
uint32SubScalarTailLoop:
CMPQ BX, $0x00000008
JL uint32SubScalarDone
VMOVDQU (CX), Y1
VPSUBD Y0, Y1, Y1
VPSUBD (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -973,12 +966,18 @@ TEXT ·uint32SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
uint32SubByScalarBlockLoop:
CMPQ BX, $0x00000030
JL uint32SubByScalarTailLoop
VPSUBD (CX), Y0, Y1
VPSUBD 32(CX), Y0, Y2
VPSUBD 64(CX), Y0, Y3
VPSUBD 96(CX), Y0, Y4
VPSUBD 128(CX), Y0, Y5
VPSUBD 160(CX), Y0, Y6
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBD Y0, Y1, Y1
VPSUBD Y0, Y2, Y2
VPSUBD Y0, Y3, Y3
VPSUBD Y0, Y4, Y4
VPSUBD Y0, Y5, Y5
VPSUBD Y0, Y6, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -993,7 +992,8 @@ uint32SubByScalarBlockLoop:
uint32SubByScalarTailLoop:
CMPQ BX, $0x00000008
JL uint32SubByScalarDone
VPSUBD (CX), Y0, Y1
VMOVDQU (CX), Y1
VPSUBD Y0, Y1, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -1066,18 +1066,12 @@ TEXT ·uint64SubScalarAvx2Asm(SB), NOSPLIT, $0-56
uint64SubScalarBlockLoop:
CMPQ BX, $0x00000018
JL uint64SubScalarTailLoop
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBQ Y0, Y1, Y1
VPSUBQ Y0, Y2, Y2
VPSUBQ Y0, Y3, Y3
VPSUBQ Y0, Y4, Y4
VPSUBQ Y0, Y5, Y5
VPSUBQ Y0, Y6, Y6
VPSUBQ (CX), Y0, Y1
VPSUBQ 32(CX), Y0, Y2
VPSUBQ 64(CX), Y0, Y3
VPSUBQ 96(CX), Y0, Y4
VPSUBQ 128(CX), Y0, Y5
VPSUBQ 160(CX), Y0, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -1092,8 +1086,7 @@ uint64SubScalarBlockLoop:
uint64SubScalarTailLoop:
CMPQ BX, $0x00000004
JL uint64SubScalarDone
VMOVDQU (CX), Y1
VPSUBQ Y0, Y1, Y1
VPSUBQ (CX), Y0, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -1116,12 +1109,18 @@ TEXT ·uint64SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
uint64SubByScalarBlockLoop:
CMPQ BX, $0x00000018
JL uint64SubByScalarTailLoop
VPSUBQ (CX), Y0, Y1
VPSUBQ 32(CX), Y0, Y2
VPSUBQ 64(CX), Y0, Y3
VPSUBQ 96(CX), Y0, Y4
VPSUBQ 128(CX), Y0, Y5
VPSUBQ 160(CX), Y0, Y6
VMOVDQU (CX), Y1
VMOVDQU 32(CX), Y2
VMOVDQU 64(CX), Y3
VMOVDQU 96(CX), Y4
VMOVDQU 128(CX), Y5
VMOVDQU 160(CX), Y6
VPSUBQ Y0, Y1, Y1
VPSUBQ Y0, Y2, Y2
VPSUBQ Y0, Y3, Y3
VPSUBQ Y0, Y4, Y4
VPSUBQ Y0, Y5, Y5
VPSUBQ Y0, Y6, Y6
VMOVDQU Y1, (DX)
VMOVDQU Y2, 32(DX)
VMOVDQU Y3, 64(DX)
@@ -1136,7 +1135,8 @@ uint64SubByScalarBlockLoop:
uint64SubByScalarTailLoop:
CMPQ BX, $0x00000004
JL uint64SubByScalarDone
VPSUBQ (CX), Y0, Y1
VMOVDQU (CX), Y1
VPSUBQ Y0, Y1, Y1
VMOVDQU Y1, (DX)
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
@@ -1208,18 +1208,12 @@ TEXT ·float32SubScalarAvx2Asm(SB), NOSPLIT, $0-56
float32SubScalarBlockLoop:
CMPQ DX, $0x00000030
JL float32SubScalarTailLoop
VMOVUPS (AX), Y1
VMOVUPS 32(AX), Y2
VMOVUPS 64(AX), Y3
VMOVUPS 96(AX), Y4
VMOVUPS 128(AX), Y5
VMOVUPS 160(AX), Y6
VSUBPS Y0, Y1, Y1
VSUBPS Y0, Y2, Y2
VSUBPS Y0, Y3, Y3
VSUBPS Y0, Y4, Y4
VSUBPS Y0, Y5, Y5
VSUBPS Y0, Y6, Y6
VSUBPS (AX), Y0, Y1
VSUBPS 32(AX), Y0, Y2
VSUBPS 64(AX), Y0, Y3
VSUBPS 96(AX), Y0, Y4
VSUBPS 128(AX), Y0, Y5
VSUBPS 160(AX), Y0, Y6
VMOVUPS Y1, (CX)
VMOVUPS Y2, 32(CX)
VMOVUPS Y3, 64(CX)
@@ -1234,8 +1228,7 @@ float32SubScalarBlockLoop:
float32SubScalarTailLoop:
CMPQ DX, $0x00000008
JL float32SubScalarDone
VMOVUPS (AX), Y1
VSUBPS Y0, Y1, Y1
VSUBPS (AX), Y0, Y1
VMOVUPS Y1, (CX)
ADDQ $0x00000020, AX
ADDQ $0x00000020, CX
@@ -1257,12 +1250,18 @@ TEXT ·float32SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
float32SubByScalarBlockLoop:
CMPQ DX, $0x00000030
JL float32SubByScalarTailLoop
VSUBPS (AX), Y0, Y1
VSUBPS 32(AX), Y0, Y2
VSUBPS 64(AX), Y0, Y3
VSUBPS 96(AX), Y0, Y4
VSUBPS 128(AX), Y0, Y5
VSUBPS 160(AX), Y0, Y6
VMOVUPS (AX), Y1
VMOVUPS 32(AX), Y2
VMOVUPS 64(AX), Y3
VMOVUPS 96(AX), Y4
VMOVUPS 128(AX), Y5
VMOVUPS 160(AX), Y6
VSUBPS Y0, Y1, Y1
VSUBPS Y0, Y2, Y2
VSUBPS Y0, Y3, Y3
VSUBPS Y0, Y4, Y4
VSUBPS Y0, Y5, Y5
VSUBPS Y0, Y6, Y6
VMOVUPS Y1, (CX)
VMOVUPS Y2, 32(CX)
VMOVUPS Y3, 64(CX)
@@ -1277,7 +1276,8 @@ float32SubByScalarBlockLoop:
float32SubByScalarTailLoop:
CMPQ DX, $0x00000008
JL float32SubByScalarDone
VSUBPS (AX), Y0, Y1
VMOVUPS (AX), Y1
VSUBPS Y0, Y1, Y1
VMOVUPS Y1, (CX)
ADDQ $0x00000020, AX
ADDQ $0x00000020, CX
@@ -1349,18 +1349,12 @@ TEXT ·float64SubScalarAvx2Asm(SB), NOSPLIT, $0-56
float64SubScalarBlockLoop:
CMPQ DX, $0x00000018
JL float64SubScalarTailLoop
VMOVUPD (AX), Y1
VMOVUPD 32(AX), Y2
VMOVUPD 64(AX), Y3
VMOVUPD 96(AX), Y4
VMOVUPD 128(AX), Y5
VMOVUPD 160(AX), Y6
VSUBPD Y0, Y1, Y1
VSUBPD Y0, Y2, Y2
VSUBPD Y0, Y3, Y3
VSUBPD Y0, Y4, Y4
VSUBPD Y0, Y5, Y5
VSUBPD Y0, Y6, Y6
VSUBPD (AX), Y0, Y1
VSUBPD 32(AX), Y0, Y2
VSUBPD 64(AX), Y0, Y3
VSUBPD 96(AX), Y0, Y4
VSUBPD 128(AX), Y0, Y5
VSUBPD 160(AX), Y0, Y6
VMOVUPD Y1, (CX)
VMOVUPD Y2, 32(CX)
VMOVUPD Y3, 64(CX)
@@ -1375,8 +1369,7 @@ float64SubScalarBlockLoop:
float64SubScalarTailLoop:
CMPQ DX, $0x00000004
JL float64SubScalarDone
VMOVUPD (AX), Y1
VSUBPD Y0, Y1, Y1
VSUBPD (AX), Y0, Y1
VMOVUPD Y1, (CX)
ADDQ $0x00000020, AX
ADDQ $0x00000020, CX
@@ -1398,12 +1391,18 @@ TEXT ·float64SubByScalarAvx2Asm(SB), NOSPLIT, $0-56
float64SubByScalarBlockLoop:
CMPQ DX, $0x00000018
JL float64SubByScalarTailLoop
VSUBPD (AX), Y0, Y1
VSUBPD 32(AX), Y0, Y2
VSUBPD 64(AX), Y0, Y3
VSUBPD 96(AX), Y0, Y4
VSUBPD 128(AX), Y0, Y5
VSUBPD 160(AX), Y0, Y6
VMOVUPD (AX), Y1
VMOVUPD 32(AX), Y2
VMOVUPD 64(AX), Y3
VMOVUPD 96(AX), Y4
VMOVUPD 128(AX), Y5
VMOVUPD 160(AX), Y6
VSUBPD Y0, Y1, Y1
VSUBPD Y0, Y2, Y2
VSUBPD Y0, Y3, Y3
VSUBPD Y0, Y4, Y4
VSUBPD Y0, Y5, Y5
VSUBPD Y0, Y6, Y6
VMOVUPD Y1, (CX)
VMOVUPD Y2, 32(CX)
VMOVUPD Y3, 64(CX)
@@ -1418,7 +1417,8 @@ float64SubByScalarBlockLoop:
float64SubByScalarTailLoop:
CMPQ DX, $0x00000004
JL float64SubByScalarDone
VSUBPD (AX), Y0, Y1
VMOVUPD (AX), Y1
VSUBPD Y0, Y1, Y1
VMOVUPD Y1, (CX)
ADDQ $0x00000020, AX
ADDQ $0x00000020, CX