diff --git a/pkg/vectorize/sum/avx2.s b/pkg/vectorize/sum/avx2.s new file mode 100644 index 0000000000000000000000000000000000000000..b1e8355ed454b081e9ef07401f2e1cdcace73b9d --- /dev/null +++ b/pkg/vectorize/sum/avx2.s @@ -0,0 +1,273 @@ +// Code generated by command: go run avx2.go -out sum/avx2.s -stubs sum/avx2_stubs.go. DO NOT EDIT. + +#include "textflag.h" + +// func int8SumAvx2Asm(x []int8, r []int8) +// Requires: AVX, AVX2, SSE2 +TEXT ·int8SumAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + +int8SumBlockLoop: + CMPQ DX, $0x000000c0 + JL int8SumTailLoop + VPADDB (AX), Y0, Y0 + VPADDB 32(AX), Y1, Y1 + VPADDB 64(AX), Y2, Y2 + VPADDB 96(AX), Y3, Y3 + VPADDB 128(AX), Y4, Y4 + VPADDB 160(AX), Y5, Y5 + ADDQ $0x000000c0, AX + SUBQ $0x000000c0, DX + JMP int8SumBlockLoop + +int8SumTailLoop: + CMPQ DX, $0x00000004 + JL int8SumDone + VPADDB (AX), Y0, Y0 + ADDQ $0x00000020, AX + SUBQ $0x00000020, DX + JMP int8SumTailLoop + +int8SumDone: + VPADDB Y0, Y1, Y0 + VPADDB Y0, Y2, Y0 + VPADDB Y0, Y3, Y0 + VPADDB Y0, Y4, Y0 + VPADDB Y0, Y5, Y0 + VEXTRACTI128 $0x01, Y0, X1 + PADDB X1, X0 + MOVOU X0, (CX) + RET + +// func int16SumAvx2Asm(x []int16, r []int16) +// Requires: AVX, AVX2, SSE2 +TEXT ·int16SumAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + +int16SumBlockLoop: + CMPQ DX, $0x00000060 + JL int16SumTailLoop + VPADDW (AX), Y0, Y0 + VPADDW 32(AX), Y1, Y1 + VPADDW 64(AX), Y2, Y2 + VPADDW 96(AX), Y3, Y3 + VPADDW 128(AX), Y4, Y4 + VPADDW 160(AX), Y5, Y5 + ADDQ $0x000000c0, AX + SUBQ $0x00000060, DX + JMP int16SumBlockLoop + +int16SumTailLoop: + CMPQ DX, $0x00000004 + JL int16SumDone + VPADDW (AX), Y0, Y0 + ADDQ $0x00000020, AX + SUBQ $0x00000010, DX + 
JMP int16SumTailLoop + +int16SumDone: + VPADDW Y0, Y1, Y0 + VPADDW Y0, Y2, Y0 + VPADDW Y0, Y3, Y0 + VPADDW Y0, Y4, Y0 + VPADDW Y0, Y5, Y0 + VEXTRACTI128 $0x01, Y0, X1 + PADDW X1, X0 + MOVOU X0, (CX) + RET + +// func int32SumAvx2Asm(x []int32, r []int32) +// Requires: AVX, AVX2, SSE2 +TEXT ·int32SumAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + +int32SumBlockLoop: + CMPQ DX, $0x00000030 + JL int32SumTailLoop + VPADDD (AX), Y0, Y0 + VPADDD 32(AX), Y1, Y1 + VPADDD 64(AX), Y2, Y2 + VPADDD 96(AX), Y3, Y3 + VPADDD 128(AX), Y4, Y4 + VPADDD 160(AX), Y5, Y5 + ADDQ $0x000000c0, AX + SUBQ $0x00000030, DX + JMP int32SumBlockLoop + +int32SumTailLoop: + CMPQ DX, $0x00000004 + JL int32SumDone + VPADDD (AX), Y0, Y0 + ADDQ $0x00000020, AX + SUBQ $0x00000008, DX + JMP int32SumTailLoop + +int32SumDone: + VPADDD Y0, Y1, Y0 + VPADDD Y0, Y2, Y0 + VPADDD Y0, Y3, Y0 + VPADDD Y0, Y4, Y0 + VPADDD Y0, Y5, Y0 + VEXTRACTI128 $0x01, Y0, X1 + PADDD X1, X0 + MOVOU X0, (CX) + RET + +// func int64SumAvx2Asm(x []int64, r []int64) +// Requires: AVX, AVX2, SSE2 +TEXT ·int64SumAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + +int64SumBlockLoop: + CMPQ DX, $0x00000018 + JL int64SumTailLoop + VPADDQ (AX), Y0, Y0 + VPADDQ 32(AX), Y1, Y1 + VPADDQ 64(AX), Y2, Y2 + VPADDQ 96(AX), Y3, Y3 + VPADDQ 128(AX), Y4, Y4 + VPADDQ 160(AX), Y5, Y5 + ADDQ $0x000000c0, AX + SUBQ $0x00000018, DX + JMP int64SumBlockLoop + +int64SumTailLoop: + CMPQ DX, $0x00000004 + JL int64SumDone + VPADDQ (AX), Y0, Y0 + ADDQ $0x00000020, AX + SUBQ $0x00000004, DX + JMP int64SumTailLoop + +int64SumDone: + VPADDQ Y0, Y1, Y0 + VPADDQ Y0, Y2, Y0 + VPADDQ Y0, Y3, Y0 + VPADDQ Y0, Y4, Y0 + VPADDQ Y0, Y5, Y0 + 
VEXTRACTI128 $0x01, Y0, X1 + PADDQ X1, X0 + MOVOU X0, (CX) + RET + +// func float32SumAvx2Asm(x []float32, r []float32) +// Requires: AVX, AVX2, SSE, SSE2 +TEXT ·float32SumAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + +float32SumBlockLoop: + CMPQ DX, $0x00000030 + JL float32SumTailLoop + VADDPS (AX), Y0, Y0 + VADDPS 32(AX), Y1, Y1 + VADDPS 64(AX), Y2, Y2 + VADDPS 96(AX), Y3, Y3 + VADDPS 128(AX), Y4, Y4 + VADDPS 160(AX), Y5, Y5 + ADDQ $0x000000c0, AX + SUBQ $0x00000030, DX + JMP float32SumBlockLoop + +float32SumTailLoop: + CMPQ DX, $0x00000004 + JL float32SumDone + VADDPS (AX), Y0, Y0 + ADDQ $0x00000020, AX + SUBQ $0x00000008, DX + JMP float32SumTailLoop + +float32SumDone: + VADDPS Y0, Y1, Y0 + VADDPS Y0, Y2, Y0 + VADDPS Y0, Y3, Y0 + VADDPS Y0, Y4, Y0 + VADDPS Y0, Y5, Y0 + VEXTRACTF128 $0x01, Y0, X1 + ADDPS X1, X0 + MOVOU X0, (CX) + RET + +// func float64SumAvx2Asm(x []float64, r []float64) +// Requires: AVX, AVX2, SSE2 +TEXT ·float64SumAvx2Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXOR Y0, Y0, Y0 + VPXOR Y1, Y1, Y1 + VPXOR Y2, Y2, Y2 + VPXOR Y3, Y3, Y3 + VPXOR Y4, Y4, Y4 + VPXOR Y5, Y5, Y5 + +float64SumBlockLoop: + CMPQ DX, $0x00000018 + JL float64SumTailLoop + VADDPD (AX), Y0, Y0 + VADDPD 32(AX), Y1, Y1 + VADDPD 64(AX), Y2, Y2 + VADDPD 96(AX), Y3, Y3 + VADDPD 128(AX), Y4, Y4 + VADDPD 160(AX), Y5, Y5 + ADDQ $0x000000c0, AX + SUBQ $0x00000018, DX + JMP float64SumBlockLoop + +float64SumTailLoop: + CMPQ DX, $0x00000004 + JL float64SumDone + VADDPD (AX), Y0, Y0 + ADDQ $0x00000020, AX + SUBQ $0x00000004, DX + JMP float64SumTailLoop + +float64SumDone: + VADDPD Y0, Y1, Y0 + VADDPD Y0, Y2, Y0 + VADDPD Y0, Y3, Y0 + VADDPD Y0, Y4, Y0 + VADDPD Y0, Y5, Y0 + VEXTRACTF128 $0x01, Y0, X1 + ADDPD X1, X0 + MOVOU X0, (CX) + RET diff --git 
a/pkg/vectorize/sum/avx2_stubs.go b/pkg/vectorize/sum/avx2_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..1f6ce6375d25b4c157ed5887551a7d9d2911ac91 --- /dev/null +++ b/pkg/vectorize/sum/avx2_stubs.go @@ -0,0 +1,15 @@ +// Code generated by command: go run avx2.go -out sum/avx2.s -stubs sum/avx2_stubs.go. DO NOT EDIT. + +package sum + +func int8SumAvx2Asm(x []int8, r []int8) + +func int16SumAvx2Asm(x []int16, r []int16) + +func int32SumAvx2Asm(x []int32, r []int32) + +func int64SumAvx2Asm(x []int64, r []int64) + +func float32SumAvx2Asm(x []float32, r []float32) + +func float64SumAvx2Asm(x []float64, r []float64) diff --git a/pkg/vectorize/sum/avx512.s b/pkg/vectorize/sum/avx512.s new file mode 100644 index 0000000000000000000000000000000000000000..67eed2162389fe7972baa3accce6e12a19d84728 --- /dev/null +++ b/pkg/vectorize/sum/avx512.s @@ -0,0 +1,393 @@ +// Code generated by command: go run avx512.go -out sum/avx512.s -stubs sum/avx512_stubs.go. DO NOT EDIT. + +#include "textflag.h" + +// func int8SumAvx512Asm(x []int8, r []int8) +// Requires: AVX2, AVX512BW, AVX512F, SSE2 +TEXT ·int8SumAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXORD Z0, Z0, Z0 + VPXORD Z1, Z1, Z1 + VPXORD Z2, Z2, Z2 + VPXORD Z3, Z3, Z3 + VPXORD Z4, Z4, Z4 + VPXORD Z5, Z5, Z5 + VPXORD Z6, Z6, Z6 + VPXORD Z7, Z7, Z7 + VPXORD Z8, Z8, Z8 + VPXORD Z9, Z9, Z9 + VPXORD Z10, Z10, Z10 + VPXORD Z11, Z11, Z11 + +int8SumBlockLoop: + CMPQ DX, $0x00000300 + JL int8SumTailLoop + VPADDB (AX), Z0, Z0 + VPADDB 64(AX), Z1, Z1 + VPADDB 128(AX), Z2, Z2 + VPADDB 192(AX), Z3, Z3 + VPADDB 256(AX), Z4, Z4 + VPADDB 320(AX), Z5, Z5 + VPADDB 384(AX), Z6, Z6 + VPADDB 448(AX), Z7, Z7 + VPADDB 512(AX), Z8, Z8 + VPADDB 576(AX), Z9, Z9 + VPADDB 640(AX), Z10, Z10 + VPADDB 704(AX), Z11, Z11 + ADDQ $0x00000300, AX + SUBQ $0x00000300, DX + JMP int8SumBlockLoop + +int8SumTailLoop: + CMPQ DX, $0x00000004 + JL int8SumDone + VPADDB (AX), Z0, Z0 + 
ADDQ $0x00000040, AX + SUBQ $0x00000040, DX + JMP int8SumTailLoop + +int8SumDone: + VPADDB Z0, Z1, Z0 + VPADDB Z0, Z2, Z0 + VPADDB Z0, Z3, Z0 + VPADDB Z0, Z4, Z0 + VPADDB Z0, Z5, Z0 + VPADDB Z0, Z6, Z0 + VPADDB Z0, Z7, Z0 + VPADDB Z0, Z8, Z0 + VPADDB Z0, Z9, Z0 + VPADDB Z0, Z10, Z0 + VPADDB Z0, Z11, Z0 + VEXTRACTI64X4 $0x01, Z0, Y1 + VPADDB Y1, Y0, Y0 + VEXTRACTI128 $0x01, Y0, X1 + PADDB X1, X0 + MOVOU X0, (CX) + RET + +// func int16SumAvx512Asm(x []int16, r []int16) +// Requires: AVX2, AVX512BW, AVX512F, SSE2 +TEXT ·int16SumAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXORD Z0, Z0, Z0 + VPXORD Z1, Z1, Z1 + VPXORD Z2, Z2, Z2 + VPXORD Z3, Z3, Z3 + VPXORD Z4, Z4, Z4 + VPXORD Z5, Z5, Z5 + VPXORD Z6, Z6, Z6 + VPXORD Z7, Z7, Z7 + VPXORD Z8, Z8, Z8 + VPXORD Z9, Z9, Z9 + VPXORD Z10, Z10, Z10 + VPXORD Z11, Z11, Z11 + +int16SumBlockLoop: + CMPQ DX, $0x00000180 + JL int16SumTailLoop + VPADDW (AX), Z0, Z0 + VPADDW 64(AX), Z1, Z1 + VPADDW 128(AX), Z2, Z2 + VPADDW 192(AX), Z3, Z3 + VPADDW 256(AX), Z4, Z4 + VPADDW 320(AX), Z5, Z5 + VPADDW 384(AX), Z6, Z6 + VPADDW 448(AX), Z7, Z7 + VPADDW 512(AX), Z8, Z8 + VPADDW 576(AX), Z9, Z9 + VPADDW 640(AX), Z10, Z10 + VPADDW 704(AX), Z11, Z11 + ADDQ $0x00000300, AX + SUBQ $0x00000180, DX + JMP int16SumBlockLoop + +int16SumTailLoop: + CMPQ DX, $0x00000004 + JL int16SumDone + VPADDW (AX), Z0, Z0 + ADDQ $0x00000040, AX + SUBQ $0x00000020, DX + JMP int16SumTailLoop + +int16SumDone: + VPADDW Z0, Z1, Z0 + VPADDW Z0, Z2, Z0 + VPADDW Z0, Z3, Z0 + VPADDW Z0, Z4, Z0 + VPADDW Z0, Z5, Z0 + VPADDW Z0, Z6, Z0 + VPADDW Z0, Z7, Z0 + VPADDW Z0, Z8, Z0 + VPADDW Z0, Z9, Z0 + VPADDW Z0, Z10, Z0 + VPADDW Z0, Z11, Z0 + VEXTRACTI64X4 $0x01, Z0, Y1 + VPADDW Y1, Y0, Y0 + VEXTRACTI128 $0x01, Y0, X1 + PADDW X1, X0 + MOVOU X0, (CX) + RET + +// func int32SumAvx512Asm(x []int32, r []int32) +// Requires: AVX2, AVX512F, SSE2 +TEXT ·int32SumAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ 
r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXORD Z0, Z0, Z0 + VPXORD Z1, Z1, Z1 + VPXORD Z2, Z2, Z2 + VPXORD Z3, Z3, Z3 + VPXORD Z4, Z4, Z4 + VPXORD Z5, Z5, Z5 + VPXORD Z6, Z6, Z6 + VPXORD Z7, Z7, Z7 + VPXORD Z8, Z8, Z8 + VPXORD Z9, Z9, Z9 + VPXORD Z10, Z10, Z10 + VPXORD Z11, Z11, Z11 + +int32SumBlockLoop: + CMPQ DX, $0x000000c0 + JL int32SumTailLoop + VPADDD (AX), Z0, Z0 + VPADDD 64(AX), Z1, Z1 + VPADDD 128(AX), Z2, Z2 + VPADDD 192(AX), Z3, Z3 + VPADDD 256(AX), Z4, Z4 + VPADDD 320(AX), Z5, Z5 + VPADDD 384(AX), Z6, Z6 + VPADDD 448(AX), Z7, Z7 + VPADDD 512(AX), Z8, Z8 + VPADDD 576(AX), Z9, Z9 + VPADDD 640(AX), Z10, Z10 + VPADDD 704(AX), Z11, Z11 + ADDQ $0x00000300, AX + SUBQ $0x000000c0, DX + JMP int32SumBlockLoop + +int32SumTailLoop: + CMPQ DX, $0x00000004 + JL int32SumDone + VPADDD (AX), Z0, Z0 + ADDQ $0x00000040, AX + SUBQ $0x00000010, DX + JMP int32SumTailLoop + +int32SumDone: + VPADDD Z0, Z1, Z0 + VPADDD Z0, Z2, Z0 + VPADDD Z0, Z3, Z0 + VPADDD Z0, Z4, Z0 + VPADDD Z0, Z5, Z0 + VPADDD Z0, Z6, Z0 + VPADDD Z0, Z7, Z0 + VPADDD Z0, Z8, Z0 + VPADDD Z0, Z9, Z0 + VPADDD Z0, Z10, Z0 + VPADDD Z0, Z11, Z0 + VEXTRACTI64X4 $0x01, Z0, Y1 + VPADDD Y1, Y0, Y0 + VEXTRACTI128 $0x01, Y0, X1 + PADDD X1, X0 + MOVOU X0, (CX) + RET + +// func int64SumAvx512Asm(x []int64, r []int64) +// Requires: AVX2, AVX512F, SSE2 +TEXT ·int64SumAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXORD Z0, Z0, Z0 + VPXORD Z1, Z1, Z1 + VPXORD Z2, Z2, Z2 + VPXORD Z3, Z3, Z3 + VPXORD Z4, Z4, Z4 + VPXORD Z5, Z5, Z5 + VPXORD Z6, Z6, Z6 + VPXORD Z7, Z7, Z7 + VPXORD Z8, Z8, Z8 + VPXORD Z9, Z9, Z9 + VPXORD Z10, Z10, Z10 + VPXORD Z11, Z11, Z11 + +int64SumBlockLoop: + CMPQ DX, $0x00000060 + JL int64SumTailLoop + VPADDQ (AX), Z0, Z0 + VPADDQ 64(AX), Z1, Z1 + VPADDQ 128(AX), Z2, Z2 + VPADDQ 192(AX), Z3, Z3 + VPADDQ 256(AX), Z4, Z4 + VPADDQ 320(AX), Z5, Z5 + VPADDQ 384(AX), Z6, Z6 + VPADDQ 448(AX), Z7, Z7 + VPADDQ 512(AX), Z8, Z8 + VPADDQ 576(AX), Z9, Z9 + 
VPADDQ 640(AX), Z10, Z10 + VPADDQ 704(AX), Z11, Z11 + ADDQ $0x00000300, AX + SUBQ $0x00000060, DX + JMP int64SumBlockLoop + +int64SumTailLoop: + CMPQ DX, $0x00000004 + JL int64SumDone + VPADDQ (AX), Z0, Z0 + ADDQ $0x00000040, AX + SUBQ $0x00000008, DX + JMP int64SumTailLoop + +int64SumDone: + VPADDQ Z0, Z1, Z0 + VPADDQ Z0, Z2, Z0 + VPADDQ Z0, Z3, Z0 + VPADDQ Z0, Z4, Z0 + VPADDQ Z0, Z5, Z0 + VPADDQ Z0, Z6, Z0 + VPADDQ Z0, Z7, Z0 + VPADDQ Z0, Z8, Z0 + VPADDQ Z0, Z9, Z0 + VPADDQ Z0, Z10, Z0 + VPADDQ Z0, Z11, Z0 + VEXTRACTI64X4 $0x01, Z0, Y1 + VPADDQ Y1, Y0, Y0 + VEXTRACTI128 $0x01, Y0, X1 + PADDQ X1, X0 + MOVOU X0, (CX) + RET + +// func float32SumAvx512Asm(x []float32, r []float32) +// Requires: AVX, AVX2, AVX512F, SSE, SSE2 +TEXT ·float32SumAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXORD Z0, Z0, Z0 + VPXORD Z1, Z1, Z1 + VPXORD Z2, Z2, Z2 + VPXORD Z3, Z3, Z3 + VPXORD Z4, Z4, Z4 + VPXORD Z5, Z5, Z5 + VPXORD Z6, Z6, Z6 + VPXORD Z7, Z7, Z7 + VPXORD Z8, Z8, Z8 + VPXORD Z9, Z9, Z9 + VPXORD Z10, Z10, Z10 + VPXORD Z11, Z11, Z11 + +float32SumBlockLoop: + CMPQ DX, $0x000000c0 + JL float32SumTailLoop + VADDPS (AX), Z0, Z0 + VADDPS 64(AX), Z1, Z1 + VADDPS 128(AX), Z2, Z2 + VADDPS 192(AX), Z3, Z3 + VADDPS 256(AX), Z4, Z4 + VADDPS 320(AX), Z5, Z5 + VADDPS 384(AX), Z6, Z6 + VADDPS 448(AX), Z7, Z7 + VADDPS 512(AX), Z8, Z8 + VADDPS 576(AX), Z9, Z9 + VADDPS 640(AX), Z10, Z10 + VADDPS 704(AX), Z11, Z11 + ADDQ $0x00000300, AX + SUBQ $0x000000c0, DX + JMP float32SumBlockLoop + +float32SumTailLoop: + CMPQ DX, $0x00000004 + JL float32SumDone + VADDPS (AX), Z0, Z0 + ADDQ $0x00000040, AX + SUBQ $0x00000010, DX + JMP float32SumTailLoop + +float32SumDone: + VADDPS Z0, Z1, Z0 + VADDPS Z0, Z2, Z0 + VADDPS Z0, Z3, Z0 + VADDPS Z0, Z4, Z0 + VADDPS Z0, Z5, Z0 + VADDPS Z0, Z6, Z0 + VADDPS Z0, Z7, Z0 + VADDPS Z0, Z8, Z0 + VADDPS Z0, Z9, Z0 + VADDPS Z0, Z10, Z0 + VADDPS Z0, Z11, Z0 + VEXTRACTF32X8 $0x01, Z0, Y1 + VADDPS Y1, Y0, Y0 + 
VEXTRACTF128 $0x01, Y0, X1 + ADDPS X1, X0 + MOVOU X0, (CX) + RET + +// func float64SumAvx512Asm(x []float64, r []float64) +// Requires: AVX, AVX2, AVX512F, SSE2 +TEXT ·float64SumAvx512Asm(SB), NOSPLIT, $0-48 + MOVQ x_base+0(FP), AX + MOVQ r_base+24(FP), CX + MOVQ x_len+8(FP), DX + VPXORD Z0, Z0, Z0 + VPXORD Z1, Z1, Z1 + VPXORD Z2, Z2, Z2 + VPXORD Z3, Z3, Z3 + VPXORD Z4, Z4, Z4 + VPXORD Z5, Z5, Z5 + VPXORD Z6, Z6, Z6 + VPXORD Z7, Z7, Z7 + VPXORD Z8, Z8, Z8 + VPXORD Z9, Z9, Z9 + VPXORD Z10, Z10, Z10 + VPXORD Z11, Z11, Z11 + +float64SumBlockLoop: + CMPQ DX, $0x00000060 + JL float64SumTailLoop + VADDPD (AX), Z0, Z0 + VADDPD 64(AX), Z1, Z1 + VADDPD 128(AX), Z2, Z2 + VADDPD 192(AX), Z3, Z3 + VADDPD 256(AX), Z4, Z4 + VADDPD 320(AX), Z5, Z5 + VADDPD 384(AX), Z6, Z6 + VADDPD 448(AX), Z7, Z7 + VADDPD 512(AX), Z8, Z8 + VADDPD 576(AX), Z9, Z9 + VADDPD 640(AX), Z10, Z10 + VADDPD 704(AX), Z11, Z11 + ADDQ $0x00000300, AX + SUBQ $0x00000060, DX + JMP float64SumBlockLoop + +float64SumTailLoop: + CMPQ DX, $0x00000004 + JL float64SumDone + VADDPD (AX), Z0, Z0 + ADDQ $0x00000040, AX + SUBQ $0x00000008, DX + JMP float64SumTailLoop + +float64SumDone: + VADDPD Z0, Z1, Z0 + VADDPD Z0, Z2, Z0 + VADDPD Z0, Z3, Z0 + VADDPD Z0, Z4, Z0 + VADDPD Z0, Z5, Z0 + VADDPD Z0, Z6, Z0 + VADDPD Z0, Z7, Z0 + VADDPD Z0, Z8, Z0 + VADDPD Z0, Z9, Z0 + VADDPD Z0, Z10, Z0 + VADDPD Z0, Z11, Z0 + VEXTRACTF64X4 $0x01, Z0, Y1 + VADDPD Y1, Y0, Y0 + VEXTRACTF128 $0x01, Y0, X1 + ADDPD X1, X0 + MOVOU X0, (CX) + RET diff --git a/pkg/vectorize/sum/avx512_stubs.go b/pkg/vectorize/sum/avx512_stubs.go new file mode 100644 index 0000000000000000000000000000000000000000..ceec9b7fbcd3f90159280f4014f41bf9cfbb62c3 --- /dev/null +++ b/pkg/vectorize/sum/avx512_stubs.go @@ -0,0 +1,15 @@ +// Code generated by command: go run avx512.go -out sum/avx512.s -stubs sum/avx512_stubs.go. DO NOT EDIT. 
+ +package sum + +func int8SumAvx512Asm(x []int8, r []int8) + +func int16SumAvx512Asm(x []int16, r []int16) + +func int32SumAvx512Asm(x []int32, r []int32) + +func int64SumAvx512Asm(x []int64, r []int64) + +func float32SumAvx512Asm(x []float32, r []float32) + +func float64SumAvx512Asm(x []float64, r []float64) diff --git a/pkg/vectorize/sum/sum.go b/pkg/vectorize/sum/sum.go index 72cb1838c0fa153a4c259ebb0776ff3dde083cd7..9537e2d65dc5642da9555ca00c6512f604ec06b4 100644 --- a/pkg/vectorize/sum/sum.go +++ b/pkg/vectorize/sum/sum.go @@ -1,95 +1,508 @@ package sum -import "matrixbase/pkg/internal/cpu" +import ( + "golang.org/x/sys/cpu" +) var ( - i64Sum func([]int64) int64 - f64Sum func([]float64) float64 - i64SumSels func([]int64, []int64) int64 - f64SumSels func([]float64, []int64) float64 + int8Sum func([]int8) int8 + int8SumSels func([]int8, []int64) int8 + int16Sum func([]int16) int16 + int16SumSels func([]int16, []int64) int16 + int32Sum func([]int32) int32 + int32SumSels func([]int32, []int64) int32 + int64Sum func([]int64) int64 + int64SumSels func([]int64, []int64) int64 + uint8Sum func([]uint8) uint8 + uint8SumSels func([]uint8, []int64) uint8 + uint16Sum func([]uint16) uint16 + uint16SumSels func([]uint16, []int64) uint16 + uint32Sum func([]uint32) uint32 + uint32SumSels func([]uint32, []int64) uint32 + uint64Sum func([]uint64) uint64 + uint64SumSels func([]uint64, []int64) uint64 + float32Sum func([]float32) float32 + float32SumSels func([]float32, []int64) float32 + float64Sum func([]float64) float64 + float64SumSels func([]float64, []int64) float64 ) func init() { - if cpu.X86.HasAVX2 { - i64Sum = i64SumAvx - f64Sum = f64SumAvx + if cpu.X86.HasAVX512 { + int8Sum = int8SumAvx512 + int16Sum = int16SumAvx512 + int32Sum = int32SumAvx512 + int64Sum = int64SumAvx512 + float32Sum = float32SumAvx512 + float64Sum = float64SumAvx512 + } else if cpu.X86.HasAVX2 { + int8Sum = int8SumAvx2 + int16Sum = int16SumAvx2 + int32Sum = int32SumAvx2 + int64Sum = int64SumAvx2 + 
float32Sum = float32SumAvx2 + float64Sum = float64SumAvx2 } else { - i64Sum = i64SumPure - f64Sum = f64SumPure + int8Sum = int8SumPure + int16Sum = int16SumPure + int32Sum = int32SumPure + int64Sum = int64SumPure + float32Sum = float32SumPure + float64Sum = float64SumPure + } + uint8Sum = uint8SumPure + uint16Sum = uint16SumPure + uint32Sum = uint32SumPure + uint64Sum = uint64SumPure + + int8SumSels = int8SumSelsPure + int16SumSels = int16SumSelsPure + int32SumSels = int32SumSelsPure + int64SumSels = int64SumSelsPure + uint8SumSels = uint8SumSelsPure + uint16SumSels = uint16SumSelsPure + uint32SumSels = uint32SumSelsPure + uint64SumSels = uint64SumSelsPure + float32SumSels = float32SumSelsPure + float64SumSels = float64SumSelsPure +} + +func Int8Sum(xs []int8) int8 { + return int8Sum(xs) +} + +func int8SumPure(xs []int8) int8 { + var res int8 + + for _, x := range xs { + res += x + } + return res +} + +func int8SumAvx2(xs []int8) int8 { + const regItems int = 32 / 1 + n := len(xs) / regItems + var rs [16]int8 + int8SumAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 16; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res +} + +func int8SumAvx512(xs []int8) int8 { + const regItems int = 64 / 1 + n := len(xs) / regItems + var rs [16]int8 + int8SumAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 16; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res +} + +func Int8SumSels(xs []int8, sels []int64) int8 { + return int8SumSels(xs, sels) +} + +func int8SumSelsPure(xs []int8, sels []int64) int8 { + var res int8 + + for _, sel := range sels { + res += xs[sel] + } + return res +} + +func Int16Sum(xs []int16) int16 { + return int16Sum(xs) +} + +func int16SumPure(xs []int16) int16 { + var res int16 + + for _, x := range xs { + res += x + } + return res +} + +func int16SumAvx2(xs []int16) int16 { + const regItems int = 32 / 2 + n := 
len(xs) / regItems + var rs [8]int16 + int16SumAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 8; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res +} + +func int16SumAvx512(xs []int16) int16 { + const regItems int = 64 / 2 + n := len(xs) / regItems + var rs [8]int16 + int16SumAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 8; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] } - i64SumSels = i64SumSelsPure - f64SumSels = f64SumSelsPure + return res } -func iSumAvx([]int64) int64 -func fSumAvx([]float64) float64 +func Int16SumSels(xs []int16, sels []int64) int16 { + return int16SumSels(xs, sels) +} + +func int16SumSelsPure(xs []int16, sels []int64) int16 { + var res int16 + + for _, sel := range sels { + res += xs[sel] + } + return res +} + +func Int32Sum(xs []int32) int32 { + return int32Sum(xs) +} -func I64Sum(xs []int64) int64 { - return i64Sum(xs) +func int32SumPure(xs []int32) int32 { + var res int32 + + for _, x := range xs { + res += x + } + return res } -func F64Sum(xs []float64) float64 { - return f64Sum(xs) +func int32SumAvx2(xs []int32) int32 { + const regItems int = 32 / 4 + n := len(xs) / regItems + var rs [4]int32 + int32SumAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res } -func I64SumSels(xs []int64, sels []int64) int64 { - return i64SumSels(xs, sels) +func int32SumAvx512(xs []int32) int32 { + const regItems int = 64 / 4 + n := len(xs) / regItems + var rs [4]int32 + int32SumAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res } -func F64SumSels(xs []float64, sels []int64) float64 { - return f64SumSels(xs, sels) +func Int32SumSels(xs []int32, sels []int64) int32 { + return 
int32SumSels(xs, sels) } -func i64SumAvx(xs []int64) int64 { - n := len(xs) / 8 - r := iSumAvx(xs[:n*8]) - for i, j := n*8, len(xs); i < j; i++ { - r += xs[i] +func int32SumSelsPure(xs []int32, sels []int64) int32 { + var res int32 + + for _, sel := range sels { + res += xs[sel] } - return r + return res } -func i64SumPure(xs []int64) int64 { - var sum int64 +func Int64Sum(xs []int64) int64 { + return int64Sum(xs) +} + +func int64SumPure(xs []int64) int64 { + var res int64 for _, x := range xs { - sum += x + res += x } - return sum + return res } -func f64SumAvx(xs []float64) float64 { - n := len(xs) / 8 - r := fSumAvx(xs[:n*8]) - for i, j := n*8, len(xs); i < j; i++ { - r += xs[i] +func int64SumAvx2(xs []int64) int64 { + const regItems int = 32 / 8 + n := len(xs) / regItems + var rs [2]int64 + int64SumAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 2; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] } - return r + return res } -func f64SumPure(xs []float64) float64 { - var sum float64 +func int64SumAvx512(xs []int64) int64 { + const regItems int = 64 / 8 + n := len(xs) / regItems + var rs [2]int64 + int64SumAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 2; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res +} + +func Int64SumSels(xs []int64, sels []int64) int64 { + return int64SumSels(xs, sels) +} + +func int64SumSelsPure(xs []int64, sels []int64) int64 { + var res int64 + + for _, sel := range sels { + res += xs[sel] + } + return res +} + +func Uint8Sum(xs []uint8) uint8 { + return uint8Sum(xs) +} + +func uint8SumPure(xs []uint8) uint8 { + var res uint8 for _, x := range xs { - sum += x + res += x } - return sum + return res +} + +func Uint8SumSels(xs []uint8, sels []int64) uint8 { + return uint8SumSels(xs, sels) } -func i64SumSelsPure(xs []int64, sels []int64) int64 { - var sum int64 +func uint8SumSelsPure(xs []uint8, sels []int64) 
uint8 { + var res uint8 for _, sel := range sels { - sum += xs[sel] + res += xs[sel] } - return sum + return res +} + +func Uint16Sum(xs []uint16) uint16 { + return uint16Sum(xs) +} + +func uint16SumPure(xs []uint16) uint16 { + var res uint16 + + for _, x := range xs { + res += x + } + return res +} + +func Uint16SumSels(xs []uint16, sels []int64) uint16 { + return uint16SumSels(xs, sels) +} + +func uint16SumSelsPure(xs []uint16, sels []int64) uint16 { + var res uint16 + + for _, sel := range sels { + res += xs[sel] + } + return res +} + +func Uint32Sum(xs []uint32) uint32 { + return uint32Sum(xs) +} + +func uint32SumPure(xs []uint32) uint32 { + var res uint32 + + for _, x := range xs { + res += x + } + return res +} + +func Uint32SumSels(xs []uint32, sels []int64) uint32 { + return uint32SumSels(xs, sels) +} + +func uint32SumSelsPure(xs []uint32, sels []int64) uint32 { + var res uint32 + + for _, sel := range sels { + res += xs[sel] + } + return res +} + +func Uint64Sum(xs []uint64) uint64 { + return uint64Sum(xs) +} + +func uint64SumPure(xs []uint64) uint64 { + var res uint64 + + for _, x := range xs { + res += x + } + return res +} + +func Uint64SumSels(xs []uint64, sels []int64) uint64 { + return uint64SumSels(xs, sels) +} + +func uint64SumSelsPure(xs []uint64, sels []int64) uint64 { + var res uint64 + + for _, sel := range sels { + res += xs[sel] + } + return res +} + +func Float32Sum(xs []float32) float32 { + return float32Sum(xs) +} + +func float32SumPure(xs []float32) float32 { + var res float32 + + for _, x := range xs { + res += x + } + return res +} + +func float32SumAvx2(xs []float32) float32 { + const regItems int = 32 / 4 + n := len(xs) / regItems + var rs [4]float32 + float32SumAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res +} + +func float32SumAvx512(xs []float32) float32 { + const regItems int = 64 / 4 + n := len(xs) / 
regItems + var rs [4]float32 + float32SumAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 4; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res +} + +func Float32SumSels(xs []float32, sels []int64) float32 { + return float32SumSels(xs, sels) +} + +func float32SumSelsPure(xs []float32, sels []int64) float32 { + var res float32 + + for _, sel := range sels { + res += xs[sel] + } + return res +} + +func Float64Sum(xs []float64) float64 { + return float64Sum(xs) +} + +func float64SumPure(xs []float64) float64 { + var res float64 + + for _, x := range xs { + res += x + } + return res +} + +func float64SumAvx2(xs []float64) float64 { + const regItems int = 32 / 8 + n := len(xs) / regItems + var rs [2]float64 + float64SumAvx2Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 2; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res +} + +func float64SumAvx512(xs []float64) float64 { + const regItems int = 64 / 8 + n := len(xs) / regItems + var rs [2]float64 + float64SumAvx512Asm(xs[:n*regItems], rs[:]) + res := rs[0] + for i := 1; i < 2; i++ { + res += rs[i] + } + for i, j := n*regItems, len(xs); i < j; i++ { + res += xs[i] + } + return res +} + +func Float64SumSels(xs []float64, sels []int64) float64 { + return float64SumSels(xs, sels) } -func f64SumSelsPure(xs []float64, sels []int64) float64 { - var sum float64 +func float64SumSelsPure(xs []float64, sels []int64) float64 { + var res float64 for _, sel := range sels { - sum += xs[sel] + res += xs[sel] } - return sum + return res } diff --git a/pkg/vectorize/sum/sum_test.go b/pkg/vectorize/sum/sum_test.go index 3c14907c100e173b50dacc32a2778faa045dc096..297239b9dccf7c4ac14d027bebcd1b982fa30a31 100644 --- a/pkg/vectorize/sum/sum_test.go +++ b/pkg/vectorize/sum/sum_test.go @@ -23,12 +23,12 @@ func makeFbuffer(l int) []float64 { func TestF64Sum(t *testing.T) { xs := makeFbuffer(10000) - 
fmt.Printf("sum: %v\n", F64Sum(xs)) - fmt.Printf("pure sum: %v\n", f64SumPure(xs)) + fmt.Printf("sum: %v\n", Float64Sum(xs)) + fmt.Printf("pure sum: %v\n", float64SumPure(xs)) } func TestI64Sum(t *testing.T) { xs := makeIbuffer(10000) - fmt.Printf("sum: %v\n", I64Sum(xs)) - fmt.Printf("pure sum: %v\n", i64SumPure(xs)) + fmt.Printf("sum: %v\n", Int64Sum(xs)) + fmt.Printf("pure sum: %v\n", int64SumPure(xs)) }