diff --git a/go.mod b/go.mod index 506c10c74f057ae4c59be10898eae7083b95524c..27ba794f5b71cb58996e3e1598e8f1f79fdf8742 100644 --- a/go.mod +++ b/go.mod @@ -5,9 +5,12 @@ go 1.15 require ( github.com/aws/aws-sdk-go v1.37.14 github.com/klauspost/compress v1.11.7 + github.com/frankban/quicktest v1.11.3 // indirect github.com/mmcloughlin/avo v0.0.0-20210120082657-d60cc025fc3c // indirect github.com/pierrec/lz4 v2.6.0+incompatible github.com/pilosa/pilosa v1.4.0 github.com/traetox/goaio v0.0.0-20171005222435-46641abceb17 - golang.org/x/text v0.3.3 + golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 // indirect + golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f + golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect ) diff --git a/pkg/vectorize/length/avx2.s b/pkg/vectorize/length/avx2.s new file mode 100644 index 0000000000000000000000000000000000000000..769a286b5d7b9132e69207735975215c96efd3a8 --- /dev/null +++ b/pkg/vectorize/length/avx2.s @@ -0,0 +1,68 @@ +// Code generated by command: go run avx2.go -out avx2.s -stubs avx2_stubs.go. DO NOT EDIT. 
+
+#include "textflag.h"
+
+// func strLengthAvx2Asm(x []uint32, r []int64)
+// Requires: AVX, AVX2, SSE2
+TEXT ·strLengthAvx2Asm(SB), NOSPLIT, $0-48
+	MOVQ x_base+0(FP), AX // AX = address of x[0] (uint32 input)
+	MOVQ r_base+24(FP), CX // CX = address of r[0] (int64 output)
+	MOVQ x_len+8(FP), DX // DX = number of input elements still to convert
+
+blockloop: // unrolled path: 12 groups of 4 elements (0x30 = 48) per pass
+	CMPQ DX, $0x00000030
+	JL tailloop // fewer than 48 elements left: continue one group at a time
+	MOVOU (AX), X0 // load 16 bytes = 4 uint32; NOTE(review): legacy-SSE load between VEX ops, VMOVDQU may avoid SSE/AVX transition cost - confirm with a benchmark
+	VPMOVZXDQ X0, Y0 // zero-extend 4 x 32-bit lanes to 4 x 64-bit lanes
+	VMOVDQU Y0, (CX) // store 32 bytes = 4 int64
+	MOVOU 16(AX), X0 // each following group steps 16 bytes of input ...
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 32(CX) // ... and 32 bytes of output
+	MOVOU 32(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 64(CX)
+	MOVOU 48(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 96(CX)
+	MOVOU 64(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 128(CX)
+	MOVOU 80(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 160(CX)
+	MOVOU 96(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 192(CX)
+	MOVOU 112(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 224(CX)
+	MOVOU 128(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 256(CX)
+	MOVOU 144(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 288(CX)
+	MOVOU 160(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 320(CX)
+	MOVOU 176(AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, 352(CX)
+	ADDQ $0x000000c0, AX // advance input by 192 bytes = 48 uint32
+	ADDQ $0x00000180, CX // advance output by 384 bytes = 48 int64
+	SUBQ $0x00000030, DX // 48 elements consumed
+	JMP blockloop
+
+tailloop: // one group of 4 elements per pass
+	CMPQ DX, $0x00000004
+	JL done // the last 0-3 elements are not converted here
+	MOVOU (AX), X0
+	VPMOVZXDQ X0, Y0
+	VMOVDQU Y0, (CX)
+	ADDQ $0x00000010, AX // 16 bytes = 4 uint32
+	ADDQ $0x00000020, CX // 32 bytes = 4 int64
+	SUBQ $0x00000004, DX
+	JMP tailloop
+
+done:
+	RET
diff --git a/pkg/vectorize/length/avx2_stubs.go b/pkg/vectorize/length/avx2_stubs.go
new file mode 100644
index 0000000000000000000000000000000000000000..b5d7dd9571ab93d53c61dea09109e6f173b04fa9
--- /dev/null
+++ b/pkg/vectorize/length/avx2_stubs.go
@@ -0,0 +1,5 @@
+// Code generated by command: go run avx2.go -out avx2.s -stubs avx2_stubs.go. DO NOT EDIT.
+
+package length
+
+func strLengthAvx2Asm(x []uint32, r []int64)
diff --git a/pkg/vectorize/length/avx512.s b/pkg/vectorize/length/avx512.s
new file mode 100644
index 0000000000000000000000000000000000000000..ecab3f25c61e67efc0b1179e84a19b80e42f9e58
--- /dev/null
+++ b/pkg/vectorize/length/avx512.s
@@ -0,0 +1,68 @@
+// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+// func strLengthAvx512Asm(x []uint32, r []int64)
+// Requires: AVX, AVX512F
+TEXT ·strLengthAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ x_base+0(FP), AX // AX = address of x[0] (uint32 input)
+	MOVQ r_base+24(FP), CX // CX = address of r[0] (int64 output)
+	MOVQ x_len+8(FP), DX // DX = number of input elements still to convert
+
+blockloop: // unrolled path: 12 groups of 8 elements (0x60 = 96) per pass
+	CMPQ DX, $0x00000060
+	JL tailloop // fewer than 96 elements left: continue one group at a time
+	VMOVDQU (AX), Y0 // load 32 bytes = 8 uint32
+	VPMOVZXDQ Y0, Z0 // zero-extend 8 x 32-bit lanes to 8 x 64-bit lanes
+	VMOVDQU64 Z0, (CX) // store 64 bytes = 8 int64
+	VMOVDQU 32(AX), Y0 // groups step 32 bytes of input (was 16, which overlapped groups) ...
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 64(CX) // ... and 64 bytes of output (was 32), so all 96 outputs get written
+	VMOVDQU 64(AX), Y0 // NOTE(review): the generator (avx512.go, not visible here) presumably needs the same stride fix
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 128(CX)
+	VMOVDQU 96(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 192(CX)
+	VMOVDQU 128(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 256(CX)
+	VMOVDQU 160(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 320(CX)
+	VMOVDQU 192(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 384(CX)
+	VMOVDQU 224(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 448(CX)
+	VMOVDQU 256(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 512(CX)
+	VMOVDQU 288(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 576(CX)
+	VMOVDQU 320(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 640(CX)
+	VMOVDQU 352(AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, 704(CX)
+	ADDQ $0x00000180, AX // advance input by 384 bytes = 96 uint32
+	ADDQ $0x00000300, CX // advance output by 768 bytes = 96 int64
+	SUBQ $0x00000060, DX // 96 elements consumed
+	JMP blockloop
+
+tailloop: // one group of 8 elements per pass
+	CMPQ DX, $0x00000008
+	JL done // the last 0-7 elements are not converted here
+	VMOVDQU (AX), Y0
+	VPMOVZXDQ Y0, Z0
+	VMOVDQU64 Z0, (CX)
+	ADDQ $0x00000020, AX // 32 bytes = 8 uint32
+	ADDQ $0x00000040, CX // 64 bytes = 8 int64
+	SUBQ $0x00000008, DX
+	JMP tailloop
+
+done:
+	RET
diff --git a/pkg/vectorize/length/avx512_stubs.go b/pkg/vectorize/length/avx512_stubs.go
new file mode 100644
index 0000000000000000000000000000000000000000..11f0131a1137eb5a3d2db0e6f3940d02854fb2f4
--- /dev/null
+++ 
b/pkg/vectorize/length/avx512_stubs.go
@@ -0,0 +1,5 @@
+// Code generated by command: go run avx512.go -out avx512.s -stubs avx512_stubs.go. DO NOT EDIT.
+
+package length
+
+func strLengthAvx512Asm(x []uint32, r []int64)
diff --git a/pkg/vectorize/length/length.go b/pkg/vectorize/length/length.go
index 4b33315111ebd67996a9b64504e0f0d8ece1145b..afb74c68b282f97b552c161eac3d3917a8942172 100644
--- a/pkg/vectorize/length/length.go
+++ b/pkg/vectorize/length/length.go
@@ -1,21 +1,65 @@
 package length
 
-import "matrixbase/pkg/container/vector"
+import (
+	"matrixbase/pkg/container/types"
+
+	"golang.org/x/sys/cpu"
+)
 
 var (
-	bytesLength func(*vector.Bytes, []int64) []int64
+	// strLength is the kernel chosen by init for the running CPU.
+	strLength func(*types.Bytes, []int64) []int64
 )
 
+// init selects the widest kernel the CPU supports: AVX-512F first, then
+// AVX2, then the portable Go loop.
 func init() {
-	bytesLength = bytesLengthPure
+	switch {
+	case cpu.X86.HasAVX512F:
+		// The AVX-512 kernel uses ZMM registers, so it must be gated on
+		// AVX512F; gating it on HasAVX2 would fault on AVX2-only CPUs.
+		strLength = strLengthAvx512
+	case cpu.X86.HasAVX2:
+		strLength = strLengthAvx2
+	default:
+		strLength = strLengthPure
+	}
 }
 
-func BytesLength(xs *vector.Bytes, rs []int64) []int64 {
-	return bytesLength(xs, rs)
+// StrLength stores each entry of xs.Lengths, widened to int64, into rs and
+// returns rs. rs must hold at least len(xs.Lengths) elements: the assembly
+// kernels write through rs without bounds checks.
+func StrLength(xs *types.Bytes, rs []int64) []int64 {
+	return strLength(xs, rs)
+}
+
+// strLengthAvx2 converts full groups of 4 lengths in assembly and the
+// remaining 0-3 lengths in Go.
+func strLengthAvx2(xs *types.Bytes, rs []int64) []int64 {
+	lengths := xs.Lengths
+	n := len(lengths) / 4
+	strLengthAvx2Asm(lengths, rs)
+	for i, j := n*4, len(lengths); i < j; i++ {
+		rs[i] = int64(lengths[i])
+	}
+	return rs
+}
+
+// strLengthAvx512 converts full groups of 8 lengths in assembly and the
+// remaining 0-7 lengths in Go.
+func strLengthAvx512(xs *types.Bytes, rs []int64) []int64 {
+	lengths := xs.Lengths
+	n := len(lengths) / 8
+	strLengthAvx512Asm(lengths, rs)
+	for i, j := n*8, len(lengths); i < j; i++ {
+		rs[i] = int64(lengths[i])
+	}
+	return rs
 }
 
-func bytesLengthPure(xs *vector.Bytes, rs []int64) []int64 {
-	for i, n := range xs.Ns {
+// strLengthPure is the portable fallback: a plain element-wise conversion.
+func strLengthPure(xs *types.Bytes, rs []int64) []int64 {
+	for i, n := range xs.Lengths {
 		rs[i] = int64(n)
 	}
 	return rs