diff --git a/pkg/vectorize/sum/avx2.s b/pkg/vectorize/sum/avx2.s
new file mode 100644
index 0000000000000000000000000000000000000000..b1e8355ed454b081e9ef07401f2e1cdcace73b9d
--- /dev/null
+++ b/pkg/vectorize/sum/avx2.s
@@ -0,0 +1,273 @@
+// Code generated by command: go run avx2.go -out sum/avx2.s -stubs sum/avx2_stubs.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+// func int8SumAvx2Asm(x []int8, r []int8)
+// Requires: AVX, AVX2, SSE2
+TEXT ·int8SumAvx2Asm(SB), NOSPLIT, $0-48
+	MOVQ  x_base+0(FP), AX
+	MOVQ  r_base+24(FP), CX
+	MOVQ  x_len+8(FP), DX
+	VPXOR Y0, Y0, Y0
+	VPXOR Y1, Y1, Y1
+	VPXOR Y2, Y2, Y2
+	VPXOR Y3, Y3, Y3
+	VPXOR Y4, Y4, Y4
+	VPXOR Y5, Y5, Y5
+
+int8SumBlockLoop:
+	CMPQ   DX, $0x000000c0
+	JL     int8SumTailLoop
+	VPADDB (AX), Y0, Y0
+	VPADDB 32(AX), Y1, Y1
+	VPADDB 64(AX), Y2, Y2
+	VPADDB 96(AX), Y3, Y3
+	VPADDB 128(AX), Y4, Y4
+	VPADDB 160(AX), Y5, Y5
+	ADDQ   $0x000000c0, AX
+	SUBQ   $0x000000c0, DX
+	JMP    int8SumBlockLoop
+
+int8SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     int8SumDone
+	VPADDB (AX), Y0, Y0
+	ADDQ   $0x00000020, AX
+	SUBQ   $0x00000020, DX
+	JMP    int8SumTailLoop
+
+int8SumDone:
+	VPADDB       Y0, Y1, Y0
+	VPADDB       Y0, Y2, Y0
+	VPADDB       Y0, Y3, Y0
+	VPADDB       Y0, Y4, Y0
+	VPADDB       Y0, Y5, Y0
+	VEXTRACTI128 $0x01, Y0, X1
+	PADDB        X1, X0
+	MOVOU        X0, (CX)
+	RET
+
+// func int16SumAvx2Asm(x []int16, r []int16)
+// Requires: AVX, AVX2, SSE2
+TEXT ·int16SumAvx2Asm(SB), NOSPLIT, $0-48
+	MOVQ  x_base+0(FP), AX
+	MOVQ  r_base+24(FP), CX
+	MOVQ  x_len+8(FP), DX
+	VPXOR Y0, Y0, Y0
+	VPXOR Y1, Y1, Y1
+	VPXOR Y2, Y2, Y2
+	VPXOR Y3, Y3, Y3
+	VPXOR Y4, Y4, Y4
+	VPXOR Y5, Y5, Y5
+
+int16SumBlockLoop:
+	CMPQ   DX, $0x00000060
+	JL     int16SumTailLoop
+	VPADDW (AX), Y0, Y0
+	VPADDW 32(AX), Y1, Y1
+	VPADDW 64(AX), Y2, Y2
+	VPADDW 96(AX), Y3, Y3
+	VPADDW 128(AX), Y4, Y4
+	VPADDW 160(AX), Y5, Y5
+	ADDQ   $0x000000c0, AX
+	SUBQ   $0x00000060, DX
+	JMP    int16SumBlockLoop
+
+int16SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     int16SumDone
+	VPADDW (AX), Y0, Y0
+	ADDQ   $0x00000020, AX
+	SUBQ   $0x00000010, DX
+	JMP    int16SumTailLoop
+
+int16SumDone:
+	VPADDW       Y0, Y1, Y0
+	VPADDW       Y0, Y2, Y0
+	VPADDW       Y0, Y3, Y0
+	VPADDW       Y0, Y4, Y0
+	VPADDW       Y0, Y5, Y0
+	VEXTRACTI128 $0x01, Y0, X1
+	PADDW        X1, X0
+	MOVOU        X0, (CX)
+	RET
+
+// func int32SumAvx2Asm(x []int32, r []int32)
+// Requires: AVX, AVX2, SSE2
+TEXT ·int32SumAvx2Asm(SB), NOSPLIT, $0-48
+	MOVQ  x_base+0(FP), AX
+	MOVQ  r_base+24(FP), CX
+	MOVQ  x_len+8(FP), DX
+	VPXOR Y0, Y0, Y0
+	VPXOR Y1, Y1, Y1
+	VPXOR Y2, Y2, Y2
+	VPXOR Y3, Y3, Y3
+	VPXOR Y4, Y4, Y4
+	VPXOR Y5, Y5, Y5
+
+int32SumBlockLoop:
+	CMPQ   DX, $0x00000030
+	JL     int32SumTailLoop
+	VPADDD (AX), Y0, Y0
+	VPADDD 32(AX), Y1, Y1
+	VPADDD 64(AX), Y2, Y2
+	VPADDD 96(AX), Y3, Y3
+	VPADDD 128(AX), Y4, Y4
+	VPADDD 160(AX), Y5, Y5
+	ADDQ   $0x000000c0, AX
+	SUBQ   $0x00000030, DX
+	JMP    int32SumBlockLoop
+
+int32SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     int32SumDone
+	VPADDD (AX), Y0, Y0
+	ADDQ   $0x00000020, AX
+	SUBQ   $0x00000008, DX
+	JMP    int32SumTailLoop
+
+int32SumDone:
+	VPADDD       Y0, Y1, Y0
+	VPADDD       Y0, Y2, Y0
+	VPADDD       Y0, Y3, Y0
+	VPADDD       Y0, Y4, Y0
+	VPADDD       Y0, Y5, Y0
+	VEXTRACTI128 $0x01, Y0, X1
+	PADDD        X1, X0
+	MOVOU        X0, (CX)
+	RET
+
+// func int64SumAvx2Asm(x []int64, r []int64)
+// Requires: AVX, AVX2, SSE2
+TEXT ·int64SumAvx2Asm(SB), NOSPLIT, $0-48
+	MOVQ  x_base+0(FP), AX
+	MOVQ  r_base+24(FP), CX
+	MOVQ  x_len+8(FP), DX
+	VPXOR Y0, Y0, Y0
+	VPXOR Y1, Y1, Y1
+	VPXOR Y2, Y2, Y2
+	VPXOR Y3, Y3, Y3
+	VPXOR Y4, Y4, Y4
+	VPXOR Y5, Y5, Y5
+
+int64SumBlockLoop:
+	CMPQ   DX, $0x00000018
+	JL     int64SumTailLoop
+	VPADDQ (AX), Y0, Y0
+	VPADDQ 32(AX), Y1, Y1
+	VPADDQ 64(AX), Y2, Y2
+	VPADDQ 96(AX), Y3, Y3
+	VPADDQ 128(AX), Y4, Y4
+	VPADDQ 160(AX), Y5, Y5
+	ADDQ   $0x000000c0, AX
+	SUBQ   $0x00000018, DX
+	JMP    int64SumBlockLoop
+
+int64SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     int64SumDone
+	VPADDQ (AX), Y0, Y0
+	ADDQ   $0x00000020, AX
+	SUBQ   $0x00000004, DX
+	JMP    int64SumTailLoop
+
+int64SumDone:
+	VPADDQ       Y0, Y1, Y0
+	VPADDQ       Y0, Y2, Y0
+	VPADDQ       Y0, Y3, Y0
+	VPADDQ       Y0, Y4, Y0
+	VPADDQ       Y0, Y5, Y0
+	VEXTRACTI128 $0x01, Y0, X1
+	PADDQ        X1, X0
+	MOVOU        X0, (CX)
+	RET
+
+// func float32SumAvx2Asm(x []float32, r []float32)
+// Requires: AVX, AVX2, SSE, SSE2
+TEXT ·float32SumAvx2Asm(SB), NOSPLIT, $0-48
+	MOVQ  x_base+0(FP), AX
+	MOVQ  r_base+24(FP), CX
+	MOVQ  x_len+8(FP), DX
+	VPXOR Y0, Y0, Y0
+	VPXOR Y1, Y1, Y1
+	VPXOR Y2, Y2, Y2
+	VPXOR Y3, Y3, Y3
+	VPXOR Y4, Y4, Y4
+	VPXOR Y5, Y5, Y5
+
+float32SumBlockLoop:
+	CMPQ   DX, $0x00000030
+	JL     float32SumTailLoop
+	VADDPS (AX), Y0, Y0
+	VADDPS 32(AX), Y1, Y1
+	VADDPS 64(AX), Y2, Y2
+	VADDPS 96(AX), Y3, Y3
+	VADDPS 128(AX), Y4, Y4
+	VADDPS 160(AX), Y5, Y5
+	ADDQ   $0x000000c0, AX
+	SUBQ   $0x00000030, DX
+	JMP    float32SumBlockLoop
+
+float32SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     float32SumDone
+	VADDPS (AX), Y0, Y0
+	ADDQ   $0x00000020, AX
+	SUBQ   $0x00000008, DX
+	JMP    float32SumTailLoop
+
+float32SumDone:
+	VADDPS       Y0, Y1, Y0
+	VADDPS       Y0, Y2, Y0
+	VADDPS       Y0, Y3, Y0
+	VADDPS       Y0, Y4, Y0
+	VADDPS       Y0, Y5, Y0
+	VEXTRACTF128 $0x01, Y0, X1
+	ADDPS        X1, X0
+	MOVOU        X0, (CX)
+	RET
+
+// func float64SumAvx2Asm(x []float64, r []float64)
+// Requires: AVX, AVX2, SSE2
+TEXT ·float64SumAvx2Asm(SB), NOSPLIT, $0-48
+	MOVQ  x_base+0(FP), AX
+	MOVQ  r_base+24(FP), CX
+	MOVQ  x_len+8(FP), DX
+	VPXOR Y0, Y0, Y0
+	VPXOR Y1, Y1, Y1
+	VPXOR Y2, Y2, Y2
+	VPXOR Y3, Y3, Y3
+	VPXOR Y4, Y4, Y4
+	VPXOR Y5, Y5, Y5
+
+float64SumBlockLoop:
+	CMPQ   DX, $0x00000018
+	JL     float64SumTailLoop
+	VADDPD (AX), Y0, Y0
+	VADDPD 32(AX), Y1, Y1
+	VADDPD 64(AX), Y2, Y2
+	VADDPD 96(AX), Y3, Y3
+	VADDPD 128(AX), Y4, Y4
+	VADDPD 160(AX), Y5, Y5
+	ADDQ   $0x000000c0, AX
+	SUBQ   $0x00000018, DX
+	JMP    float64SumBlockLoop
+
+float64SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     float64SumDone
+	VADDPD (AX), Y0, Y0
+	ADDQ   $0x00000020, AX
+	SUBQ   $0x00000004, DX
+	JMP    float64SumTailLoop
+
+float64SumDone:
+	VADDPD       Y0, Y1, Y0
+	VADDPD       Y0, Y2, Y0
+	VADDPD       Y0, Y3, Y0
+	VADDPD       Y0, Y4, Y0
+	VADDPD       Y0, Y5, Y0
+	VEXTRACTF128 $0x01, Y0, X1
+	ADDPD        X1, X0
+	MOVOU        X0, (CX)
+	RET
diff --git a/pkg/vectorize/sum/avx2_stubs.go b/pkg/vectorize/sum/avx2_stubs.go
new file mode 100644
index 0000000000000000000000000000000000000000..1f6ce6375d25b4c157ed5887551a7d9d2911ac91
--- /dev/null
+++ b/pkg/vectorize/sum/avx2_stubs.go
@@ -0,0 +1,15 @@
+// Code generated by command: go run avx2.go -out sum/avx2.s -stubs sum/avx2_stubs.go. DO NOT EDIT.
+
+package sum
+
+func int8SumAvx2Asm(x []int8, r []int8)
+
+func int16SumAvx2Asm(x []int16, r []int16)
+
+func int32SumAvx2Asm(x []int32, r []int32)
+
+func int64SumAvx2Asm(x []int64, r []int64)
+
+func float32SumAvx2Asm(x []float32, r []float32)
+
+func float64SumAvx2Asm(x []float64, r []float64)
diff --git a/pkg/vectorize/sum/avx512.s b/pkg/vectorize/sum/avx512.s
new file mode 100644
index 0000000000000000000000000000000000000000..67eed2162389fe7972baa3accce6e12a19d84728
--- /dev/null
+++ b/pkg/vectorize/sum/avx512.s
@@ -0,0 +1,393 @@
+// Code generated by command: go run avx512.go -out sum/avx512.s -stubs sum/avx512_stubs.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+// func int8SumAvx512Asm(x []int8, r []int8)
+// Requires: AVX2, AVX512BW, AVX512F, SSE2
+TEXT ·int8SumAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ   x_base+0(FP), AX
+	MOVQ   r_base+24(FP), CX
+	MOVQ   x_len+8(FP), DX
+	VPXORD Z0, Z0, Z0
+	VPXORD Z1, Z1, Z1
+	VPXORD Z2, Z2, Z2
+	VPXORD Z3, Z3, Z3
+	VPXORD Z4, Z4, Z4
+	VPXORD Z5, Z5, Z5
+	VPXORD Z6, Z6, Z6
+	VPXORD Z7, Z7, Z7
+	VPXORD Z8, Z8, Z8
+	VPXORD Z9, Z9, Z9
+	VPXORD Z10, Z10, Z10
+	VPXORD Z11, Z11, Z11
+
+int8SumBlockLoop:
+	CMPQ   DX, $0x00000300
+	JL     int8SumTailLoop
+	VPADDB (AX), Z0, Z0
+	VPADDB 64(AX), Z1, Z1
+	VPADDB 128(AX), Z2, Z2
+	VPADDB 192(AX), Z3, Z3
+	VPADDB 256(AX), Z4, Z4
+	VPADDB 320(AX), Z5, Z5
+	VPADDB 384(AX), Z6, Z6
+	VPADDB 448(AX), Z7, Z7
+	VPADDB 512(AX), Z8, Z8
+	VPADDB 576(AX), Z9, Z9
+	VPADDB 640(AX), Z10, Z10
+	VPADDB 704(AX), Z11, Z11
+	ADDQ   $0x00000300, AX
+	SUBQ   $0x00000300, DX
+	JMP    int8SumBlockLoop
+
+int8SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     int8SumDone
+	VPADDB (AX), Z0, Z0
+	ADDQ   $0x00000040, AX
+	SUBQ   $0x00000040, DX
+	JMP    int8SumTailLoop
+
+int8SumDone:
+	VPADDB        Z0, Z1, Z0
+	VPADDB        Z0, Z2, Z0
+	VPADDB        Z0, Z3, Z0
+	VPADDB        Z0, Z4, Z0
+	VPADDB        Z0, Z5, Z0
+	VPADDB        Z0, Z6, Z0
+	VPADDB        Z0, Z7, Z0
+	VPADDB        Z0, Z8, Z0
+	VPADDB        Z0, Z9, Z0
+	VPADDB        Z0, Z10, Z0
+	VPADDB        Z0, Z11, Z0
+	VEXTRACTI64X4 $0x01, Z0, Y1
+	VPADDB        Y1, Y0, Y0
+	VEXTRACTI128  $0x01, Y0, X1
+	PADDB         X1, X0
+	MOVOU         X0, (CX)
+	RET
+
+// func int16SumAvx512Asm(x []int16, r []int16)
+// Requires: AVX2, AVX512BW, AVX512F, SSE2
+TEXT ·int16SumAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ   x_base+0(FP), AX
+	MOVQ   r_base+24(FP), CX
+	MOVQ   x_len+8(FP), DX
+	VPXORD Z0, Z0, Z0
+	VPXORD Z1, Z1, Z1
+	VPXORD Z2, Z2, Z2
+	VPXORD Z3, Z3, Z3
+	VPXORD Z4, Z4, Z4
+	VPXORD Z5, Z5, Z5
+	VPXORD Z6, Z6, Z6
+	VPXORD Z7, Z7, Z7
+	VPXORD Z8, Z8, Z8
+	VPXORD Z9, Z9, Z9
+	VPXORD Z10, Z10, Z10
+	VPXORD Z11, Z11, Z11
+
+int16SumBlockLoop:
+	CMPQ   DX, $0x00000180
+	JL     int16SumTailLoop
+	VPADDW (AX), Z0, Z0
+	VPADDW 64(AX), Z1, Z1
+	VPADDW 128(AX), Z2, Z2
+	VPADDW 192(AX), Z3, Z3
+	VPADDW 256(AX), Z4, Z4
+	VPADDW 320(AX), Z5, Z5
+	VPADDW 384(AX), Z6, Z6
+	VPADDW 448(AX), Z7, Z7
+	VPADDW 512(AX), Z8, Z8
+	VPADDW 576(AX), Z9, Z9
+	VPADDW 640(AX), Z10, Z10
+	VPADDW 704(AX), Z11, Z11
+	ADDQ   $0x00000300, AX
+	SUBQ   $0x00000180, DX
+	JMP    int16SumBlockLoop
+
+int16SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     int16SumDone
+	VPADDW (AX), Z0, Z0
+	ADDQ   $0x00000040, AX
+	SUBQ   $0x00000020, DX
+	JMP    int16SumTailLoop
+
+int16SumDone:
+	VPADDW        Z0, Z1, Z0
+	VPADDW        Z0, Z2, Z0
+	VPADDW        Z0, Z3, Z0
+	VPADDW        Z0, Z4, Z0
+	VPADDW        Z0, Z5, Z0
+	VPADDW        Z0, Z6, Z0
+	VPADDW        Z0, Z7, Z0
+	VPADDW        Z0, Z8, Z0
+	VPADDW        Z0, Z9, Z0
+	VPADDW        Z0, Z10, Z0
+	VPADDW        Z0, Z11, Z0
+	VEXTRACTI64X4 $0x01, Z0, Y1
+	VPADDW        Y1, Y0, Y0
+	VEXTRACTI128  $0x01, Y0, X1
+	PADDW         X1, X0
+	MOVOU         X0, (CX)
+	RET
+
+// func int32SumAvx512Asm(x []int32, r []int32)
+// Requires: AVX2, AVX512F, SSE2
+TEXT ·int32SumAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ   x_base+0(FP), AX
+	MOVQ   r_base+24(FP), CX
+	MOVQ   x_len+8(FP), DX
+	VPXORD Z0, Z0, Z0
+	VPXORD Z1, Z1, Z1
+	VPXORD Z2, Z2, Z2
+	VPXORD Z3, Z3, Z3
+	VPXORD Z4, Z4, Z4
+	VPXORD Z5, Z5, Z5
+	VPXORD Z6, Z6, Z6
+	VPXORD Z7, Z7, Z7
+	VPXORD Z8, Z8, Z8
+	VPXORD Z9, Z9, Z9
+	VPXORD Z10, Z10, Z10
+	VPXORD Z11, Z11, Z11
+
+int32SumBlockLoop:
+	CMPQ   DX, $0x000000c0
+	JL     int32SumTailLoop
+	VPADDD (AX), Z0, Z0
+	VPADDD 64(AX), Z1, Z1
+	VPADDD 128(AX), Z2, Z2
+	VPADDD 192(AX), Z3, Z3
+	VPADDD 256(AX), Z4, Z4
+	VPADDD 320(AX), Z5, Z5
+	VPADDD 384(AX), Z6, Z6
+	VPADDD 448(AX), Z7, Z7
+	VPADDD 512(AX), Z8, Z8
+	VPADDD 576(AX), Z9, Z9
+	VPADDD 640(AX), Z10, Z10
+	VPADDD 704(AX), Z11, Z11
+	ADDQ   $0x00000300, AX
+	SUBQ   $0x000000c0, DX
+	JMP    int32SumBlockLoop
+
+int32SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     int32SumDone
+	VPADDD (AX), Z0, Z0
+	ADDQ   $0x00000040, AX
+	SUBQ   $0x00000010, DX
+	JMP    int32SumTailLoop
+
+int32SumDone:
+	VPADDD        Z0, Z1, Z0
+	VPADDD        Z0, Z2, Z0
+	VPADDD        Z0, Z3, Z0
+	VPADDD        Z0, Z4, Z0
+	VPADDD        Z0, Z5, Z0
+	VPADDD        Z0, Z6, Z0
+	VPADDD        Z0, Z7, Z0
+	VPADDD        Z0, Z8, Z0
+	VPADDD        Z0, Z9, Z0
+	VPADDD        Z0, Z10, Z0
+	VPADDD        Z0, Z11, Z0
+	VEXTRACTI64X4 $0x01, Z0, Y1
+	VPADDD        Y1, Y0, Y0
+	VEXTRACTI128  $0x01, Y0, X1
+	PADDD         X1, X0
+	MOVOU         X0, (CX)
+	RET
+
+// func int64SumAvx512Asm(x []int64, r []int64)
+// Requires: AVX2, AVX512F, SSE2
+TEXT ·int64SumAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ   x_base+0(FP), AX
+	MOVQ   r_base+24(FP), CX
+	MOVQ   x_len+8(FP), DX
+	VPXORD Z0, Z0, Z0
+	VPXORD Z1, Z1, Z1
+	VPXORD Z2, Z2, Z2
+	VPXORD Z3, Z3, Z3
+	VPXORD Z4, Z4, Z4
+	VPXORD Z5, Z5, Z5
+	VPXORD Z6, Z6, Z6
+	VPXORD Z7, Z7, Z7
+	VPXORD Z8, Z8, Z8
+	VPXORD Z9, Z9, Z9
+	VPXORD Z10, Z10, Z10
+	VPXORD Z11, Z11, Z11
+
+int64SumBlockLoop:
+	CMPQ   DX, $0x00000060
+	JL     int64SumTailLoop
+	VPADDQ (AX), Z0, Z0
+	VPADDQ 64(AX), Z1, Z1
+	VPADDQ 128(AX), Z2, Z2
+	VPADDQ 192(AX), Z3, Z3
+	VPADDQ 256(AX), Z4, Z4
+	VPADDQ 320(AX), Z5, Z5
+	VPADDQ 384(AX), Z6, Z6
+	VPADDQ 448(AX), Z7, Z7
+	VPADDQ 512(AX), Z8, Z8
+	VPADDQ 576(AX), Z9, Z9
+	VPADDQ 640(AX), Z10, Z10
+	VPADDQ 704(AX), Z11, Z11
+	ADDQ   $0x00000300, AX
+	SUBQ   $0x00000060, DX
+	JMP    int64SumBlockLoop
+
+int64SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     int64SumDone
+	VPADDQ (AX), Z0, Z0
+	ADDQ   $0x00000040, AX
+	SUBQ   $0x00000008, DX
+	JMP    int64SumTailLoop
+
+int64SumDone:
+	VPADDQ        Z0, Z1, Z0
+	VPADDQ        Z0, Z2, Z0
+	VPADDQ        Z0, Z3, Z0
+	VPADDQ        Z0, Z4, Z0
+	VPADDQ        Z0, Z5, Z0
+	VPADDQ        Z0, Z6, Z0
+	VPADDQ        Z0, Z7, Z0
+	VPADDQ        Z0, Z8, Z0
+	VPADDQ        Z0, Z9, Z0
+	VPADDQ        Z0, Z10, Z0
+	VPADDQ        Z0, Z11, Z0
+	VEXTRACTI64X4 $0x01, Z0, Y1
+	VPADDQ        Y1, Y0, Y0
+	VEXTRACTI128  $0x01, Y0, X1
+	PADDQ         X1, X0
+	MOVOU         X0, (CX)
+	RET
+
+// func float32SumAvx512Asm(x []float32, r []float32)
+// Requires: AVX, AVX2, AVX512F, SSE, SSE2
+TEXT ·float32SumAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ   x_base+0(FP), AX
+	MOVQ   r_base+24(FP), CX
+	MOVQ   x_len+8(FP), DX
+	VPXORD Z0, Z0, Z0
+	VPXORD Z1, Z1, Z1
+	VPXORD Z2, Z2, Z2
+	VPXORD Z3, Z3, Z3
+	VPXORD Z4, Z4, Z4
+	VPXORD Z5, Z5, Z5
+	VPXORD Z6, Z6, Z6
+	VPXORD Z7, Z7, Z7
+	VPXORD Z8, Z8, Z8
+	VPXORD Z9, Z9, Z9
+	VPXORD Z10, Z10, Z10
+	VPXORD Z11, Z11, Z11
+
+float32SumBlockLoop:
+	CMPQ   DX, $0x000000c0
+	JL     float32SumTailLoop
+	VADDPS (AX), Z0, Z0
+	VADDPS 64(AX), Z1, Z1
+	VADDPS 128(AX), Z2, Z2
+	VADDPS 192(AX), Z3, Z3
+	VADDPS 256(AX), Z4, Z4
+	VADDPS 320(AX), Z5, Z5
+	VADDPS 384(AX), Z6, Z6
+	VADDPS 448(AX), Z7, Z7
+	VADDPS 512(AX), Z8, Z8
+	VADDPS 576(AX), Z9, Z9
+	VADDPS 640(AX), Z10, Z10
+	VADDPS 704(AX), Z11, Z11
+	ADDQ   $0x00000300, AX
+	SUBQ   $0x000000c0, DX
+	JMP    float32SumBlockLoop
+
+float32SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     float32SumDone
+	VADDPS (AX), Z0, Z0
+	ADDQ   $0x00000040, AX
+	SUBQ   $0x00000010, DX
+	JMP    float32SumTailLoop
+
+float32SumDone:
+	VADDPS        Z0, Z1, Z0
+	VADDPS        Z0, Z2, Z0
+	VADDPS        Z0, Z3, Z0
+	VADDPS        Z0, Z4, Z0
+	VADDPS        Z0, Z5, Z0
+	VADDPS        Z0, Z6, Z0
+	VADDPS        Z0, Z7, Z0
+	VADDPS        Z0, Z8, Z0
+	VADDPS        Z0, Z9, Z0
+	VADDPS        Z0, Z10, Z0
+	VADDPS        Z0, Z11, Z0
+	VEXTRACTF32X8 $0x01, Z0, Y1
+	VADDPS        Y1, Y0, Y0
+	VEXTRACTF128  $0x01, Y0, X1
+	ADDPS         X1, X0
+	MOVOU         X0, (CX)
+	RET
+
+// func float64SumAvx512Asm(x []float64, r []float64)
+// Requires: AVX, AVX2, AVX512F, SSE2
+TEXT ·float64SumAvx512Asm(SB), NOSPLIT, $0-48
+	MOVQ   x_base+0(FP), AX
+	MOVQ   r_base+24(FP), CX
+	MOVQ   x_len+8(FP), DX
+	VPXORD Z0, Z0, Z0
+	VPXORD Z1, Z1, Z1
+	VPXORD Z2, Z2, Z2
+	VPXORD Z3, Z3, Z3
+	VPXORD Z4, Z4, Z4
+	VPXORD Z5, Z5, Z5
+	VPXORD Z6, Z6, Z6
+	VPXORD Z7, Z7, Z7
+	VPXORD Z8, Z8, Z8
+	VPXORD Z9, Z9, Z9
+	VPXORD Z10, Z10, Z10
+	VPXORD Z11, Z11, Z11
+
+float64SumBlockLoop:
+	CMPQ   DX, $0x00000060
+	JL     float64SumTailLoop
+	VADDPD (AX), Z0, Z0
+	VADDPD 64(AX), Z1, Z1
+	VADDPD 128(AX), Z2, Z2
+	VADDPD 192(AX), Z3, Z3
+	VADDPD 256(AX), Z4, Z4
+	VADDPD 320(AX), Z5, Z5
+	VADDPD 384(AX), Z6, Z6
+	VADDPD 448(AX), Z7, Z7
+	VADDPD 512(AX), Z8, Z8
+	VADDPD 576(AX), Z9, Z9
+	VADDPD 640(AX), Z10, Z10
+	VADDPD 704(AX), Z11, Z11
+	ADDQ   $0x00000300, AX
+	SUBQ   $0x00000060, DX
+	JMP    float64SumBlockLoop
+
+float64SumTailLoop:
+	CMPQ   DX, $0x00000004
+	JL     float64SumDone
+	VADDPD (AX), Z0, Z0
+	ADDQ   $0x00000040, AX
+	SUBQ   $0x00000008, DX
+	JMP    float64SumTailLoop
+
+float64SumDone:
+	VADDPD        Z0, Z1, Z0
+	VADDPD        Z0, Z2, Z0
+	VADDPD        Z0, Z3, Z0
+	VADDPD        Z0, Z4, Z0
+	VADDPD        Z0, Z5, Z0
+	VADDPD        Z0, Z6, Z0
+	VADDPD        Z0, Z7, Z0
+	VADDPD        Z0, Z8, Z0
+	VADDPD        Z0, Z9, Z0
+	VADDPD        Z0, Z10, Z0
+	VADDPD        Z0, Z11, Z0
+	VEXTRACTF64X4 $0x01, Z0, Y1
+	VADDPD        Y1, Y0, Y0
+	VEXTRACTF128  $0x01, Y0, X1
+	ADDPD         X1, X0
+	MOVOU         X0, (CX)
+	RET
diff --git a/pkg/vectorize/sum/avx512_stubs.go b/pkg/vectorize/sum/avx512_stubs.go
new file mode 100644
index 0000000000000000000000000000000000000000..ceec9b7fbcd3f90159280f4014f41bf9cfbb62c3
--- /dev/null
+++ b/pkg/vectorize/sum/avx512_stubs.go
@@ -0,0 +1,15 @@
+// Code generated by command: go run avx512.go -out sum/avx512.s -stubs sum/avx512_stubs.go. DO NOT EDIT.
+
+package sum
+
+func int8SumAvx512Asm(x []int8, r []int8)
+
+func int16SumAvx512Asm(x []int16, r []int16)
+
+func int32SumAvx512Asm(x []int32, r []int32)
+
+func int64SumAvx512Asm(x []int64, r []int64)
+
+func float32SumAvx512Asm(x []float32, r []float32)
+
+func float64SumAvx512Asm(x []float64, r []float64)
diff --git a/pkg/vectorize/sum/sum.go b/pkg/vectorize/sum/sum.go
index 72cb1838c0fa153a4c259ebb0776ff3dde083cd7..9537e2d65dc5642da9555ca00c6512f604ec06b4 100644
--- a/pkg/vectorize/sum/sum.go
+++ b/pkg/vectorize/sum/sum.go
@@ -1,95 +1,508 @@
 package sum
 
-import "matrixbase/pkg/internal/cpu"
+import (
+	"golang.org/x/sys/cpu"
+)
 
 var (
-	i64Sum     func([]int64) int64
-	f64Sum     func([]float64) float64
-	i64SumSels func([]int64, []int64) int64
-	f64SumSels func([]float64, []int64) float64
+	int8Sum        func([]int8) int8
+	int8SumSels    func([]int8, []int64) int8
+	int16Sum       func([]int16) int16
+	int16SumSels   func([]int16, []int64) int16
+	int32Sum       func([]int32) int32
+	int32SumSels   func([]int32, []int64) int32
+	int64Sum       func([]int64) int64
+	int64SumSels   func([]int64, []int64) int64
+	uint8Sum       func([]uint8) uint8
+	uint8SumSels   func([]uint8, []int64) uint8
+	uint16Sum      func([]uint16) uint16
+	uint16SumSels  func([]uint16, []int64) uint16
+	uint32Sum      func([]uint32) uint32
+	uint32SumSels  func([]uint32, []int64) uint32
+	uint64Sum      func([]uint64) uint64
+	uint64SumSels  func([]uint64, []int64) uint64
+	float32Sum     func([]float32) float32
+	float32SumSels func([]float32, []int64) float32
+	float64Sum     func([]float64) float64
+	float64SumSels func([]float64, []int64) float64
 )
 
 func init() {
-	if cpu.X86.HasAVX2 {
-		i64Sum = i64SumAvx
-		f64Sum = f64SumAvx
+	if cpu.X86.HasAVX512 {
+		int8Sum = int8SumAvx512
+		int16Sum = int16SumAvx512
+		int32Sum = int32SumAvx512
+		int64Sum = int64SumAvx512
+		float32Sum = float32SumAvx512
+		float64Sum = float64SumAvx512
+	} else if cpu.X86.HasAVX2 {
+		int8Sum = int8SumAvx2
+		int16Sum = int16SumAvx2
+		int32Sum = int32SumAvx2
+		int64Sum = int64SumAvx2
+		float32Sum = float32SumAvx2
+		float64Sum = float64SumAvx2
 	} else {
-		i64Sum = i64SumPure
-		f64Sum = f64SumPure
+		int8Sum = int8SumPure
+		int16Sum = int16SumPure
+		int32Sum = int32SumPure
+		int64Sum = int64SumPure
+		float32Sum = float32SumPure
+		float64Sum = float64SumPure
+	}
+	uint8Sum = uint8SumPure
+	uint16Sum = uint16SumPure
+	uint32Sum = uint32SumPure
+	uint64Sum = uint64SumPure
+
+	int8SumSels = int8SumSelsPure
+	int16SumSels = int16SumSelsPure
+	int32SumSels = int32SumSelsPure
+	int64SumSels = int64SumSelsPure
+	uint8SumSels = uint8SumSelsPure
+	uint16SumSels = uint16SumSelsPure
+	uint32SumSels = uint32SumSelsPure
+	uint64SumSels = uint64SumSelsPure
+	float32SumSels = float32SumSelsPure
+	float64SumSels = float64SumSelsPure
+}
+
+func Int8Sum(xs []int8) int8 {
+	return int8Sum(xs)
+}
+
+func int8SumPure(xs []int8) int8 {
+	var res int8
+
+	for _, x := range xs {
+		res += x
+	}
+	return res
+}
+
+func int8SumAvx2(xs []int8) int8 {
+	const regItems int = 32 / 1
+	n := len(xs) / regItems
+	var rs [16]int8
+	int8SumAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 16; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
+}
+
+func int8SumAvx512(xs []int8) int8 {
+	const regItems int = 64 / 1
+	n := len(xs) / regItems
+	var rs [16]int8
+	int8SumAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 16; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
+}
+
+func Int8SumSels(xs []int8, sels []int64) int8 {
+	return int8SumSels(xs, sels)
+}
+
+func int8SumSelsPure(xs []int8, sels []int64) int8 {
+	var res int8
+
+	for _, sel := range sels {
+		res += xs[sel]
+	}
+	return res
+}
+
+func Int16Sum(xs []int16) int16 {
+	return int16Sum(xs)
+}
+
+func int16SumPure(xs []int16) int16 {
+	var res int16
+
+	for _, x := range xs {
+		res += x
+	}
+	return res
+}
+
+func int16SumAvx2(xs []int16) int16 {
+	const regItems int = 32 / 2
+	n := len(xs) / regItems
+	var rs [8]int16
+	int16SumAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 8; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
+}
+
+func int16SumAvx512(xs []int16) int16 {
+	const regItems int = 64 / 2
+	n := len(xs) / regItems
+	var rs [8]int16
+	int16SumAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 8; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
 	}
-	i64SumSels = i64SumSelsPure
-	f64SumSels = f64SumSelsPure
+	return res
 }
 
-func iSumAvx([]int64) int64
-func fSumAvx([]float64) float64
+func Int16SumSels(xs []int16, sels []int64) int16 {
+	return int16SumSels(xs, sels)
+}
+
+func int16SumSelsPure(xs []int16, sels []int64) int16 {
+	var res int16
+
+	for _, sel := range sels {
+		res += xs[sel]
+	}
+	return res
+}
+
+func Int32Sum(xs []int32) int32 {
+	return int32Sum(xs)
+}
 
-func I64Sum(xs []int64) int64 {
-	return i64Sum(xs)
+func int32SumPure(xs []int32) int32 {
+	var res int32
+
+	for _, x := range xs {
+		res += x
+	}
+	return res
 }
 
-func F64Sum(xs []float64) float64 {
-	return f64Sum(xs)
+func int32SumAvx2(xs []int32) int32 {
+	const regItems int = 32 / 4
+	n := len(xs) / regItems
+	var rs [4]int32
+	int32SumAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
 }
 
-func I64SumSels(xs []int64, sels []int64) int64 {
-	return i64SumSels(xs, sels)
+func int32SumAvx512(xs []int32) int32 {
+	const regItems int = 64 / 4
+	n := len(xs) / regItems
+	var rs [4]int32
+	int32SumAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
 }
 
-func F64SumSels(xs []float64, sels []int64) float64 {
-	return f64SumSels(xs, sels)
+func Int32SumSels(xs []int32, sels []int64) int32 {
+	return int32SumSels(xs, sels)
 }
 
-func i64SumAvx(xs []int64) int64 {
-	n := len(xs) / 8
-	r := iSumAvx(xs[:n*8])
-	for i, j := n*8, len(xs); i < j; i++ {
-		r += xs[i]
+func int32SumSelsPure(xs []int32, sels []int64) int32 {
+	var res int32
+
+	for _, sel := range sels {
+		res += xs[sel]
 	}
-	return r
+	return res
 }
 
-func i64SumPure(xs []int64) int64 {
-	var sum int64
+func Int64Sum(xs []int64) int64 {
+	return int64Sum(xs)
+}
+
+func int64SumPure(xs []int64) int64 {
+	var res int64
 
 	for _, x := range xs {
-		sum += x
+		res += x
 	}
-	return sum
+	return res
 }
 
-func f64SumAvx(xs []float64) float64 {
-	n := len(xs) / 8
-	r := fSumAvx(xs[:n*8])
-	for i, j := n*8, len(xs); i < j; i++ {
-		r += xs[i]
+func int64SumAvx2(xs []int64) int64 {
+	const regItems int = 32 / 8
+	n := len(xs) / regItems
+	var rs [2]int64
+	int64SumAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 2; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
 	}
-	return r
+	return res
 }
 
-func f64SumPure(xs []float64) float64 {
-	var sum float64
+func int64SumAvx512(xs []int64) int64 {
+	const regItems int = 64 / 8
+	n := len(xs) / regItems
+	var rs [2]int64
+	int64SumAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 2; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
+}
+
+func Int64SumSels(xs []int64, sels []int64) int64 {
+	return int64SumSels(xs, sels)
+}
+
+func int64SumSelsPure(xs []int64, sels []int64) int64 {
+	var res int64
+
+	for _, sel := range sels {
+		res += xs[sel]
+	}
+	return res
+}
+
+func Uint8Sum(xs []uint8) uint8 {
+	return uint8Sum(xs)
+}
+
+func uint8SumPure(xs []uint8) uint8 {
+	var res uint8
 
 	for _, x := range xs {
-		sum += x
+		res += x
 	}
-	return sum
+	return res
+}
+
+func Uint8SumSels(xs []uint8, sels []int64) uint8 {
+	return uint8SumSels(xs, sels)
 }
 
-func i64SumSelsPure(xs []int64, sels []int64) int64 {
-	var sum int64
+func uint8SumSelsPure(xs []uint8, sels []int64) uint8 {
+	var res uint8
 
 	for _, sel := range sels {
-		sum += xs[sel]
+		res += xs[sel]
 	}
-	return sum
+	return res
+}
+
+func Uint16Sum(xs []uint16) uint16 {
+	return uint16Sum(xs)
+}
+
+func uint16SumPure(xs []uint16) uint16 {
+	var res uint16
+
+	for _, x := range xs {
+		res += x
+	}
+	return res
+}
+
+func Uint16SumSels(xs []uint16, sels []int64) uint16 {
+	return uint16SumSels(xs, sels)
+}
+
+func uint16SumSelsPure(xs []uint16, sels []int64) uint16 {
+	var res uint16
+
+	for _, sel := range sels {
+		res += xs[sel]
+	}
+	return res
+}
+
+func Uint32Sum(xs []uint32) uint32 {
+	return uint32Sum(xs)
+}
+
+func uint32SumPure(xs []uint32) uint32 {
+	var res uint32
+
+	for _, x := range xs {
+		res += x
+	}
+	return res
+}
+
+func Uint32SumSels(xs []uint32, sels []int64) uint32 {
+	return uint32SumSels(xs, sels)
+}
+
+func uint32SumSelsPure(xs []uint32, sels []int64) uint32 {
+	var res uint32
+
+	for _, sel := range sels {
+		res += xs[sel]
+	}
+	return res
+}
+
+func Uint64Sum(xs []uint64) uint64 {
+	return uint64Sum(xs)
+}
+
+func uint64SumPure(xs []uint64) uint64 {
+	var res uint64
+
+	for _, x := range xs {
+		res += x
+	}
+	return res
+}
+
+func Uint64SumSels(xs []uint64, sels []int64) uint64 {
+	return uint64SumSels(xs, sels)
+}
+
+func uint64SumSelsPure(xs []uint64, sels []int64) uint64 {
+	var res uint64
+
+	for _, sel := range sels {
+		res += xs[sel]
+	}
+	return res
+}
+
+func Float32Sum(xs []float32) float32 {
+	return float32Sum(xs)
+}
+
+func float32SumPure(xs []float32) float32 {
+	var res float32
+
+	for _, x := range xs {
+		res += x
+	}
+	return res
+}
+
+func float32SumAvx2(xs []float32) float32 {
+	const regItems int = 32 / 4
+	n := len(xs) / regItems
+	var rs [4]float32
+	float32SumAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
+}
+
+func float32SumAvx512(xs []float32) float32 {
+	const regItems int = 64 / 4
+	n := len(xs) / regItems
+	var rs [4]float32
+	float32SumAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 4; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
+}
+
+func Float32SumSels(xs []float32, sels []int64) float32 {
+	return float32SumSels(xs, sels)
+}
+
+func float32SumSelsPure(xs []float32, sels []int64) float32 {
+	var res float32
+
+	for _, sel := range sels {
+		res += xs[sel]
+	}
+	return res
+}
+
+func Float64Sum(xs []float64) float64 {
+	return float64Sum(xs)
+}
+
+func float64SumPure(xs []float64) float64 {
+	var res float64
+
+	for _, x := range xs {
+		res += x
+	}
+	return res
+}
+
+func float64SumAvx2(xs []float64) float64 {
+	const regItems int = 32 / 8
+	n := len(xs) / regItems
+	var rs [2]float64
+	float64SumAvx2Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 2; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
+}
+
+func float64SumAvx512(xs []float64) float64 {
+	const regItems int = 64 / 8
+	n := len(xs) / regItems
+	var rs [2]float64
+	float64SumAvx512Asm(xs[:n*regItems], rs[:])
+	res := rs[0]
+	for i := 1; i < 2; i++ {
+		res += rs[i]
+	}
+	for i, j := n*regItems, len(xs); i < j; i++ {
+		res += xs[i]
+	}
+	return res
+}
+
+func Float64SumSels(xs []float64, sels []int64) float64 {
+	return float64SumSels(xs, sels)
 }
 
-func f64SumSelsPure(xs []float64, sels []int64) float64 {
-	var sum float64
+func float64SumSelsPure(xs []float64, sels []int64) float64 {
+	var res float64
 
 	for _, sel := range sels {
-		sum += xs[sel]
+		res += xs[sel]
 	}
-	return sum
+	return res
 }
diff --git a/pkg/vectorize/sum/sum_test.go b/pkg/vectorize/sum/sum_test.go
index 3c14907c100e173b50dacc32a2778faa045dc096..297239b9dccf7c4ac14d027bebcd1b982fa30a31 100644
--- a/pkg/vectorize/sum/sum_test.go
+++ b/pkg/vectorize/sum/sum_test.go
@@ -23,12 +23,12 @@ func makeFbuffer(l int) []float64 {
 
 func TestF64Sum(t *testing.T) {
 	xs := makeFbuffer(10000)
-	fmt.Printf("sum: %v\n", F64Sum(xs))
-	fmt.Printf("pure sum: %v\n", f64SumPure(xs))
+	fmt.Printf("sum: %v\n", Float64Sum(xs))
+	fmt.Printf("pure sum: %v\n", float64SumPure(xs))
 }
 
 func TestI64Sum(t *testing.T) {
 	xs := makeIbuffer(10000)
-	fmt.Printf("sum: %v\n", I64Sum(xs))
-	fmt.Printf("pure sum: %v\n", i64SumPure(xs))
+	fmt.Printf("sum: %v\n", Int64Sum(xs))
+	fmt.Printf("pure sum: %v\n", int64SumPure(xs))
 }