Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 364503a

Browse files
committed
crypto/sha512: implement sha512 with ARMv8.2 features
ARMv8.2 supports SIMD intrinsics to accelerate SHA512 operations. Here SHA512 is implemented in assembly with NEON instructions. The following benchmark was run on an Apple M1 chip. Compared to the pure Go implementation (the implementation in sha512block.go), the time difference is listed below. benchmark old ns/op new ns/op delta BenchmarkHash8Bytes-8 506.6 ns/op 177.4 ns/op -64.98% BenchmarkHash1K-8 4034 ns/op 1192 ns/op -70.45% BenchmarkHash8K-8 28418 ns/op 8635 ns/op -69.61% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes-8 15.79 MB/s 45.11 MB/s 2.86x BenchmarkHash1K-8 253.81 MB/s 859.03 MB/s 3.38x BenchmarkHash8K-8 288.27 MB/s 948.68 MB/s 3.29x
1 parent 901510e commit 364503a

File tree

3 files changed

+246
-2
lines changed

3 files changed

+246
-2
lines changed

‎src/crypto/sha512/sha512block_arm64.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build arm64
6+
// +build arm64
7+
8+
package sha512
9+
10+
// blockNEON is the assembly routine (TEXT ·blockNEON in sha512block_arm64.s)
// that folds the bytes of p into the hash state held in dig.
//
//go:noescape
11+
func blockNEON(dig *digest, p []byte)
12+
13+
// block hashes p into dig by delegating unconditionally to blockNEON.
func block(dig *digest, p []byte) {
14+
// FIXME: guard this call with cpu.ARM64.HasSHA512 so that devices without the SHA512 instructions never execute them.
15+
blockNEON(dig, p)
16+
}

‎src/crypto/sha512/sha512block_arm64.s

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "textflag.h"
6+
7+
// Load 16 bytes of "p" through R1 (post-incremented by 16) into s and byte-reverse
// each 64-bit lane, converting from little endian to big endian (ARM64 is little endian).
8+
#define LOAD_AND_CONVERT_ENDIAN(s) \
9+
VLD1.P 16(R1), [s.B16]; \
10+
VREV64 s.B16, s.B16;
11+
12+
// SIGMA_ROUND advances the working state using the two 64-bit schedule words in s.
// It loads the next two round constants through R3 (post-incremented by 16), adds
// them to s, swaps the two halves of the sum, adds q3, and feeds the result through
// SHA512H and SHA512H2 together with VEXT-combined halves of q1, q2 and q3.
// On exit q3 holds the SHA512H2 result and q1 has the SHA512H result added to it.
// q0..q3 are the four state vectors; the call sites rotate them by one position per
// invocation and number the invocations 0, 2, ..., 78 (two rounds each).
// V27-V30 are scratch. "VSHL $0" is used as a vector register copy.
#define SIGMA_ROUND(s, q0, q1, q2, q3) \
13+
VLD1.P 16(R3), [V27.D2]; \
14+
VADD s.D2, V27.D2, V28.D2; \
15+
VEXT $8, V28.B16, V28.B16, V28.B16; \
16+
VADD V28.D2, q3.D2, V28.D2; \
17+
VEXT $8, q3.B16, q2.B16, V29.B16; \
18+
VEXT $8, q2.B16, q1.B16, V30.B16; \
19+
SHA512H V30.D2, V29, V28; \
20+
VSHL $0, V28.D2, V29.D2; \
21+
SHA512H2 q0.D2, q1, V28; \
22+
VSHL $0, V28.D2, q3.D2; \
23+
VADD q1.D2, V29.D2, q1.D2;
24+
25+
// GAMMA_ROUND updates the message schedule in place: the two schedule words in s0
// are recomputed with SHA512SU0 (using s1) and SHA512SU1 (using s4 and the
// VEXT-combined halves of s2 and s3). V30 is scratch.
#define GAMMA_ROUND(s0, s1, s2, s3, s4) \
26+
VEXT $8, s3.B16, s2.B16, V30.B16; \
27+
SHA512SU0 s1.D2, s0.D2; \
28+
SHA512SU1 V30.D2, s4.D2, s0.D2;
29+
30+
// func blockNEON(dig *digest, p []byte)
//
// blockNEON folds p into the hash state at dig, consuming 128 bytes per loop
// iteration (16 bytes per LOAD_AND_CONVERT_ENDIAN, eight loads per iteration).
//
// Register use:
//   R0      dig (state pointer)
//   R1      input cursor
//   R2      bytes of input remaining
//   R3      round-constant cursor into round_consts<>
//   V0-V3   working state (ab, cd, ef, gh)
//   V4-V11  message schedule
//   V23-V26 running digest
//   V27-V30 scratch used by the macros
//
// The loop exits only when R2 reaches exactly zero, so len(p) is presumably
// required to be a non-zero multiple of 128 — confirm against the callers.
31+
TEXT ·blockNEON(SB), NOSPLIT, $0-32
32+
MOVD dig+0(FP), R0
33+
MOVD p_base+8(FP), R1
34+
MOVD p_len+16(FP), R2
35+
MOVD $round_consts<>(SB), R3
36+
37+
VLD1.P 32(R0), [V0.D2, V1.D2] // ab, cd
38+
VLD1 (R0), [V2.D2, V3.D2] // ef, gh
39+
40+
VSHL $0, V0.D2, V23.D2
41+
VSHL $0, V1.D2, V24.D2
42+
VSHL $0, V2.D2, V25.D2
43+
VSHL $0, V3.D2, V26.D2
44+
45+
loop:
46+
LOAD_AND_CONVERT_ENDIAN(V4)
47+
SIGMA_ROUND(V4, V0, V1, V2, V3) // 0
48+
LOAD_AND_CONVERT_ENDIAN(V5)
49+
SIGMA_ROUND(V5, V3, V0, V1, V2) // 2
50+
LOAD_AND_CONVERT_ENDIAN(V6)
51+
SIGMA_ROUND(V6, V2, V3, V0, V1) // 4
52+
LOAD_AND_CONVERT_ENDIAN(V7)
53+
SIGMA_ROUND(V7, V1, V2, V3, V0) // 6
54+
LOAD_AND_CONVERT_ENDIAN(V8)
55+
SIGMA_ROUND(V8, V0, V1, V2, V3) // 8
56+
LOAD_AND_CONVERT_ENDIAN(V9)
57+
SIGMA_ROUND(V9, V3, V0, V1, V2) // 10
58+
LOAD_AND_CONVERT_ENDIAN(V10)
59+
SIGMA_ROUND(V10, V2, V3, V0, V1) // 12
60+
LOAD_AND_CONVERT_ENDIAN(V11)
61+
SIGMA_ROUND(V11, V1, V2, V3, V0) // 14
62+
63+
GAMMA_ROUND(V4, V5, V8, V9, V11)
64+
SIGMA_ROUND(V4, V0, V1, V2, V3) // 16
65+
GAMMA_ROUND(V5, V6, V9, V10, V4)
66+
SIGMA_ROUND(V5, V3, V0, V1, V2) // 18
67+
GAMMA_ROUND(V6, V7, V10, V11, V5)
68+
SIGMA_ROUND(V6, V2, V3, V0, V1) // 20
69+
GAMMA_ROUND(V7, V8, V11, V4, V6)
70+
SIGMA_ROUND(V7, V1, V2, V3, V0) // 22
71+
GAMMA_ROUND(V8, V9, V4, V5, V7)
72+
SIGMA_ROUND(V8, V0, V1, V2, V3) // 24
73+
GAMMA_ROUND(V9, V10, V5, V6, V8)
74+
SIGMA_ROUND(V9, V3, V0, V1, V2) // 26
75+
GAMMA_ROUND(V10, V11, V6, V7, V9)
76+
SIGMA_ROUND(V10, V2, V3, V0, V1) // 28
77+
GAMMA_ROUND(V11, V4, V7, V8, V10)
78+
SIGMA_ROUND(V11, V1, V2, V3, V0) // 30
79+
80+
GAMMA_ROUND(V4, V5, V8, V9, V11)
81+
SIGMA_ROUND(V4, V0, V1, V2, V3) // 32
82+
GAMMA_ROUND(V5, V6, V9, V10, V4)
83+
SIGMA_ROUND(V5, V3, V0, V1, V2) // 34
84+
GAMMA_ROUND(V6, V7, V10, V11, V5)
85+
SIGMA_ROUND(V6, V2, V3, V0, V1) // 36
86+
GAMMA_ROUND(V7, V8, V11, V4, V6)
87+
SIGMA_ROUND(V7, V1, V2, V3, V0) // 38
88+
GAMMA_ROUND(V8, V9, V4, V5, V7)
89+
SIGMA_ROUND(V8, V0, V1, V2, V3) // 40
90+
GAMMA_ROUND(V9, V10, V5, V6, V8)
91+
SIGMA_ROUND(V9, V3, V0, V1, V2) // 42
92+
GAMMA_ROUND(V10, V11, V6, V7, V9)
93+
SIGMA_ROUND(V10, V2, V3, V0, V1) // 44
94+
GAMMA_ROUND(V11, V4, V7, V8, V10)
95+
SIGMA_ROUND(V11, V1, V2, V3, V0) // 46
96+
97+
GAMMA_ROUND(V4, V5, V8, V9, V11)
98+
SIGMA_ROUND(V4, V0, V1, V2, V3) // 48
99+
GAMMA_ROUND(V5, V6, V9, V10, V4)
100+
SIGMA_ROUND(V5, V3, V0, V1, V2) // 50
101+
GAMMA_ROUND(V6, V7, V10, V11, V5)
102+
SIGMA_ROUND(V6, V2, V3, V0, V1) // 52
103+
GAMMA_ROUND(V7, V8, V11, V4, V6)
104+
SIGMA_ROUND(V7, V1, V2, V3, V0) // 54
105+
GAMMA_ROUND(V8, V9, V4, V5, V7)
106+
SIGMA_ROUND(V8, V0, V1, V2, V3) // 56
107+
GAMMA_ROUND(V9, V10, V5, V6, V8)
108+
SIGMA_ROUND(V9, V3, V0, V1, V2) // 58
109+
GAMMA_ROUND(V10, V11, V6, V7, V9)
110+
SIGMA_ROUND(V10, V2, V3, V0, V1) // 60
111+
GAMMA_ROUND(V11, V4, V7, V8, V10)
112+
SIGMA_ROUND(V11, V1, V2, V3, V0) // 62
113+
114+
GAMMA_ROUND(V4, V5, V8, V9, V11)
115+
SIGMA_ROUND(V4, V0, V1, V2, V3) // 64
116+
GAMMA_ROUND(V5, V6, V9, V10, V4)
117+
SIGMA_ROUND(V5, V3, V0, V1, V2) // 66
118+
GAMMA_ROUND(V6, V7, V10, V11, V5)
119+
SIGMA_ROUND(V6, V2, V3, V0, V1) // 68
120+
GAMMA_ROUND(V7, V8, V11, V4, V6)
121+
SIGMA_ROUND(V7, V1, V2, V3, V0) // 70
122+
GAMMA_ROUND(V8, V9, V4, V5, V7)
123+
SIGMA_ROUND(V8, V0, V1, V2, V3) // 72
124+
GAMMA_ROUND(V9, V10, V5, V6, V8)
125+
SIGMA_ROUND(V9, V3, V0, V1, V2) // 74
126+
GAMMA_ROUND(V10, V11, V6, V7, V9)
127+
SIGMA_ROUND(V10, V2, V3, V0, V1) // 76
128+
GAMMA_ROUND(V11, V4, V7, V8, V10)
129+
SIGMA_ROUND(V11, V1, V2, V3, V0) // 78
130+
131+
VADD V0.D2, V23.D2, V23.D2
132+
VADD V1.D2, V24.D2, V24.D2
133+
VADD V2.D2, V25.D2, V25.D2
134+
VADD V3.D2, V26.D2, V26.D2
// Re-seed the working state from the updated digest: every 128-byte block
// must start its rounds from the current hash value, not from the previous
// block's post-round working variables.
VSHL $0, V23.D2, V0.D2
VSHL $0, V24.D2, V1.D2
VSHL $0, V25.D2, V2.D2
VSHL $0, V26.D2, V3.D2
135+
136+
// rewind the round-constant pointer to the start of the table (40 loads of 16 bytes = 640)
137+
SUBS $640, R3, R3
138+
139+
// check whether all of the input has been consumed
140+
SUBS $128, R2, R2
141+
CBNZ R2, loop
142+
143+
SUBS $32, R0, R0
144+
VST1.P [V23.D2, V24.D2], 32(R0)
145+
VST1 [V25.D2, V26.D2], (R0)
146+
RET
147+
148+
DATA round_consts<>+0x000(SB)/8, $0x428a2f98d728ae22
149+
DATA round_consts<>+0x008(SB)/8, $0x7137449123ef65cd
150+
DATA round_consts<>+0x010(SB)/8, $0xb5c0fbcfec4d3b2f
151+
DATA round_consts<>+0x018(SB)/8, $0xe9b5dba58189dbbc
152+
DATA round_consts<>+0x020(SB)/8, $0x3956c25bf348b538
153+
DATA round_consts<>+0x028(SB)/8, $0x59f111f1b605d019
154+
DATA round_consts<>+0x030(SB)/8, $0x923f82a4af194f9b
155+
DATA round_consts<>+0x038(SB)/8, $0xab1c5ed5da6d8118
156+
DATA round_consts<>+0x040(SB)/8, $0xd807aa98a3030242
157+
DATA round_consts<>+0x048(SB)/8, $0x12835b0145706fbe
158+
DATA round_consts<>+0x050(SB)/8, $0x243185be4ee4b28c
159+
DATA round_consts<>+0x058(SB)/8, $0x550c7dc3d5ffb4e2
160+
DATA round_consts<>+0x060(SB)/8, $0x72be5d74f27b896f
161+
DATA round_consts<>+0x068(SB)/8, $0x80deb1fe3b1696b1
162+
DATA round_consts<>+0x070(SB)/8, $0x9bdc06a725c71235
163+
DATA round_consts<>+0x078(SB)/8, $0xc19bf174cf692694
164+
DATA round_consts<>+0x080(SB)/8, $0xe49b69c19ef14ad2
165+
DATA round_consts<>+0x088(SB)/8, $0xefbe4786384f25e3
166+
DATA round_consts<>+0x090(SB)/8, $0x0fc19dc68b8cd5b5
167+
DATA round_consts<>+0x098(SB)/8, $0x240ca1cc77ac9c65
168+
DATA round_consts<>+0x0A0(SB)/8, $0x2de92c6f592b0275
169+
DATA round_consts<>+0x0A8(SB)/8, $0x4a7484aa6ea6e483
170+
DATA round_consts<>+0x0B0(SB)/8, $0x5cb0a9dcbd41fbd4
171+
DATA round_consts<>+0x0B8(SB)/8, $0x76f988da831153b5
172+
DATA round_consts<>+0x0C0(SB)/8, $0x983e5152ee66dfab
173+
DATA round_consts<>+0x0C8(SB)/8, $0xa831c66d2db43210
174+
DATA round_consts<>+0x0D0(SB)/8, $0xb00327c898fb213f
175+
DATA round_consts<>+0x0D8(SB)/8, $0xbf597fc7beef0ee4
176+
DATA round_consts<>+0x0E0(SB)/8, $0xc6e00bf33da88fc2
177+
DATA round_consts<>+0x0E8(SB)/8, $0xd5a79147930aa725
178+
DATA round_consts<>+0x0F0(SB)/8, $0x06ca6351e003826f
179+
DATA round_consts<>+0x0F8(SB)/8, $0x142929670a0e6e70
180+
DATA round_consts<>+0x100(SB)/8, $0x27b70a8546d22ffc
181+
DATA round_consts<>+0x108(SB)/8, $0x2e1b21385c26c926
182+
DATA round_consts<>+0x110(SB)/8, $0x4d2c6dfc5ac42aed
183+
DATA round_consts<>+0x118(SB)/8, $0x53380d139d95b3df
184+
DATA round_consts<>+0x120(SB)/8, $0x650a73548baf63de
185+
DATA round_consts<>+0x128(SB)/8, $0x766a0abb3c77b2a8
186+
DATA round_consts<>+0x130(SB)/8, $0x81c2c92e47edaee6
187+
DATA round_consts<>+0x138(SB)/8, $0x92722c851482353b
188+
DATA round_consts<>+0x140(SB)/8, $0xa2bfe8a14cf10364
189+
DATA round_consts<>+0x148(SB)/8, $0xa81a664bbc423001
190+
DATA round_consts<>+0x150(SB)/8, $0xc24b8b70d0f89791
191+
DATA round_consts<>+0x158(SB)/8, $0xc76c51a30654be30
192+
DATA round_consts<>+0x160(SB)/8, $0xd192e819d6ef5218
193+
DATA round_consts<>+0x168(SB)/8, $0xd69906245565a910
194+
DATA round_consts<>+0x170(SB)/8, $0xf40e35855771202a
195+
DATA round_consts<>+0x178(SB)/8, $0x106aa07032bbd1b8
196+
DATA round_consts<>+0x180(SB)/8, $0x19a4c116b8d2d0c8
197+
DATA round_consts<>+0x188(SB)/8, $0x1e376c085141ab53
198+
DATA round_consts<>+0x190(SB)/8, $0x2748774cdf8eeb99
199+
DATA round_consts<>+0x198(SB)/8, $0x34b0bcb5e19b48a8
200+
DATA round_consts<>+0x1A0(SB)/8, $0x391c0cb3c5c95a63
201+
DATA round_consts<>+0x1A8(SB)/8, $0x4ed8aa4ae3418acb
202+
DATA round_consts<>+0x1B0(SB)/8, $0x5b9cca4f7763e373
203+
DATA round_consts<>+0x1B8(SB)/8, $0x682e6ff3d6b2b8a3
204+
DATA round_consts<>+0x1C0(SB)/8, $0x748f82ee5defb2fc
205+
DATA round_consts<>+0x1C8(SB)/8, $0x78a5636f43172f60
206+
DATA round_consts<>+0x1D0(SB)/8, $0x84c87814a1f0ab72
207+
DATA round_consts<>+0x1D8(SB)/8, $0x8cc702081a6439ec
208+
DATA round_consts<>+0x1E0(SB)/8, $0x90befffa23631e28
209+
DATA round_consts<>+0x1E8(SB)/8, $0xa4506cebde82bde9
210+
DATA round_consts<>+0x1F0(SB)/8, $0xbef9a3f7b2c67915
211+
DATA round_consts<>+0x1F8(SB)/8, $0xc67178f2e372532b
212+
DATA round_consts<>+0x200(SB)/8, $0xca273eceea26619c
213+
DATA round_consts<>+0x208(SB)/8, $0xd186b8c721c0c207
214+
DATA round_consts<>+0x210(SB)/8, $0xeada7dd6cde0eb1e
215+
DATA round_consts<>+0x218(SB)/8, $0xf57d4f7fee6ed178
216+
DATA round_consts<>+0x220(SB)/8, $0x06f067aa72176fba
217+
DATA round_consts<>+0x228(SB)/8, $0x0a637dc5a2c898a6
218+
DATA round_consts<>+0x230(SB)/8, $0x113f9804bef90dae
219+
DATA round_consts<>+0x238(SB)/8, $0x1b710b35131c471b
220+
DATA round_consts<>+0x240(SB)/8, $0x28db77f523047d84
221+
DATA round_consts<>+0x248(SB)/8, $0x32caab7b40c72493
222+
DATA round_consts<>+0x250(SB)/8, $0x3c9ebe0a15c9bebc
223+
DATA round_consts<>+0x258(SB)/8, $0x431d67c49c100d4c
224+
DATA round_consts<>+0x260(SB)/8, $0x4cc5d4becb3e42b6
225+
DATA round_consts<>+0x268(SB)/8, $0x597f299cfc657e2a
226+
DATA round_consts<>+0x270(SB)/8, $0x5fcb6fab3ad6faec
227+
DATA round_consts<>+0x278(SB)/8, $0x6c44198c4a475817
228+
GLOBL round_consts<>(SB), (NOPTR+RODATA), $640

‎src/crypto/sha512/sha512block_generic.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !amd64 && !s390x && !ppc64le
6-
// +build !amd64,!s390x,!ppc64le
5+
//go:build !amd64 && !s390x && !ppc64le && !arm64
6+
// +build !amd64,!s390x,!ppc64le,!arm64
77

88
package sha512
99

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /