|
| 1 | +// Copyright 2021 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +#include "textflag.h" |
| 6 | + |
// LOAD_AND_CONVERT_ENDIAN(s) loads the next 16 bytes of the input "p" from the
// address held in R1 into vector register s, post-incrementing R1 by 16, and
// then reverses the byte order inside each 64-bit lane. This converts the two
// message words from little endian to big endian (ARM64 is little endian).
//
// Uses R1 implicitly; s is the only register written besides R1.
#define LOAD_AND_CONVERT_ENDIAN(s) \
	VLD1.P	16(R1), [s.B16]; \
	VREV64	s.B16, s.B16;
| 11 | + |
// SIGMA_ROUND(s, q0, q1, q2, q3) performs one double round (two SHA-512
// rounds) of the compression function.
//
//   s            vector register holding the two message-schedule words
//   q0..q3       the four state registers, in the rotation for this round
//
// Implicit operands: R3 points at the next pair of round constants and is
// post-incremented by 16; V27-V30 are clobbered as scratch. On exit q1 and q3
// have been rewritten; q0 and q2 are only read.
#define SIGMA_ROUND(s, q0, q1, q2, q3) \
	VLD1.P	16(R3), [V27.D2];            \ // V27 = next two round constants, R3 += 16
	VADD	s.D2, V27.D2, V28.D2;        \ // V28 = message words + round constants
	VEXT	$8, V28.B16, V28.B16, V28.B16; \ // swap the two 64-bit lanes of V28
	VADD	V28.D2, q3.D2, V28.D2;       \ // V28 += q3
	VEXT	$8, q3.B16, q2.B16, V29.B16; \ // V29 = 8-byte-offset window over the q2/q3 pair
	VEXT	$8, q2.B16, q1.B16, V30.B16; \ // V30 = 8-byte-offset window over the q1/q2 pair
	SHA512H	V30.D2, V29, V28;            \ // hash update part 1, result in V28
	VSHL	$0, V28.D2, V29.D2;          \ // V29 = V28 (shift by zero used as a register copy)
	SHA512H2	q0.D2, q1, V28;          \ // hash update part 2, result in V28
	VSHL	$0, V28.D2, q3.D2;           \ // q3 = V28
	VADD	q1.D2, V29.D2, q1.D2;          // q1 += saved part-1 result
| 24 | + |
// GAMMA_ROUND(s0, s1, s2, s3, s4) advances the message schedule: it rewrites
// s0 in place with the next two schedule words using the SHA512SU0/SHA512SU1
// instructions.
//
//   s0           schedule register to update (read and written)
//   s1           operand of SHA512SU0
//   s2, s3       pair from which an 8-byte-offset window is extracted
//   s4           operand of SHA512SU1
//
// V30 is clobbered as scratch.
#define GAMMA_ROUND(s0, s1, s2, s3, s4) \
	VEXT	$8, s3.B16, s2.B16, V30.B16; \ // V30 = 8-byte-offset window over the s2/s3 pair
	SHA512SU0	s1.D2, s0.D2;            \ // schedule update part 0, in place on s0
	SHA512SU1	V30.D2, s4.D2, s0.D2;      // schedule update part 1, in place on s0
| 29 | + |
// func blockNEON(dig *digest, p []byte)
//
// blockNEON folds every complete 128-byte block of p into the eight 64-bit
// hash words read from dig, using the ARMv8 SHA-512 instructions, and writes
// the updated words back to dig. Any trailing partial block (len(p) % 128
// bytes) is ignored, and nothing is read or written when p holds no complete
// block.
//
// NOTE(review): the code reads/writes 64 bytes at offset 0 of *dig, i.e. it
// assumes the hash words are the first field of digest - confirm against the
// Go declaration.
//
// Register use:
//   R0       dig
//   R1       read cursor into p (advanced by LOAD_AND_CONVERT_ENDIAN)
//   R2       number of 128-byte blocks still to process
//   R3       cursor into round_consts (advanced by SIGMA_ROUND)
//   V0-V3    working state: ab, cd, ef, gh
//   V4-V11   message schedule (16 words, two per register)
//   V23-V26  chaining value saved at the start of the current block
//   V27-V30  scratch used by the round macros
TEXT ·blockNEON(SB), NOSPLIT, $0-32
	MOVD	dig+0(FP), R0
	MOVD	p_base+8(FP), R1
	MOVD	p_len+16(FP), R2
	MOVD	$round_consts(SB), R3

	// Count whole blocks. Bail out early when there are none so the loop
	// below can never run on an empty or short input.
	LSR	$7, R2, R2
	CBZ	R2, done

	VLD1	(R0), [V0.D2, V1.D2, V2.D2, V3.D2]	// ab, cd, ef, gh

loop:
	// Remember the chaining value for this block. This must happen on every
	// iteration: the rounds below have to start from the hash produced by the
	// previous block, and the feed-forward at the end needs that same value.
	VSHL	$0, V0.D2, V23.D2
	VSHL	$0, V1.D2, V24.D2
	VSHL	$0, V2.D2, V25.D2
	VSHL	$0, V3.D2, V26.D2

	// Rounds 0-15: message words come straight from the input.
	LOAD_AND_CONVERT_ENDIAN(V4)
	SIGMA_ROUND(V4, V0, V1, V2, V3)		// 0
	LOAD_AND_CONVERT_ENDIAN(V5)
	SIGMA_ROUND(V5, V3, V0, V1, V2)		// 2
	LOAD_AND_CONVERT_ENDIAN(V6)
	SIGMA_ROUND(V6, V2, V3, V0, V1)		// 4
	LOAD_AND_CONVERT_ENDIAN(V7)
	SIGMA_ROUND(V7, V1, V2, V3, V0)		// 6
	LOAD_AND_CONVERT_ENDIAN(V8)
	SIGMA_ROUND(V8, V0, V1, V2, V3)		// 8
	LOAD_AND_CONVERT_ENDIAN(V9)
	SIGMA_ROUND(V9, V3, V0, V1, V2)		// 10
	LOAD_AND_CONVERT_ENDIAN(V10)
	SIGMA_ROUND(V10, V2, V3, V0, V1)	// 12
	LOAD_AND_CONVERT_ENDIAN(V11)
	SIGMA_ROUND(V11, V1, V2, V3, V0)	// 14

	// Rounds 16-79: each GAMMA_ROUND extends the message schedule in place
	// before the SIGMA_ROUND that consumes it.
	GAMMA_ROUND(V4, V5, V8, V9, V11)
	SIGMA_ROUND(V4, V0, V1, V2, V3)		// 16
	GAMMA_ROUND(V5, V6, V9, V10, V4)
	SIGMA_ROUND(V5, V3, V0, V1, V2)		// 18
	GAMMA_ROUND(V6, V7, V10, V11, V5)
	SIGMA_ROUND(V6, V2, V3, V0, V1)		// 20
	GAMMA_ROUND(V7, V8, V11, V4, V6)
	SIGMA_ROUND(V7, V1, V2, V3, V0)		// 22
	GAMMA_ROUND(V8, V9, V4, V5, V7)
	SIGMA_ROUND(V8, V0, V1, V2, V3)		// 24
	GAMMA_ROUND(V9, V10, V5, V6, V8)
	SIGMA_ROUND(V9, V3, V0, V1, V2)		// 26
	GAMMA_ROUND(V10, V11, V6, V7, V9)
	SIGMA_ROUND(V10, V2, V3, V0, V1)	// 28
	GAMMA_ROUND(V11, V4, V7, V8, V10)
	SIGMA_ROUND(V11, V1, V2, V3, V0)	// 30

	GAMMA_ROUND(V4, V5, V8, V9, V11)
	SIGMA_ROUND(V4, V0, V1, V2, V3)		// 32
	GAMMA_ROUND(V5, V6, V9, V10, V4)
	SIGMA_ROUND(V5, V3, V0, V1, V2)		// 34
	GAMMA_ROUND(V6, V7, V10, V11, V5)
	SIGMA_ROUND(V6, V2, V3, V0, V1)		// 36
	GAMMA_ROUND(V7, V8, V11, V4, V6)
	SIGMA_ROUND(V7, V1, V2, V3, V0)		// 38
	GAMMA_ROUND(V8, V9, V4, V5, V7)
	SIGMA_ROUND(V8, V0, V1, V2, V3)		// 40
	GAMMA_ROUND(V9, V10, V5, V6, V8)
	SIGMA_ROUND(V9, V3, V0, V1, V2)		// 42
	GAMMA_ROUND(V10, V11, V6, V7, V9)
	SIGMA_ROUND(V10, V2, V3, V0, V1)	// 44
	GAMMA_ROUND(V11, V4, V7, V8, V10)
	SIGMA_ROUND(V11, V1, V2, V3, V0)	// 46

	GAMMA_ROUND(V4, V5, V8, V9, V11)
	SIGMA_ROUND(V4, V0, V1, V2, V3)		// 48
	GAMMA_ROUND(V5, V6, V9, V10, V4)
	SIGMA_ROUND(V5, V3, V0, V1, V2)		// 50
	GAMMA_ROUND(V6, V7, V10, V11, V5)
	SIGMA_ROUND(V6, V2, V3, V0, V1)		// 52
	GAMMA_ROUND(V7, V8, V11, V4, V6)
	SIGMA_ROUND(V7, V1, V2, V3, V0)		// 54
	GAMMA_ROUND(V8, V9, V4, V5, V7)
	SIGMA_ROUND(V8, V0, V1, V2, V3)		// 56
	GAMMA_ROUND(V9, V10, V5, V6, V8)
	SIGMA_ROUND(V9, V3, V0, V1, V2)		// 58
	GAMMA_ROUND(V10, V11, V6, V7, V9)
	SIGMA_ROUND(V10, V2, V3, V0, V1)	// 60
	GAMMA_ROUND(V11, V4, V7, V8, V10)
	SIGMA_ROUND(V11, V1, V2, V3, V0)	// 62

	GAMMA_ROUND(V4, V5, V8, V9, V11)
	SIGMA_ROUND(V4, V0, V1, V2, V3)		// 64
	GAMMA_ROUND(V5, V6, V9, V10, V4)
	SIGMA_ROUND(V5, V3, V0, V1, V2)		// 66
	GAMMA_ROUND(V6, V7, V10, V11, V5)
	SIGMA_ROUND(V6, V2, V3, V0, V1)		// 68
	GAMMA_ROUND(V7, V8, V11, V4, V6)
	SIGMA_ROUND(V7, V1, V2, V3, V0)		// 70
	GAMMA_ROUND(V8, V9, V4, V5, V7)
	SIGMA_ROUND(V8, V0, V1, V2, V3)		// 72
	GAMMA_ROUND(V9, V10, V5, V6, V8)
	SIGMA_ROUND(V9, V3, V0, V1, V2)		// 74
	GAMMA_ROUND(V10, V11, V6, V7, V9)
	SIGMA_ROUND(V10, V2, V3, V0, V1)	// 76
	GAMMA_ROUND(V11, V4, V7, V8, V10)
	SIGMA_ROUND(V11, V1, V2, V3, V0)	// 78

	// Feed-forward: add the saved chaining value into the working state.
	// The sum is left in V0-V3 so it is both the hash so far and the starting
	// state of the next block.
	VADD	V23.D2, V0.D2, V0.D2
	VADD	V24.D2, V1.D2, V1.D2
	VADD	V25.D2, V2.D2, V2.D2
	VADD	V26.D2, V3.D2, V3.D2

	// The 40 SIGMA_ROUNDs advanced R3 by 40*16 = 640 bytes; rewind it to the
	// beginning of the round constant table.
	SUB	$640, R3, R3

	// Continue while there are blocks left.
	SUB	$1, R2, R2
	CBNZ	R2, loop

	VST1	[V0.D2, V1.D2, V2.D2, V3.D2], (R0)

done:
	RET
| 147 | + |
// round_consts holds the 80 SHA-512 round constants K[0..79] as consecutive
// 64-bit words (80 * 8 = 640 bytes). blockNEON walks the table 16 bytes at a
// time, two constants per double round.
//
// NOTE(review): the symbol carries neither a package prefix nor the "<>"
// file-local suffix, so it is visible to the linker under the bare name
// round_consts. Consider making it file-local together with its reference.
DATA round_consts+0x000(SB)/8, $0x428a2f98d728ae22
DATA round_consts+0x008(SB)/8, $0x7137449123ef65cd
DATA round_consts+0x010(SB)/8, $0xb5c0fbcfec4d3b2f
DATA round_consts+0x018(SB)/8, $0xe9b5dba58189dbbc
DATA round_consts+0x020(SB)/8, $0x3956c25bf348b538
DATA round_consts+0x028(SB)/8, $0x59f111f1b605d019
DATA round_consts+0x030(SB)/8, $0x923f82a4af194f9b
DATA round_consts+0x038(SB)/8, $0xab1c5ed5da6d8118
DATA round_consts+0x040(SB)/8, $0xd807aa98a3030242
DATA round_consts+0x048(SB)/8, $0x12835b0145706fbe
DATA round_consts+0x050(SB)/8, $0x243185be4ee4b28c
DATA round_consts+0x058(SB)/8, $0x550c7dc3d5ffb4e2
DATA round_consts+0x060(SB)/8, $0x72be5d74f27b896f
DATA round_consts+0x068(SB)/8, $0x80deb1fe3b1696b1
DATA round_consts+0x070(SB)/8, $0x9bdc06a725c71235
DATA round_consts+0x078(SB)/8, $0xc19bf174cf692694
DATA round_consts+0x080(SB)/8, $0xe49b69c19ef14ad2
DATA round_consts+0x088(SB)/8, $0xefbe4786384f25e3
DATA round_consts+0x090(SB)/8, $0x0fc19dc68b8cd5b5
DATA round_consts+0x098(SB)/8, $0x240ca1cc77ac9c65
DATA round_consts+0x0A0(SB)/8, $0x2de92c6f592b0275
DATA round_consts+0x0A8(SB)/8, $0x4a7484aa6ea6e483
DATA round_consts+0x0B0(SB)/8, $0x5cb0a9dcbd41fbd4
DATA round_consts+0x0B8(SB)/8, $0x76f988da831153b5
DATA round_consts+0x0C0(SB)/8, $0x983e5152ee66dfab
DATA round_consts+0x0C8(SB)/8, $0xa831c66d2db43210
DATA round_consts+0x0D0(SB)/8, $0xb00327c898fb213f
DATA round_consts+0x0D8(SB)/8, $0xbf597fc7beef0ee4
DATA round_consts+0x0E0(SB)/8, $0xc6e00bf33da88fc2
DATA round_consts+0x0E8(SB)/8, $0xd5a79147930aa725
DATA round_consts+0x0F0(SB)/8, $0x06ca6351e003826f
DATA round_consts+0x0F8(SB)/8, $0x142929670a0e6e70
DATA round_consts+0x100(SB)/8, $0x27b70a8546d22ffc
DATA round_consts+0x108(SB)/8, $0x2e1b21385c26c926
DATA round_consts+0x110(SB)/8, $0x4d2c6dfc5ac42aed
DATA round_consts+0x118(SB)/8, $0x53380d139d95b3df
DATA round_consts+0x120(SB)/8, $0x650a73548baf63de
DATA round_consts+0x128(SB)/8, $0x766a0abb3c77b2a8
DATA round_consts+0x130(SB)/8, $0x81c2c92e47edaee6
DATA round_consts+0x138(SB)/8, $0x92722c851482353b
DATA round_consts+0x140(SB)/8, $0xa2bfe8a14cf10364
DATA round_consts+0x148(SB)/8, $0xa81a664bbc423001
DATA round_consts+0x150(SB)/8, $0xc24b8b70d0f89791
DATA round_consts+0x158(SB)/8, $0xc76c51a30654be30
DATA round_consts+0x160(SB)/8, $0xd192e819d6ef5218
DATA round_consts+0x168(SB)/8, $0xd69906245565a910
DATA round_consts+0x170(SB)/8, $0xf40e35855771202a
DATA round_consts+0x178(SB)/8, $0x106aa07032bbd1b8
DATA round_consts+0x180(SB)/8, $0x19a4c116b8d2d0c8
DATA round_consts+0x188(SB)/8, $0x1e376c085141ab53
DATA round_consts+0x190(SB)/8, $0x2748774cdf8eeb99
DATA round_consts+0x198(SB)/8, $0x34b0bcb5e19b48a8
DATA round_consts+0x1A0(SB)/8, $0x391c0cb3c5c95a63
DATA round_consts+0x1A8(SB)/8, $0x4ed8aa4ae3418acb
DATA round_consts+0x1B0(SB)/8, $0x5b9cca4f7763e373
DATA round_consts+0x1B8(SB)/8, $0x682e6ff3d6b2b8a3
DATA round_consts+0x1C0(SB)/8, $0x748f82ee5defb2fc
DATA round_consts+0x1C8(SB)/8, $0x78a5636f43172f60
DATA round_consts+0x1D0(SB)/8, $0x84c87814a1f0ab72
DATA round_consts+0x1D8(SB)/8, $0x8cc702081a6439ec
DATA round_consts+0x1E0(SB)/8, $0x90befffa23631e28
DATA round_consts+0x1E8(SB)/8, $0xa4506cebde82bde9
DATA round_consts+0x1F0(SB)/8, $0xbef9a3f7b2c67915
DATA round_consts+0x1F8(SB)/8, $0xc67178f2e372532b
DATA round_consts+0x200(SB)/8, $0xca273eceea26619c
DATA round_consts+0x208(SB)/8, $0xd186b8c721c0c207
DATA round_consts+0x210(SB)/8, $0xeada7dd6cde0eb1e
DATA round_consts+0x218(SB)/8, $0xf57d4f7fee6ed178
DATA round_consts+0x220(SB)/8, $0x06f067aa72176fba
DATA round_consts+0x228(SB)/8, $0x0a637dc5a2c898a6
DATA round_consts+0x230(SB)/8, $0x113f9804bef90dae
DATA round_consts+0x238(SB)/8, $0x1b710b35131c471b
DATA round_consts+0x240(SB)/8, $0x28db77f523047d84
DATA round_consts+0x248(SB)/8, $0x32caab7b40c72493
DATA round_consts+0x250(SB)/8, $0x3c9ebe0a15c9bebc
DATA round_consts+0x258(SB)/8, $0x431d67c49c100d4c
DATA round_consts+0x260(SB)/8, $0x4cc5d4becb3e42b6
DATA round_consts+0x268(SB)/8, $0x597f299cfc657e2a
DATA round_consts+0x270(SB)/8, $0x5fcb6fab3ad6faec
DATA round_consts+0x278(SB)/8, $0x6c44198c4a475817
GLOBL round_consts(SB), (NOPTR+RODATA), $640
0 commit comments