/*	$OpenBSD: memcpy.S,v 1.6 2015/06/08 14:22:05 jsg Exp $	*/
/*	$NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g.
 * Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
 * Additional registers are preserved prior to use i.e.
r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

.syntax unified

/*
 * memcpy / memmove -- overlap-safe byte copy (32-bit ARM, UAL syntax).
 *
 * ENTRY/ENTRY_NP come from <machine/asm.h>; presumably they only emit the
 * two global labels, so both names land on the same code -- TODO confirm.
 *
 * In:		r0 = destination, r1 = source, r2 = byte count
 * Out:		r0 = destination exactly as passed in
 * Modified:	r1, r2, r3, ip (r12) and the condition flags
 * Preserved:	r4, r5 (pushed before use, popped afterwards); lr is pushed
 *		on entry and popped straight into pc on return
 *
 * The copy runs forwards when source is above destination (unsigned
 * compare) and backwards when it is below; equal pointers return at once.
 *
 * Throughout, r2 is held a fixed amount below the number of bytes still to
 * copy (4, 12, 16 or 32 depending on the stage), so the flags from a single
 * SUBS/ADDS/CMN say whether another chunk of that size is available.
 *
 * The unaligned-source paths read whole aligned words and splice them with
 * shifts; the shift directions used are those for little-endian byte order.
 *
 * NOTE(review): every length test uses signed conditions (blt/bge/gt), so a
 * count of 2^31 or more is not treated as a large unsigned size.
 */
ENTRY(memcpy)
ENTRY_NP(memmove)
	/* Choose the direction; identical pointers need no work at all */
	cmp	r1, r0
	moveq	pc, lr

	/* Stash dest (the return value) and lr so each exit is a single pop */
	stmdb	sp!, {r0, lr}

	bcc	Lmemcpy_backwards	/* src below dest: copy top-down */

	/* ---- forward copy ---- */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* fewer than 4 bytes in total */
	ands	ip, r0, #3
	bne	Lmemcpy_fdestul		/* destination not word aligned */
	ands	ip, r1, #3
	bne	Lmemcpy_fsrcul		/* source not word aligned */

Lmemcpy_ft8:
	/* Both pointers are word aligned here; r2 = remaining - 4 */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* fewer than 12 bytes left */
	subs	r2, r2, #20
	blt	Lmemcpy_fl32		/* fewer than 32 bytes left */
	stmdb	sp!, {r4}		/* r4 becomes the fourth data register */

	/* 32 bytes per pass, moved as two 16-byte bursts */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, ip, lr}
	stmia	r0!, {r3, r4, ip, lr}
	ldmia	r1!, {r3, r4, ip, lr}
	stmia	r0!, {r3, r4, ip, lr}
	subs	r2, r2, #32
	bge	Lmemcpy_floop32

	cmn	r2, #16			/* r2 + 16 >= 0 <=> 16 or more left */
	ldmiage	r1!, {r3, r4, ip, lr}
	stmiage	r0!, {r3, r4, ip, lr}
	subge	r2, r2, #16
	ldmia	sp!, {r4}		/* give r4 back */

Lmemcpy_fl32:
	adds	r2, r2, #20		/* r2 = remaining - 12, flags set */

	/* 12 bytes per pass; a failed GE skips the SUBS and ends the loop */
Lmemcpy_floop12:
	ldmiage	r1!, {r3, ip, lr}
	stmiage	r0!, {r3, ip, lr}
	subsge	r2, r2, #12
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	Lmemcpy_fl4

	/* 4..11 bytes left: one word if under 8, otherwise two words */
	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmiage	r1!, {r3, ip}
	stmiage	r0!, {r3, ip}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* Fewer than 4 bytes to go; remove the bias to get the real count */
	adds	r2, r2, #4
	/* NOTE(review): this is the only exit with a __APCS_26_ variant */
#ifdef __APCS_26_
	ldmiaeq sp!, {r0, pc}^		/* done */
#else
	ldmiaeq	sp!, {r0, pc}		/* done */
#endif
	/* 1, 2 or 3 bytes: first is unconditional, GE adds a 2nd, GT a 3rd */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* Destination misaligned: byte-copy up to the next word boundary */
Lmemcpy_fdestul:
	rsb	ip, ip, #4		/* ip = 4 - (dest & 3) = bytes to copy */
	cmp	ip, #2

	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	subs	r2, r2, ip
	blt	Lmemcpy_fl4		/* fewer than 4 bytes left */

	ands	ip, r1, #3
	beq	Lmemcpy_ft8		/* source is aligned as well */

	/*
	 * Source misaligned (ip = src & 3) with the destination aligned.
	 * r1 is rounded down and then always points at the next aligned word
	 * to fetch; lr holds the most recently fetched word, whose upper
	 * bytes have not been written out yet.
	 */
Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	ip, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2

	/* source offset 1: each output word = 3 bytes of lr + 1 new byte */
	cmp	r2, #12
	blt	Lmemcpy_fsrcul1loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #12		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	lsr	r3, lr, #8
	ldmia	r1!, {r4, r5, ip, lr}
	orr	r3, r3, r4, lsl #24
	lsr	r4, r4, #8
	orr	r4, r4, r5, lsl #24
	lsr	r5, r5, #8
	orr	r5, r5, ip, lsl #24
	lsr	ip, ip, #8
	orr	ip, ip, lr, lsl #24
	stmia	r0!, {r3-r5, ip}
	subs	r2, r2, #16
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #12		/* back to r2 = remaining - 4 */
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	lsr	ip, lr, #8
	ldr	lr, [r1], #4
	orr	ip, ip, lr, lsl #24
	str	ip, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* rewind to the first uncopied byte */
	b	Lmemcpy_fl4

	/* source offset 2: each output word = 2 bytes of lr + 2 new bytes */
Lmemcpy_fsrcul2:
	cmp	r2, #12
	blt	Lmemcpy_fsrcul2loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #12		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	lsr	r3, lr, #16
	ldmia	r1!, {r4, r5, ip, lr}
	orr	r3, r3, r4, lsl #16
	lsr	r4, r4, #16
	orr	r4, r4, r5, lsl #16
	lsr	r5, r5, #16
	orr	r5, r5, ip, lsl #16
	lsr	ip, ip, #16
	orr	ip, ip, lr, lsl #16
	stmia	r0!, {r3-r5, ip}
	subs	r2, r2, #16
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #12		/* back to r2 = remaining - 4 */
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	lsr	ip, lr, #16
	ldr	lr, [r1], #4
	orr	ip, ip, lr, lsl #16
	str	ip, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* rewind to the first uncopied byte */
	b	Lmemcpy_fl4

	/* source offset 3: each output word = 1 byte of lr + 3 new bytes */
Lmemcpy_fsrcul3:
	cmp	r2, #12
	blt	Lmemcpy_fsrcul3loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #12		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	lsr	r3, lr, #24
	ldmia	r1!, {r4, r5, ip, lr}
	orr	r3, r3, r4, lsl #8
	lsr	r4, r4, #24
	orr	r4, r4, r5, lsl #8
	lsr	r5, r5, #24
	orr	r5, r5, ip, lsl #8
	lsr	ip, ip, #24
	orr	ip, ip, lr, lsl #8
	stmia	r0!, {r3-r5, ip}
	subs	r2, r2, #16
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #12		/* back to r2 = remaining - 4 */
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	lsr	ip, lr, #24
	ldr	lr, [r1], #4
	orr	ip, ip, lr, lsl #8
	str	ip, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* rewind to the first uncopied byte */
	b	Lmemcpy_fl4

	/* ---- backward copy ---- */
Lmemcpy_backwards:
	/* Move both pointers one past the last byte and work downwards */
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* fewer than 4 bytes in total */
	ands	ip, r0, #3
	bne	Lmemcpy_bdestul		/* destination end not word aligned */
	ands	ip, r1, #3
	bne	Lmemcpy_bsrcul		/* source end not word aligned */

Lmemcpy_bt8:
	/* Both end pointers are word aligned here; r2 = remaining - 4 */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* fewer than 12 bytes left */
	stmdb	sp!, {r4}		/* pushed before the branch: bl32 pops it */
	subs	r2, r2, #20
	blt	Lmemcpy_bl32		/* fewer than 32 bytes left */

	/* 32 bytes per pass, moved as two 16-byte bursts */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, ip, lr}
	stmdb	r0!, {r3, r4, ip, lr}
	ldmdb	r1!, {r3, r4, ip, lr}
	stmdb	r0!, {r3, r4, ip, lr}
	subs	r2, r2, #32
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #16			/* r2 + 16 >= 0 <=> 16 or more left */
	ldmdbge	r1!, {r3, r4, ip, lr}
	stmdbge	r0!, {r3, r4, ip, lr}
	subge	r2, r2, #16
	adds	r2, r2, #20		/* r2 = remaining - 12, flags set */
	ldmdbge	r1!, {r3, ip, lr}	/* one more 12-byte chunk if it fits */
	stmdbge	r0!, {r3, ip, lr}
	subge	r2, r2, #12
	ldmia	sp!, {r4}		/* give r4 back */

Lmemcpy_bl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	Lmemcpy_bl4
	/* 4..11 bytes left: one word if under 8, otherwise two words */
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmdbge	r1!, {r3, ip}
	stmdbge	r0!, {r3, ip}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* Fewer than 4 bytes to go; remove the bias to get the real count */
	adds	r2, r2, #4
	ldmiaeq	sp!, {r0, pc}

	/* 1, 2 or 3 bytes: first is unconditional, GE adds a 2nd, GT a 3rd */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	ldmia	sp!, {r0, pc}

	/* Destination end misaligned: byte-copy down to the word boundary */
Lmemcpy_bdestul:
	cmp	ip, #2			/* ip = dest & 3 = bytes to copy */

	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	subs	r2, r2, ip
	blt	Lmemcpy_bl4		/* fewer than 4 bytes left */
	ands	ip, r1, #3
	beq	Lmemcpy_bt8		/* source is aligned as well */

	/*
	 * Source end misaligned (ip = src & 3) with the destination aligned.
	 * r1 is rounded down and then always points at the lowest aligned
	 * word fetched so far; r3 holds that word, whose lower bytes have
	 * not been written out yet.
	 */
Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1]
	cmp	ip, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2

	/* source offset 3: each output word = 3 bytes of r3 + 1 new byte */
	cmp	r2, #12
	blt	Lmemcpy_bsrcul3loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #12		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul3loop16:
	lsl	lr, r3, #8
	ldmdb	r1!, {r3-r5, ip}
	orr	lr, lr, ip, lsr #24
	lsl	ip, ip, #8
	orr	ip, ip, r5, lsr #24
	lsl	r5, r5, #8
	orr	r5, r5, r4, lsr #24
	lsl	r4, r4, #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, ip, lr}
	subs	r2, r2, #16
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #12		/* back to r2 = remaining - 4 */
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	lsl	ip, r3, #8
	ldr	r3, [r1, #-4]!
	orr	ip, ip, r3, lsr #24
	str	ip, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* back to one past the last uncopied byte */
	b	Lmemcpy_bl4

	/* source offset 2: each output word = 2 bytes of r3 + 2 new bytes */
Lmemcpy_bsrcul2:
	cmp	r2, #12
	blt	Lmemcpy_bsrcul2loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #12		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul2loop16:
	lsl	lr, r3, #16
	ldmdb	r1!, {r3-r5, ip}
	orr	lr, lr, ip, lsr #16
	lsl	ip, ip, #16
	orr	ip, ip, r5, lsr #16
	lsl	r5, r5, #16
	orr	r5, r5, r4, lsr #16
	lsl	r4, r4, #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, ip, lr}
	subs	r2, r2, #16
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #12		/* back to r2 = remaining - 4 */
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	lsl	ip, r3, #16
	ldr	r3, [r1, #-4]!
	orr	ip, ip, r3, lsr #16
	str	ip, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* back to one past the last uncopied byte */
	b	Lmemcpy_bl4

	/* source offset 1: each output word = 1 byte of r3 + 3 new bytes */
Lmemcpy_bsrcul1:
	cmp	r2, #12
	blt	Lmemcpy_bsrcul1loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #12		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul1loop16:
	lsl	lr, r3, #24
	ldmdb	r1!, {r3-r5, ip}
	orr	lr, lr, ip, lsr #8
	lsl	ip, ip, #24
	orr	ip, ip, r5, lsr #8
	lsl	r5, r5, #24
	orr	r5, r5, r4, lsr #8
	lsl	r4, r4, #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, ip, lr}
	subs	r2, r2, #16
	bge	Lmemcpy_bsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #12		/* back to r2 = remaining - 4 */
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	lsl	ip, r3, #24
	ldr	r3, [r1, #-4]!
	orr	ip, ip, r3, lsr #8
	str	ip, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* back to one past the last uncopied byte */
	b	Lmemcpy_bl4