blob: 81dd0c4235001521a16af5dbfbdebbcec5f85daa [file] [log] [blame]
/*
* ====================================================
* Copyright (C) 2007 by Ellips BV. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#include "x86_64mach.h"
/*
 * void *memcpy (void *dst, const void *src, size_t n)
 *
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * run through the C preprocessor.  The bare register names are presumably
 * mapped to prefixed registers by x86_64mach.h.
 *
 * In:    rdi = dst, rsi = src, rdx = n (System V AMD64 argument order)
 * Out:   rax = dst as passed in
 * Uses:  rcx, rsi, rdi, r8-r11 and flags as scratch.  rax and the
 *        callee-saved r12-r14 are overwritten only on the large-copy path,
 *        where they are pushed first and popped before returning.
 * Note:  no cld is issued; the string instructions rely on the direction
 *        flag being clear on entry.
 *
 * Strategy:
 *   n < 16              byte copy
 *   n >= 16             byte copy until dst is 8-byte aligned, then
 *     remaining < 256     rep movsq followed by a byte tail
 *     remaining >= 256    128-byte blocks written with non-temporal stores,
 *                         followed by a byte tail
 *
 * Every internal branch target carries the .L prefix so that it is an
 * assembler-local label: the only symbol this routine contributes is
 * memcpy itself, and addresses inside it symbolize as memcpy+offset.
 */
.global SYM (memcpy)
SOTYPE_FUNCTION(memcpy)

SYM (memcpy):
        movq    rdi, rax                /* Store destination in return value */
        cmpq    $16, rdx
        jb      .Lbyte_copy             /* Short copy: skip the alignment work */

        movq    rdi, r8                 /* Align destination on quad word boundary */
        andq    $7, r8                  /* r8 = dst mod 8 */
        jz      .Lquadword_aligned
        movq    $8, rcx
        subq    r8, rcx                 /* rcx = bytes up to the next 8-byte boundary (1..7) */
        subq    rcx, rdx                /* n >= 16 here, so this cannot go below zero */
        rep     movsb

.Lquadword_aligned:
        cmpq    $256, rdx
        jb      .Lquadword_copy

        /* Large copy: eight data registers are needed, so free up rax
           (holds the return value) and three callee-saved registers. */
        pushq   rax
        pushq   r12
        pushq   r13
        pushq   r14

        movq    rdx, rcx                /* Copy 128 bytes at a time with minimum cache pollution */
        shrq    $7, rcx                 /* rcx = number of 128-byte blocks, at least 2 here */

        .p2align 4
.Lblock_loop:
        prefetchnta 768 (rsi)           /* Hint the two source lines 768 and 832 bytes ahead */
        prefetchnta 832 (rsi)

        movq    (rsi), rax              /* First 64 bytes of the block */
        movq    8 (rsi), r8
        movq    16 (rsi), r9
        movq    24 (rsi), r10
        movq    32 (rsi), r11
        movq    40 (rsi), r12
        movq    48 (rsi), r13
        movq    56 (rsi), r14

        movntiq rax, (rdi)
        movntiq r8 , 8 (rdi)
        movntiq r9 , 16 (rdi)
        movntiq r10, 24 (rdi)
        movntiq r11, 32 (rdi)
        movntiq r12, 40 (rdi)
        movntiq r13, 48 (rdi)
        movntiq r14, 56 (rdi)

        movq    64 (rsi), rax           /* Second 64 bytes of the block */
        movq    72 (rsi), r8
        movq    80 (rsi), r9
        movq    88 (rsi), r10
        movq    96 (rsi), r11
        movq    104 (rsi), r12
        movq    112 (rsi), r13
        movq    120 (rsi), r14

        movntiq rax, 64 (rdi)
        movntiq r8 , 72 (rdi)
        movntiq r9 , 80 (rdi)
        movntiq r10, 88 (rdi)
        movntiq r11, 96 (rdi)
        movntiq r12, 104 (rdi)
        movntiq r13, 112 (rdi)
        movntiq r14, 120 (rdi)

        leaq    128 (rsi), rsi          /* Advance both pointers; lea leaves the flags alone */
        leaq    128 (rdi), rdi

        decq    rcx
        jnz     .Lblock_loop

        sfence                          /* Fence after the non-temporal stores */
        movq    rdx, rcx
        andq    $127, rcx               /* rcx = bytes left over after the 128-byte blocks */
        rep     movsb

        popq    r14                     /* Restore in reverse push order */
        popq    r13
        popq    r12
        popq    rax                     /* rax = original destination again */
        ret

.Lbyte_copy:
        movq    rdx, rcx                /* Copy all n bytes one at a time */
        rep     movsb
        ret

.Lquadword_copy:
        movq    rdx, rcx
        shrq    $3, rcx                 /* rcx = number of whole quad words */
        rep     movsq
        movq    rdx, rcx
        andq    $7, rcx
        rep     movsb                   /* Copy the remaining bytes */
        ret