본문 바로가기
Bioinformatics/Algorithms

memcpy 성능 평가

by 임은천 2014. 9. 11.

본 글은 http://mail-index.netbsd.org/tech-perform/2002/10/16/0000.html 에서 가지고 왔다.

 

 

Subject: Performance of various memcpy()'s
To: None <tech-perform@netbsd.org>
From: Bang Jun-Young <junyoung@mogua.com>
List: tech-perform
Date: 10/16/2002 04:18:30

--mYCpIKhGyMATD0i+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Hi,

About 14 months ago, I had some discussion on memcpy performance on
i386 platform here. Months later, I took a look into it again, and
now am coming with (not-so-)new benchmark results (attached). The
tests were performed on Athlon XP 1800 and DDR 256MB. 

From the results, it's obvious that memcpy() using MMX insns is the
best for in-cache sized data, typically 50-100% faster than plain old
memcpy for data <= 32 KB.

Another attached patch is an i686 version of copyin(9) that makes use
of MMX insns. It works well with integer-only programs, but doesn't
with ones like XFree86 that use FP ops. In this case, it would be
helpful if NPX handling code was imported from FreeBSD (they have
i586 optimized version of copyin/out(9)). Can anybody give me some
comments wrt this?

Jun-Young

-- 
Bang Jun-Young <junyoung@mogua.com>

--mYCpIKhGyMATD0i+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="memcpy_bench.txt"

addr1=0x804c000 addr2=0x804c080
memcpy 64B -- 16777216 loops
  aligned blocks
      libc memcpy                                        0.796562 s
      MMX memcpy using MOVQ                              0.332473 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.437861 s
      with simple MOVUSB (no prefetch)                   0.477142 s
      arjanv's MOVQ (with prefetch)                      0.397613 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.386256 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.468275 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.794225 s
      MMX memcpy using MOVQ                              0.408814 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.469252 s
      with simple MOVUSB (no prefetch)                   0.542820 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.931550 s
      MMX memcpy using MOVQ                              0.465778 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.556663 s
      with simple MOVUSB (no prefetch)                   0.545896 s

addr1=0x804c000 addr2=0x804c100
memcpy 128B -- 8388608 loops
  aligned blocks
      libc memcpy                                        0.511865 s
      MMX memcpy using MOVQ                              0.233085 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.314226 s
      with simple MOVUSB (no prefetch)                   0.363533 s
      arjanv's MOVQ (with prefetch)                      0.266980 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.255603 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.273115 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.513906 s
      MMX memcpy using MOVQ                              0.295375 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.351025 s
      with simple MOVUSB (no prefetch)                   0.412870 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.647541 s
      MMX memcpy using MOVQ                              0.381870 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.403322 s
      with simple MOVUSB (no prefetch)                   0.421661 s

addr1=0x804c000 addr2=0x804c200
memcpy 256B -- 4194304 loops
  aligned blocks
      libc memcpy                                        0.380581 s
      MMX memcpy using MOVQ                              0.173247 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.252765 s
      with simple MOVUSB (no prefetch)                   0.320588 s
      arjanv's MOVQ (with prefetch)                      0.196009 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.211234 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.198807 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.379022 s
      MMX memcpy using MOVQ                              0.241409 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.295835 s
      with simple MOVUSB (no prefetch)                   0.388839 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.505536 s
      MMX memcpy using MOVQ                              0.343646 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.318098 s
      with simple MOVUSB (no prefetch)                   0.359642 s

addr1=0x804c000 addr2=0x804c400
memcpy 512B -- 2097152 loops
  aligned blocks
      libc memcpy                                        0.309567 s
      MMX memcpy using MOVQ                              0.161895 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.259866 s
      with simple MOVUSB (no prefetch)                   0.299634 s
      arjanv's MOVQ (with prefetch)                      0.171824 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.204493 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.159063 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.310000 s
      MMX memcpy using MOVQ                              0.210169 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.293950 s
      with simple MOVUSB (no prefetch)                   0.347535 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.434541 s
      MMX memcpy using MOVQ                              0.318089 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.301053 s
      with simple MOVUSB (no prefetch)                   0.350758 s

addr1=0x804c000 addr2=0x804c800
memcpy 1024B -- 1048576 loops
  aligned blocks
      libc memcpy                                        0.276199 s
      MMX memcpy using MOVQ                              0.170408 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.232004 s
      with simple MOVUSB (no prefetch)                   0.274786 s
      arjanv's MOVQ (with prefetch)                      0.168275 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.192419 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.157286 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.276402 s
      MMX memcpy using MOVQ                              0.208041 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.264838 s
      with simple MOVUSB (no prefetch)                   0.321226 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.399037 s
      MMX memcpy using MOVQ                              0.317386 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.269808 s
      with simple MOVUSB (no prefetch)                   0.323063 s

addr1=0x804c000 addr2=0x804f000
memcpy 2048B -- 524288 loops
  aligned blocks
      libc memcpy                                        0.259386 s
      MMX memcpy using MOVQ                              0.164728 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.216017 s
      with simple MOVUSB (no prefetch)                   0.262353 s
      arjanv's MOVQ (with prefetch)                      0.160822 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.188910 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.148048 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.259205 s
      MMX memcpy using MOVQ                              0.194549 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.264177 s
      with simple MOVUSB (no prefetch)                   0.308492 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.381286 s
      MMX memcpy using MOVQ                              0.306385 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.256044 s
      with simple MOVUSB (no prefetch)                   0.309575 s

addr1=0x8050000 addr2=0x8052000
memcpy 4kB -- 262144 loops
  aligned blocks
      libc memcpy                                        0.251069 s
      MMX memcpy using MOVQ                              0.161883 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.250987 s
      with simple MOVUSB (no prefetch)                   0.256146 s
      arjanv's MOVQ (with prefetch)                      0.251169 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.256027 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.207190 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.250998 s
      MMX memcpy using MOVQ                              0.188332 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.267415 s
      with simple MOVUSB (no prefetch)                   0.301825 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.372422 s
      MMX memcpy using MOVQ                              0.300877 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.264033 s
      with simple MOVUSB (no prefetch)                   0.302476 s

addr1=0x804f000 addr2=0x8054000
memcpy 8kB -- 131072 loops
  aligned blocks
      libc memcpy                                        0.246683 s
      MMX memcpy using MOVQ                              0.160469 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.223785 s
      with simple MOVUSB (no prefetch)                   0.253043 s
      arjanv's MOVQ (with prefetch)                      0.198100 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.220333 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.165994 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.246569 s
      MMX memcpy using MOVQ                              0.184975 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.244896 s
      with simple MOVUSB (no prefetch)                   0.298646 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.367977 s
      MMX memcpy using MOVQ                              0.298119 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.245495 s
      with simple MOVUSB (no prefetch)                   0.298924 s

addr1=0x804f000 addr2=0x8057000
memcpy 16kB -- 65536 loops
  aligned blocks
      libc memcpy                                        0.246980 s
      MMX memcpy using MOVQ                              0.159769 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.274188 s
      with simple MOVUSB (no prefetch)                   0.251510 s
      arjanv's MOVQ (with prefetch)                      0.174101 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.278145 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.306673 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.247309 s
      MMX memcpy using MOVQ                              0.183421 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.460254 s
      with simple MOVUSB (no prefetch)                   0.297058 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.368122 s
      MMX memcpy using MOVQ                              0.296768 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.299830 s
      with simple MOVUSB (no prefetch)                   0.297180 s

addr1=0x804f000 addr2=0x805c000
memcpy 32kB -- 32768 loops
  aligned blocks
      libc memcpy                                        0.246418 s
      MMX memcpy using MOVQ                              0.161774 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.291646 s
      with simple MOVUSB (no prefetch)                   0.252990 s
      arjanv's MOVQ (with prefetch)                      0.168720 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.220957 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.279949 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.253483 s
      MMX memcpy using MOVQ                              0.189459 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.330665 s
      with simple MOVUSB (no prefetch)                   0.299876 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.371691 s
      MMX memcpy using MOVQ                              0.280076 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.325644 s
      with simple MOVUSB (no prefetch)                   0.299598 s

addr1=0x805f000 addr2=0x8070000
memcpy 64kB -- 16384 loops
  aligned blocks
      libc memcpy                                        0.557651 s
      MMX memcpy using MOVQ                              0.484263 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.513905 s
      with simple MOVUSB (no prefetch)                   0.504620 s
      arjanv's MOVQ (with prefetch)                      0.481128 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.514562 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.513256 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.573247 s
      MMX memcpy using MOVQ                              0.577181 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.511285 s
      with simple MOVUSB (no prefetch)                   0.596480 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.742978 s
      MMX memcpy using MOVQ                              0.657358 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.523992 s
      with simple MOVUSB (no prefetch)                   0.550855 s

addr1=0x805f000 addr2=0x8080000
memcpy 128kB -- 8192 loops
  aligned blocks
      libc memcpy                                        0.557362 s
      MMX memcpy using MOVQ                              0.480659 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.512456 s
      with simple MOVUSB (no prefetch)                   0.503718 s
      arjanv's MOVQ (with prefetch)                      0.477681 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.512652 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.511952 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        0.568122 s
      MMX memcpy using MOVQ                              0.575231 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.511856 s
      with simple MOVUSB (no prefetch)                   0.594627 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        0.742519 s
      MMX memcpy using MOVQ                              0.642598 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.514444 s
      with simple MOVUSB (no prefetch)                   0.549701 s

addr1=0x805f000 addr2=0x80a0000
memcpy 256kB -- 4096 loops
  aligned blocks
      libc memcpy                                        3.312519 s
      MMX memcpy using MOVQ                              2.991133 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.511574 s
      with simple MOVUSB (no prefetch)                   3.132933 s
      arjanv's MOVQ (with prefetch)                      2.855973 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        0.511921 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.511443 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        3.287035 s
      MMX memcpy using MOVQ                              3.142660 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.511720 s
      with simple MOVUSB (no prefetch)                   3.266173 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        3.434399 s
      MMX memcpy using MOVQ                              3.422942 s
      with mingo's MOVUSB (prefetch, non-temporal)       0.514526 s
      with simple MOVUSB (no prefetch)                   3.203671 s

addr1=0x805f000 addr2=0x80e0000
memcpy 512kB -- 2048 loops
  aligned blocks
      libc memcpy                                        3.318688 s
      MMX memcpy using MOVQ                              2.991518 s
      with mingo's MOVUSB (prefetch, non-temporal)       2.113506 s
      with simple MOVUSB (no prefetch)                   3.136624 s
      arjanv's MOVQ (with prefetch)                      2.856525 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        1.760890 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  1.892791 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        3.288139 s
      MMX memcpy using MOVQ                              3.144040 s
      with mingo's MOVUSB (prefetch, non-temporal)       1.998237 s
      with simple MOVUSB (no prefetch)                   3.270226 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        3.441033 s
      MMX memcpy using MOVQ                              3.426393 s
      with mingo's MOVUSB (prefetch, non-temporal)       2.332106 s
      with simple MOVUSB (no prefetch)                   3.209592 s

addr1=0x805f000 addr2=0x8160000
memcpy 1024kB -- 1024 loops
  aligned blocks
      libc memcpy                                        3.158626 s
      MMX memcpy using MOVQ                              2.801466 s
      with mingo's MOVUSB (prefetch, non-temporal)       1.963610 s
      with simple MOVUSB (no prefetch)                   2.986171 s
      arjanv's MOVQ (with prefetch)                      2.656920 s
      arjanv's MOVNTQ (with prefetch, for Athlon)        1.601385 s
      arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  1.727029 s
  +0/+4 moderately unaligned blocks
      libc memcpy                                        2.989358 s
      MMX memcpy using MOVQ                              2.831822 s
      with mingo's MOVUSB (prefetch, non-temporal)       1.909301 s
      with simple MOVUSB (no prefetch)                   3.057700 s
  +10/+13 cruelly unaligned blocks
      libc memcpy                                        3.195734 s
      MMX memcpy using MOVQ                              3.108697 s
      with mingo's MOVUSB (prefetch, non-temporal)       2.108903 s
      with simple MOVUSB (no prefetch)                   3.039293 s


--mYCpIKhGyMATD0i+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="memcpy_bench.c"

/* -*- c-file-style: "linux" -*- */

/* memcpy speed benchmark using different i86-specific routines. 
 *
 * Framework (C) 2001 by Martin Pool <mbp@samba.org>, based on speed.c
 * by tridge.
 *
 * Routines lifted from all kinds of places.
 *
 * You must not use floating-point code anywhere in this application
 * because it scribbles on the FP state and does not reset it.  */


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>



#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))

#include <sys/resource.h>
struct rusage tp1,tp2;

/*
 * Begin a timed region: snapshot this process's resource usage into
 * the file-level `tp1`, which end_timer() later reads back.
 */
static void start_timer()
{
	struct rusage *const snapshot = &tp1;

	getrusage(RUSAGE_SELF, snapshot);
}


/*
 * End a timed region: snapshot resource usage into the file-level `tp2`
 * and return the user-mode CPU time (ru_utime) that elapsed since the
 * snapshot held in `tp1`, expressed in microseconds.
 */
static long end_timer()
{
	const struct timeval *began = &tp1.ru_utime;
	const struct timeval *ended = &tp2.ru_utime;

	getrusage(RUSAGE_SELF,&tp2);
#if 0
	printf ("tp1 = %ld.%05ld, tp2 = %ld.%05ld\n", 
		(long) tp1.ru_utime.tv_sec, (long) tp1.ru_utime.tv_usec, 
		(long) tp2.ru_utime.tv_sec, (long) tp2.ru_utime.tv_usec);
#endif

	/* whole seconds scaled to microseconds, plus the sub-second delta */
	return ((ended->tv_sec - began->tv_sec) * 1000000 +
		(ended->tv_usec - began->tv_usec));
}




/*
 * By Ingo Molnar and Doug Ledford; hacked up to remove
 * kernel-specific stuff like saving/restoring float registers.
 *
 * http://people.redhat.com/mingo/mmx-patches/mmx-2.3.99-A0
 *
 * memcpy_movusb -- copy `n` bytes from `from` to `to` using unaligned SSE
 * loads (movups) and non-temporal stores (movntps) with prefetchnta.
 *
 * Strategy: one unaligned 16-byte move brings `to` up to a 16-byte
 * boundary, one unaligned 16-byte move covers the ragged tail, and the
 * remaining multiple of 16 bytes is streamed in 32-byte steps followed by
 * at most one 16-byte block.
 *
 * Returns `to` (the original destination), like memcpy().  The regions
 * must not overlap.
 *
 * NOTE: no sfence is issued after the non-temporal stores; a caller that
 * hands the buffer to another CPU or device must fence first.
 */
void *
memcpy_movusb (void *to, const void *from, size_t n)
{
	unsigned char *dst = to;
	const unsigned char *src = from;
	size_t size;

#define STEP 0x20
#define ALIGN 0x10
	/*
	 * The head/tail fix-ups below always move a full 16 bytes, so they
	 * are only safe when the request itself is at least that large.
	 */
	if (n < ALIGN) {
		while (n-- > 0)
			*dst++ = *src++;
		return to;
	}

	if ((unsigned long)dst & (ALIGN-1)) {
		size = ALIGN - ((unsigned long)dst & (ALIGN-1));
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     :
				     : "r" (src),
				     "r" (dst)
				     : "memory", "xmm0");
		n -= size;
		src += size;
		dst += size;
	}
/*
 * If the copy would have tailings, take care of them
 * now instead of later.  The move covers the last 16 bytes of the
 * request; it is addressed from the end so that the offset cannot
 * underflow when fewer than 16 bytes remain after the head fix-up.
 */
	if (n & (ALIGN-1)) {
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     :
				     : "r" (src + n - ALIGN),
				     "r" (dst + n - ALIGN)
				     : "memory", "xmm0");
		n &= ~(size_t)(ALIGN-1);
	}
/*
 * Prefetch the first two cachelines now.
 */
	__asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
			     "prefetchnta 0x20(%0)\n\t"
			     :
			     : "r" (src));
	  
	while (n >= STEP) {
		__asm__ __volatile__(
			"movups 0x00(%0),%%xmm0\n\t"
			"movups 0x10(%0),%%xmm1\n\t"
			"movntps %%xmm0,0x00(%1)\n\t"
			"movntps %%xmm1,0x10(%1)\n\t"
			: 
			: "r" (src), "r" (dst)
			: "memory", "xmm0", "xmm1");
		src += STEP;
		/*
		 * Note: Intermixing the prefetch at *exactly* this point
		 * in time has been shown to be the fastest possible.
		 * Timing these prefetch instructions is a complete black
		 * art with nothing but trial and error showing the way.
		 * To that extent, this optimum version was found by using
		 * a userland version of this routine that we clocked for
		 * lots of runs.  We then fiddled with ordering until we
		 * settled on our highest speed routines.  So, the long
		 * and short of this is, don't mess with instruction ordering
		 * here or suffer performance penalties you will.
		 */
		__asm__ __volatile__(
			"prefetchnta 0x20(%0)\n\t"
			: 
			: "r" (src));
		dst += STEP;
		n -= STEP;
	}

	/*
	 * n is a multiple of 16 here, so an odd multiple leaves exactly one
	 * 16-byte block that the 32-byte loop cannot take.  dst is still
	 * 16-byte aligned, as movntps requires.
	 */
	if (n >= ALIGN) {
		__asm__ __volatile__(
			"movups (%0),%%xmm0\n\t"
			"movntps %%xmm0,(%1)\n\t"
			:
			: "r" (src), "r" (dst)
			: "memory", "xmm0");
	}
	
	return to;
}

/*
 * memcpy_simple_movusb -- copy `n` bytes from `from` to `to` using plain
 * unaligned SSE moves (movups for both loads and stores), without
 * prefetching or non-temporal stores.
 *
 * Strategy: one unaligned 16-byte move brings `to` up to a 16-byte
 * boundary, one unaligned 16-byte move covers the ragged tail, and the
 * remaining multiple of 16 bytes is copied in 32-byte steps followed by
 * at most one 16-byte block.
 *
 * Returns `to` (the original destination), like memcpy().  The regions
 * must not overlap.
 */
void *
memcpy_simple_movusb (void *to, const void *from, size_t n)
{
	unsigned char *dst = to;
	const unsigned char *src = from;
	size_t size;

#define STEP 0x20
#define ALIGN 0x10
	/*
	 * The head/tail fix-ups below always move a full 16 bytes, so they
	 * are only safe when the request itself is at least that large.
	 */
	if (n < ALIGN) {
		while (n-- > 0)
			*dst++ = *src++;
		return to;
	}

	if ((unsigned long)dst & (ALIGN-1)) {
		size = ALIGN - ((unsigned long)dst & (ALIGN-1));
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     :
				     : "r" (src),
				     "r" (dst)
				     : "memory", "xmm0");
		n -= size;
		src += size;
		dst += size;
	}
/*
 * If the copy would have tailings, take care of them
 * now instead of later.  The move covers the last 16 bytes of the
 * request; it is addressed from the end so that the offset cannot
 * underflow when fewer than 16 bytes remain after the head fix-up.
 */
	if (n & (ALIGN-1)) {
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     :
				     : "r" (src + n - ALIGN),
				     "r" (dst + n - ALIGN)
				     : "memory", "xmm0");
		n &= ~(size_t)(ALIGN-1);
	}

	while (n >= STEP) {
		__asm__ __volatile__(
			"movups 0x00(%0),%%xmm0\n\t"
			"movups 0x10(%0),%%xmm1\n\t"
			"movups %%xmm0,0x00(%1)\n\t"
			"movups %%xmm1,0x10(%1)\n\t"
			: 
			: "r" (src), "r" (dst)
			: "memory", "xmm0", "xmm1");
		src += STEP;
		dst += STEP;
		n -= STEP;
	}

	/*
	 * n is a multiple of 16 here, so an odd multiple leaves exactly one
	 * 16-byte block that the 32-byte loop cannot take.
	 */
	if (n >= ALIGN) {
		__asm__ __volatile__(
			"movups (%0),%%xmm0\n\t"
			"movups %%xmm0,(%1)\n\t"
			:
			: "r" (src), "r" (dst)
			: "memory", "xmm0");
	}
	
	return to;
}


/*
 * From Linux 2.4.8.  I think this must be aligned.
 *
 * memcpy_mmx -- copy `len` bytes from `from` to `to`, moving whole
 * 64-byte chunks through MMX registers %mm0-%mm3 with movq and handing
 * the remaining (len & 63) bytes to memcpy().
 *
 * Returns `to` (the original destination), like memcpy().
 *
 * NOTE: no emms is executed, so the x87/MMX state is left in MMX mode on
 * return; this file's header comment forbids floating-point code for
 * that reason.
 */
void *
memcpy_mmx (void *to, const void *from, size_t len)
{
	unsigned char *dst = to;
	const unsigned char *src = from;
	size_t chunks;

	/* size_t counter: an int would be compared against an unsigned count */
	for (chunks = len / 64; chunks > 0; chunks--) {
		__asm__ __volatile__ (
		   "movq (%0), %%mm0\n"
		   "\tmovq 8(%0), %%mm1\n"
		   "\tmovq 16(%0), %%mm2\n"
		   "\tmovq 24(%0), %%mm3\n"
		   "\tmovq %%mm0, (%1)\n"
		   "\tmovq %%mm1, 8(%1)\n"
		   "\tmovq %%mm2, 16(%1)\n"
		   "\tmovq %%mm3, 24(%1)\n"
		   "\tmovq 32(%0), %%mm0\n"
		   "\tmovq 40(%0), %%mm1\n"
		   "\tmovq 48(%0), %%mm2\n"
		   "\tmovq 56(%0), %%mm3\n"
		   "\tmovq %%mm0, 32(%1)\n"
		   "\tmovq %%mm1, 40(%1)\n"
		   "\tmovq %%mm2, 48(%1)\n"
		   "\tmovq %%mm3, 56(%1)\n"
		   : : "r" (src), "r" (dst)
		   : "memory", "mm0", "mm1", "mm2", "mm3");
		src += 64;
		dst += 64;
	}

	/* tail shorter than one 64-byte chunk */
	if (len & 63)
		memcpy(dst, src, len & 63);

	return to;
}

/*
 * Print one result row: `msg` left-justified in a 50-column field,
 * followed by `t` (microseconds) rendered as seconds with six decimal
 * places.  `loops` is accepted but not used.
 */
static void print_time (char const *msg, 
			long long loops,
			long t)
{
	const long usec_per_sec = 1000000;
	long whole_seconds = t / usec_per_sec;
	long leftover_usec = t % usec_per_sec;

	printf("      %-50s %ld.%06ld s\n", msg, whole_seconds,
	       leftover_usec);
}

/*
 * memcpy_arjanv -- copy `len` bytes from `from` to `to` in 64-byte chunks
 * using MMX movq loads and stores, issuing prefetchnta on the source
 * ahead of the copy (five lines up front, then 320 bytes ahead inside the
 * loop).  The remaining (len & 63) bytes go through memcpy().
 *
 * Returns `to` (the original destination), like memcpy().
 *
 * NOTE: no emms is executed, so the x87/MMX state is left in MMX mode on
 * return.
 */
void *
memcpy_arjanv (void *to, const void *from, size_t len)
{
	unsigned char *dst = to;
	const unsigned char *src = from;
	size_t chunks;

	__asm__ __volatile__ (
		"   prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		"   prefetchnta 256(%0)\n"
		: : "r" (src) );

	/* size_t counter: an int would be compared against an unsigned count */
	for (chunks = len / 64; chunks > 0; chunks--) {
		__asm__ __volatile__ (
			"   prefetchnta 320(%0)\n"
			"   movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			: : "r" (src), "r" (dst)
			: "memory", "mm0", "mm1", "mm2", "mm3");
		src += 64;
		dst += 64;
	}

	/*
	 * Now do the tail of the block
	 */
	if (len & 63)
		memcpy(dst, src, len & 63);

	return to;
}

/*
 * memcpy_arjanv_movntq -- copy `len` bytes from `from` to `to` in 64-byte
 * chunks: eight movq loads into %mm0-%mm7 followed by eight non-temporal
 * movntq stores, with prefetchnta on the source (four lines up front,
 * then 200 bytes ahead inside the loop).  The remaining (len & 63) bytes
 * go through memcpy().
 *
 * Returns `to` (the original destination), like memcpy().
 *
 * NOTE: neither emms nor sfence is executed.  The x87/MMX state is left
 * in MMX mode, and a caller that hands the buffer to another CPU or
 * device must fence first.
 */
void *
memcpy_arjanv_movntq (void *to, const void *from, size_t len)
{
	unsigned char *dst = to;
	const unsigned char *src = from;
	size_t chunks;

	__asm__ __volatile__ (
		"   prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (src) );

	/* size_t counter: an int would be compared against an unsigned count */
	for (chunks = len / 64; chunks > 0; chunks--) {
		__asm__ __volatile__ (
			"   prefetchnta 200(%0)\n"
			"   movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq 32(%0), %%mm4\n"
			"   movq 40(%0), %%mm5\n"
			"   movq 48(%0), %%mm6\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm0, (%1)\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movntq %%mm7, 56(%1)\n"
			: : "r" (src), "r" (dst)
			: "memory", "mm0", "mm1", "mm2", "mm3",
			  "mm4", "mm5", "mm6", "mm7");
		src += 64;
		dst += 64;
	}
	/*
	 * Now do the tail of the block
	 */
	if (len & 63)
		memcpy(dst, src, len & 63);
	
	return to;
}

/*
 * memcpy_arjanv_interleave -- copy `len` bytes from `from` to `to` in
 * 64-byte chunks, alternating each movq load with the non-temporal movntq
 * store of the same 8 bytes, with prefetchnta on the source (four lines
 * up front, then 168 bytes ahead inside the loop).  The remaining
 * (len & 63) bytes go through memcpy().
 *
 * Returns `to` (the original destination), like memcpy().
 *
 * NOTE: neither emms nor sfence is executed.  The x87/MMX state is left
 * in MMX mode, and a caller that hands the buffer to another CPU or
 * device must fence first.
 */
void *
memcpy_arjanv_interleave (void *to, const void *from, size_t len)
{
	unsigned char *dst = to;
	const unsigned char *src = from;
	size_t chunks;

	__asm__ __volatile__ (
		"   prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (src) );


	/* size_t counter: an int would be compared against an unsigned count */
	for (chunks = len / 64; chunks > 0; chunks--) {
		__asm__ __volatile__ (
			"   prefetchnta 168(%0)\n"
			"   movq (%0), %%mm0\n"
			"   movntq %%mm0, (%1)\n"
			"   movq 8(%0), %%mm1\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movq 16(%0), %%mm2\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movq 24(%0), %%mm3\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm4\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movq 40(%0), %%mm5\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movq 48(%0), %%mm6\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm7, 56(%1)\n"
			: : "r" (src), "r" (dst)
			: "memory", "mm0", "mm1", "mm2", "mm3",
			  "mm4", "mm5", "mm6", "mm7");
		src += 64;
		dst += 64;
	}
	/*
	 * Now do the tail of the block
	 */
	if (len & 63)
		memcpy(dst, src, len & 63);
	
	return to;
}

/*
 * Time one copy routine and print the result.
 *
 *   p1    - destination buffer handed to `bfn`
 *   p2    - source buffer handed to `bfn`; filled with the byte 42 first
 *   size  - number of bytes copied per call
 *   loops - number of times `bfn` is called inside the timed region
 *   bfn   - memcpy-compatible routine under test
 *   msg   - label printed next to the measured time
 *
 * Only the calls to `bfn` are timed; the memset and the printing fall
 * outside the start_timer()/end_timer() pair.
 */
static void wrap (char *p1, 
		  char *p2,
		  size_t size,
		  long loops,
		  void *(*bfn) (void *, const void *, size_t),
		  const char *msg)
{
	long t;
	long i;		/* same width as `loops`, so the comparison cannot overflow */

	memset(p2,42,size);

	start_timer();

	for (i=0; i<loops; i++)
		bfn (p1, p2, size);

	t = end_timer();

	print_time (msg, loops, t);
}

/*
 * Benchmark every copy routine for one block size.
 *
 * Two buffers of `size` bytes (plus 64 bytes of slack for the unaligned
 * runs) are allocated, and each routine is asked to copy 1 GiB in total
 * (1024*1024*1024 / size calls) in three configurations: both buffers
 * aligned, source offset by 4, and destination/source offset by 10/13.
 * One line per routine is printed via wrap().
 *
 * A `size` of 0 is ignored; an allocation failure is reported with
 * perror() and the size is skipped.
 */
static void memcpy_test(size_t size)
{
	long loops;
	char *raw1, *raw2;
	char *p1, *p2;

	if (size == 0)
		return;		/* would divide by zero below */

	loops = 1024*1024*1024 / size;

	/* We need to make sure the blocks are *VERY* aligned, because
	   MMX is potentially pretty fussy.  malloc() alone does not
	   promise that, so over-allocate by 63 bytes and round each
	   block up to the next 64-byte boundary. */

	raw1 = (char *) malloc (size+64+63);
	raw2 = (char *) malloc (size+64+63);
	if (raw1 == NULL || raw2 == NULL) {
		perror ("malloc");
		free(raw1); free(raw2);
		return;
	}
	p1 = raw1 + ((64 - ((unsigned long)raw1 & 63)) & 63);
	p2 = raw2 + ((64 - ((unsigned long)raw2 & 63)) & 63);

	printf("addr1=%p addr2=%p\n", (void *)p1, (void *)p2);

	/* size_t is not int: cast explicitly to match the conversion */
	if (size > 2048)
		printf ("memcpy %lukB -- %ld loops\n",
			(unsigned long)(size>>10), loops);
	else
		printf ("memcpy %luB -- %ld loops\n",
			(unsigned long)size, loops);


	printf ("  aligned blocks\n");

	wrap (p1, p2, size, loops, memcpy, "libc memcpy");
	wrap (p1, p2, size, loops, memcpy_mmx,
		"MMX memcpy using MOVQ");
	wrap(p1, p2, size, loops, memcpy_movusb,
		"with mingo's MOVUSB (prefetch, non-temporal)");
	wrap (p1, p2, size, loops, memcpy_simple_movusb,
	      "with simple MOVUSB (no prefetch)");
	wrap (p1, p2, size, loops, memcpy_arjanv,
	      "arjanv's MOVQ (with prefetch)");
	wrap (p1, p2, size, loops, memcpy_arjanv_movntq,
	      "arjanv's MOVNTQ (with prefetch, for Athlon)");
	wrap (p1, p2, size, loops, memcpy_arjanv_interleave,
	      "arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA");

	printf ("  +0/+4 moderately unaligned blocks\n");

	wrap (p1, p2+4, size, loops, memcpy, "libc memcpy");
	wrap (p1, p2+4, size, loops, memcpy_mmx,
		"MMX memcpy using MOVQ");
	wrap(p1, p2+4, size, loops, memcpy_movusb,
		"with mingo's MOVUSB (prefetch, non-temporal)");
	wrap (p1, p2+4, size, loops, memcpy_simple_movusb,
	      "with simple MOVUSB (no prefetch)");

	printf ("  +10/+13 cruelly unaligned blocks\n");

	wrap (p1+10, p2+13, size, loops, memcpy, "libc memcpy");
	wrap (p1+10, p2+13, size, loops, memcpy_mmx,
		"MMX memcpy using MOVQ");
	wrap(p1+10, p2+13, size, loops, memcpy_movusb,
		"with mingo's MOVUSB (prefetch, non-temporal)");
	wrap (p1+10, p2+13, size, loops, memcpy_simple_movusb,
	      "with simple MOVUSB (no prefetch)");

	puts("");

	/* free the pointers malloc() returned, not the aligned ones */
	free(raw1); free(raw2);
}


/*
 * Run the benchmark for every power-of-two block size from 1<<6 (64 bytes)
 * up to 1<<20 (1 MiB), smallest first.
 */
int main (void)
{
	int shift;

	for (shift = 6; shift <= 20; shift++)
		memcpy_test(1 << shift);

	return 0;
}

--mYCpIKhGyMATD0i+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="locore.s.diff"

Index: locore.s
===================================================================
RCS file: /usr/local/cvs/moguawin/sys/arch/i386/i386/locore.s,v
retrieving revision 1.5
diff -u -r1.5 locore.s
--- locore.s	2002/10/10 03:59:38	1.5
+++ locore.s	2002/10/15 18:55:38
@@ -951,7 +951,7 @@
 #define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
 #elif defined(I686_CPU)
 #define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
-#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
+#define	DEFAULT_COPYIN		_C_LABEL(i686_copyin)	/* XXX */
 #endif
 
 	.data
@@ -1159,6 +1159,103 @@
 	xorl	%eax,%eax
 	ret
 #endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
+
+#if defined(I686_CPU)
+/* LINTSTUB: Func: int i686_copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(i686_copyin)	/* copy len bytes from uaddr to kaddr: 64-byte MMX chunks, then a rep-movs tail; returns 0 in %eax, or EFAULT via i686_copy_efault */
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebx
+	GET_CURPCB(%eax)	/* macro defined outside this hunk; presumably %eax = current PCB */
+	movl	$_C_LABEL(i686_copy_fault),PCB_ONFAULT(%eax)	/* presumably the fault handler resumes at i686_copy_fault -- confirm */
+	
+	movl	16(%esp),%eax	/* %eax = uaddr: first argument, above three saved registers and the return address */
+	movl	20(%esp),%ecx	/* %ecx = kaddr */
+	movl	24(%esp),%esi	/* %esi = len */
+
+	/*
+	 * Check that the end of the user source range (uaddr + len) neither
+	 * wraps nor lies past VM_MAXUSER_ADDRESS.  If it passes, only per-page
+	 * readability remains to be checked, and the CPU will do that for us.
+	 */
+	movl	%eax,%edx
+	addl	%esi,%edx	/* %edx = uaddr + len */
+	jc	_C_LABEL(i686_copy_efault)	/* carry: the sum wrapped around */
+	cmpl	$VM_MAXUSER_ADDRESS,%edx
+	ja	_C_LABEL(i686_copy_efault)	/* unsigned compare: end is above the limit */
+
+	xorl	%ebx,%ebx	/* %ebx = number of 64-byte chunks copied so far */
+	movl	%esi,%edx
+	shrl	$6,%edx		/* %edx = len / 64 = number of whole chunks */
+	cmpl	%edx,%ebx
+	jae	2f		/* no whole chunk: go straight to the tail */
+
+1:	/* 64-byte chunk loop; NOTE(review): %mm0-%mm3 are used with no emms or FPU-state save visible in this routine */
+	movq 	(%eax),%mm0
+	movq	8(%eax),%mm1
+	movq	16(%eax),%mm2
+	movq	24(%eax),%mm3
+	movq	%mm0,(%ecx)
+	movq	%mm1,8(%ecx)
+	movq	%mm2,16(%ecx)
+	movq	%mm3,24(%ecx)
+	movq	32(%eax),%mm0
+	movq	40(%eax),%mm1
+	movq	48(%eax),%mm2
+	movq	56(%eax),%mm3
+	movq	%mm0,32(%ecx)
+	movq	%mm1,40(%ecx)
+	movq	%mm2,48(%ecx)
+	movq	%mm3,56(%ecx)
+
+	addl	$64,%eax	/* advance source */
+	addl	$64,%ecx	/* advance destination */
+	incl	%ebx
+	cmpl	%edx,%ebx
+	jb	1b		/* unsigned: loop while chunks copied < chunk count */
+
+2:	/* tail: the remaining len & 63 bytes */
+	movl	%esi,%edx
+	andl	$63,%edx	/* %edx = len % 64; also sets ZF for the je below */
+	je	3f		/* nothing left over */
+
+	movl	%eax,%esi	/* %esi = current source, as the string moves expect */
+	movl	%edx,%eax	/* %eax = remaining byte count */
+	movl	%ecx,%edi	/* %edi = current destination */
+
+	/* bcopy(%esi, %edi, %eax); */
+	cld			/* string moves run forward */
+	movl	%eax,%ecx
+	shrl	$2,%ecx		/* %ecx = number of whole 32-bit words */
+	rep
+	movsl
+	movb	%al,%cl		/* %ecx is 0 after rep, so this leaves %ecx = count & 0xff */
+	andb	$3,%cl		/* %ecx = count % 4 trailing bytes */
+	rep
+	movsb
+
+3:	/* success: return 0 and clear the fault hook */
+	GET_CURPCB(%edx)
+	xorl	%eax,%eax	/* return value 0 */
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	movl	%eax,PCB_ONFAULT(%edx)	/* %eax is 0 here: clears PCB_ONFAULT */
+	ret
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_efault)
+	movl	$EFAULT,%eax	/* return EFAULT, then fall through into i686_copy_fault */
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_fault)
+	GET_CURPCB(%edx)
+	movl	%eax,PCB_ONFAULT(%edx)	/* NOTE(review): stores the error code in %eax, not 0 as the success path does -- confirm this is intended */
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	ret			/* %eax still holds the error code */
+#endif /* I686_CPU */
 
 /* LINTSTUB: Ignore */
 NENTRY(copy_efault)

--mYCpIKhGyMATD0i+--

댓글