This post was taken from http://mail-index.netbsd.org/tech-perform/2002/10/16/0000.html.
Subject: Performance of various memcpy()'s
To: None <tech-perform@netbsd.org>
From: Bang Jun-Young <junyoung@mogua.com>
List: tech-perform
Date: 10/16/2002 04:18:30
Hi,

About 14 months ago, I had some discussion here about memcpy() performance
on the i386 platform. Months later, I took a look into it again, and I'm
now back with (not-so-)new benchmark results (attached). The tests were
performed on an Athlon XP 1800 with 256 MB of DDR memory.

From the results, it's obvious that a memcpy() using MMX instructions is
the best choice for in-cache sized data: typically 50-100% faster than the
plain old memcpy() for data <= 32 kB.

Also attached is a patch with an i686 version of copyin(9) that makes use
of MMX instructions. It works well with integer-only programs, but not with
ones like XFree86 that use FP ops. In that case, it would be helpful to
import the NPX handling code from FreeBSD (they have an i586-optimized
version of copyin(9)/copyout(9)). Can anybody give me some comments on
this?

Jun-Young

--
Bang Jun-Young <junyoung@mogua.com>
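[Editorial note: the FP breakage Jun-Young mentions comes from the fact
that the MMX registers alias the x87 floating-point register stack, so a
kernel copyin() that uses MMX without saving and restoring the FPU state
corrupts the FP context of processes like XFree86. The following userland
demo of the aliasing effect is not part of the original mail; it assumes a
32-bit x87 build (e.g. gcc -m32 -mno-sse demo.c):

#include <stdio.h>

int
main(void)
{
	volatile double x = 1.5, y;

	/*
	 * The MMX registers alias the x87 stack; a single MMX
	 * instruction marks the whole x87 register file as in use.
	 */
	__asm__ __volatile__("pcmpeqb %%mm0,%%mm0" : : : "mm0");

	/*
	 * x87 code executed now sees a full, aliased register stack
	 * and computes garbage (typically NaN).
	 */
	y = x * 2.0;
	printf("before emms: %f\n", y);

	/* EMMS returns the register file to a clean FP state. */
	__asm__ __volatile__("emms");
	y = x * 2.0;
	printf("after emms:  %f\n", y);
	return 0;
}

On an SSE or x86-64 build the compiler keeps doubles in XMM registers, so
the corruption would not show up this way.]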
[attachment: memcpy_bench.txt]

addr1=0x804c000 addr2=0x804c080
memcpy 64B -- 16777216 loops
  aligned blocks
    libc memcpy                                        0.796562 s
    MMX memcpy using MOVQ                              0.332473 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.437861 s
    with simple MOVUSB (no prefetch)                   0.477142 s
    arjanv's MOVQ (with prefetch)                      0.397613 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.386256 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.468275 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.794225 s
    MMX memcpy using MOVQ                              0.408814 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.469252 s
    with simple MOVUSB (no prefetch)                   0.542820 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.931550 s
    MMX memcpy using MOVQ                              0.465778 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.556663 s
    with simple MOVUSB (no prefetch)                   0.545896 s

addr1=0x804c000 addr2=0x804c100
memcpy 128B -- 8388608 loops
  aligned blocks
    libc memcpy                                        0.511865 s
    MMX memcpy using MOVQ                              0.233085 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.314226 s
    with simple MOVUSB (no prefetch)                   0.363533 s
    arjanv's MOVQ (with prefetch)                      0.266980 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.255603 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.273115 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.513906 s
    MMX memcpy using MOVQ                              0.295375 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.351025 s
    with simple MOVUSB (no prefetch)                   0.412870 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.647541 s
    MMX memcpy using MOVQ                              0.381870 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.403322 s
    with simple MOVUSB (no prefetch)                   0.421661 s

addr1=0x804c000 addr2=0x804c200
memcpy 256B -- 4194304 loops
  aligned blocks
    libc memcpy                                        0.380581 s
    MMX memcpy using MOVQ                              0.173247 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.252765 s
    with simple MOVUSB (no prefetch)                   0.320588 s
    arjanv's MOVQ (with prefetch)                      0.196009 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.211234 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.198807 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.379022 s
    MMX memcpy using MOVQ                              0.241409 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.295835 s
    with simple MOVUSB (no prefetch)                   0.388839 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.505536 s
    MMX memcpy using MOVQ                              0.343646 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.318098 s
    with simple MOVUSB (no prefetch)                   0.359642 s

addr1=0x804c000 addr2=0x804c400
memcpy 512B -- 2097152 loops
  aligned blocks
    libc memcpy                                        0.309567 s
    MMX memcpy using MOVQ                              0.161895 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.259866 s
    with simple MOVUSB (no prefetch)                   0.299634 s
    arjanv's MOVQ (with prefetch)                      0.171824 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.204493 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.159063 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.310000 s
    MMX memcpy using MOVQ                              0.210169 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.293950 s
    with simple MOVUSB (no prefetch)                   0.347535 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.434541 s
    MMX memcpy using MOVQ                              0.318089 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.301053 s
    with simple MOVUSB (no prefetch)                   0.350758 s

addr1=0x804c000 addr2=0x804c800
memcpy 1024B -- 1048576 loops
  aligned blocks
    libc memcpy                                        0.276199 s
    MMX memcpy using MOVQ                              0.170408 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.232004 s
    with simple MOVUSB (no prefetch)                   0.274786 s
    arjanv's MOVQ (with prefetch)                      0.168275 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.192419 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.157286 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.276402 s
    MMX memcpy using MOVQ                              0.208041 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.264838 s
    with simple MOVUSB (no prefetch)                   0.321226 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.399037 s
    MMX memcpy using MOVQ                              0.317386 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.269808 s
    with simple MOVUSB (no prefetch)                   0.323063 s

addr1=0x804c000 addr2=0x804f000
memcpy 2048B -- 524288 loops
  aligned blocks
    libc memcpy                                        0.259386 s
    MMX memcpy using MOVQ                              0.164728 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.216017 s
    with simple MOVUSB (no prefetch)                   0.262353 s
    arjanv's MOVQ (with prefetch)                      0.160822 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.188910 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.148048 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.259205 s
    MMX memcpy using MOVQ                              0.194549 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.264177 s
    with simple MOVUSB (no prefetch)                   0.308492 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.381286 s
    MMX memcpy using MOVQ                              0.306385 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.256044 s
    with simple MOVUSB (no prefetch)                   0.309575 s

addr1=0x8050000 addr2=0x8052000
memcpy 4kB -- 262144 loops
  aligned blocks
    libc memcpy                                        0.251069 s
    MMX memcpy using MOVQ                              0.161883 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.250987 s
    with simple MOVUSB (no prefetch)                   0.256146 s
    arjanv's MOVQ (with prefetch)                      0.251169 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.256027 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.207190 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.250998 s
    MMX memcpy using MOVQ                              0.188332 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.267415 s
    with simple MOVUSB (no prefetch)                   0.301825 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.372422 s
    MMX memcpy using MOVQ                              0.300877 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.264033 s
    with simple MOVUSB (no prefetch)                   0.302476 s

addr1=0x804f000 addr2=0x8054000
memcpy 8kB -- 131072 loops
  aligned blocks
    libc memcpy                                        0.246683 s
    MMX memcpy using MOVQ                              0.160469 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.223785 s
    with simple MOVUSB (no prefetch)                   0.253043 s
    arjanv's MOVQ (with prefetch)                      0.198100 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.220333 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.165994 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.246569 s
    MMX memcpy using MOVQ                              0.184975 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.244896 s
    with simple MOVUSB (no prefetch)                   0.298646 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.367977 s
    MMX memcpy using MOVQ                              0.298119 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.245495 s
    with simple MOVUSB (no prefetch)                   0.298924 s

addr1=0x804f000 addr2=0x8057000
memcpy 16kB -- 65536 loops
  aligned blocks
    libc memcpy                                        0.246980 s
    MMX memcpy using MOVQ                              0.159769 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.274188 s
    with simple MOVUSB (no prefetch)                   0.251510 s
    arjanv's MOVQ (with prefetch)                      0.174101 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.278145 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.306673 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.247309 s
    MMX memcpy using MOVQ                              0.183421 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.460254 s
    with simple MOVUSB (no prefetch)                   0.297058 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.368122 s
    MMX memcpy using MOVQ                              0.296768 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.299830 s
    with simple MOVUSB (no prefetch)                   0.297180 s

addr1=0x804f000 addr2=0x805c000
memcpy 32kB -- 32768 loops
  aligned blocks
    libc memcpy                                        0.246418 s
    MMX memcpy using MOVQ                              0.161774 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.291646 s
    with simple MOVUSB (no prefetch)                   0.252990 s
    arjanv's MOVQ (with prefetch)                      0.168720 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.220957 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.279949 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.253483 s
    MMX memcpy using MOVQ                              0.189459 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.330665 s
    with simple MOVUSB (no prefetch)                   0.299876 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.371691 s
    MMX memcpy using MOVQ                              0.280076 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.325644 s
    with simple MOVUSB (no prefetch)                   0.299598 s

addr1=0x805f000 addr2=0x8070000
memcpy 64kB -- 16384 loops
  aligned blocks
    libc memcpy                                        0.557651 s
    MMX memcpy using MOVQ                              0.484263 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.513905 s
    with simple MOVUSB (no prefetch)                   0.504620 s
    arjanv's MOVQ (with prefetch)                      0.481128 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.514562 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.513256 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.573247 s
    MMX memcpy using MOVQ                              0.577181 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.511285 s
    with simple MOVUSB (no prefetch)                   0.596480 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.742978 s
    MMX memcpy using MOVQ                              0.657358 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.523992 s
    with simple MOVUSB (no prefetch)                   0.550855 s

addr1=0x805f000 addr2=0x8080000
memcpy 128kB -- 8192 loops
  aligned blocks
    libc memcpy                                        0.557362 s
    MMX memcpy using MOVQ                              0.480659 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.512456 s
    with simple MOVUSB (no prefetch)                   0.503718 s
    arjanv's MOVQ (with prefetch)                      0.477681 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.512652 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.511952 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        0.568122 s
    MMX memcpy using MOVQ                              0.575231 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.511856 s
    with simple MOVUSB (no prefetch)                   0.594627 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        0.742519 s
    MMX memcpy using MOVQ                              0.642598 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.514444 s
    with simple MOVUSB (no prefetch)                   0.549701 s

addr1=0x805f000 addr2=0x80a0000
memcpy 256kB -- 4096 loops
  aligned blocks
    libc memcpy                                        3.312519 s
    MMX memcpy using MOVQ                              2.991133 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.511574 s
    with simple MOVUSB (no prefetch)                   3.132933 s
    arjanv's MOVQ (with prefetch)                      2.855973 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        0.511921 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  0.511443 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        3.287035 s
    MMX memcpy using MOVQ                              3.142660 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.511720 s
    with simple MOVUSB (no prefetch)                   3.266173 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        3.434399 s
    MMX memcpy using MOVQ                              3.422942 s
    with mingo's MOVUSB (prefetch, non-temporal)       0.514526 s
    with simple MOVUSB (no prefetch)                   3.203671 s

addr1=0x805f000 addr2=0x80e0000
memcpy 512kB -- 2048 loops
  aligned blocks
    libc memcpy                                        3.318688 s
    MMX memcpy using MOVQ                              2.991518 s
    with mingo's MOVUSB (prefetch, non-temporal)       2.113506 s
    with simple MOVUSB (no prefetch)                   3.136624 s
    arjanv's MOVQ (with prefetch)                      2.856525 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        1.760890 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  1.892791 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        3.288139 s
    MMX memcpy using MOVQ                              3.144040 s
    with mingo's MOVUSB (prefetch, non-temporal)       1.998237 s
    with simple MOVUSB (no prefetch)                   3.270226 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        3.441033 s
    MMX memcpy using MOVQ                              3.426393 s
    with mingo's MOVUSB (prefetch, non-temporal)       2.332106 s
    with simple MOVUSB (no prefetch)                   3.209592 s

addr1=0x805f000 addr2=0x8160000
memcpy 1024kB -- 1024 loops
  aligned blocks
    libc memcpy                                        3.158626 s
    MMX memcpy using MOVQ                              2.801466 s
    with mingo's MOVUSB (prefetch, non-temporal)       1.963610 s
    with simple MOVUSB (no prefetch)                   2.986171 s
    arjanv's MOVQ (with prefetch)                      2.656920 s
    arjanv's MOVNTQ (with prefetch, for Athlon)        1.601385 s
    arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA  1.727029 s
  +0/+4 moderately unaligned blocks
    libc memcpy                                        2.989358 s
    MMX memcpy using MOVQ                              2.831822 s
    with mingo's MOVUSB (prefetch, non-temporal)       1.909301 s
    with simple MOVUSB (no prefetch)                   3.057700 s
  +10/+13 cruelly unaligned blocks
    libc memcpy                                        3.195734 s
    MMX memcpy using MOVQ                              3.108697 s
    with mingo's MOVUSB (prefetch, non-temporal)       2.108903 s
    with simple MOVUSB (no prefetch)                   3.039293 s
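[Editorial note: each run copies size * loops = 1 GiB in total (loops is
computed as 1024*1024*1024 / size in the benchmark source below), so a
timing of t seconds converts to throughput as 1024/t MiB/s. A small helper,
not part of the original mail:

#include <stdio.h>

/*
 * Every benchmark run copies 1 GiB in total, so throughput is
 * simply 1 GiB divided by the measured time.
 */
static double
mib_per_sec(double seconds)
{
	return 1024.0 / seconds;
}

int
main(void)
{
	/* Three timings taken from the tables above. */
	printf("libc memcpy, 64B aligned:    %.0f MiB/s\n", mib_per_sec(0.796562));
	printf("MMX MOVQ,    64B aligned:    %.0f MiB/s\n", mib_per_sec(0.332473));
	printf("libc memcpy, 256kB aligned:  %.0f MiB/s\n", mib_per_sec(3.312519));
	return 0;
}

This gives about 1286 MiB/s for libc memcpy versus 3080 MiB/s for the MMX
MOVQ copy in the 64B in-cache case, and shows the fall off the cache cliff
at 256 kB, where libc memcpy drops to about 309 MiB/s.]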
[attachment: memcpy_bench.c]

/* -*- c-file-style: "linux" -*- */
/* memcpy speed benchmark using different i86-specific routines.
 *
 * Framework (C) 2001 by Martin Pool <mbp@samba.org>, based on speed.c
 * by tridge.
 *
 * Routines lifted from all kinds of places.
 *
 * You must not use floating-point code anywhere in this application
 * because it scribbles on the FP state and does not reset it.
 */

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/resource.h>

#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))

struct rusage tp1, tp2;

static void start_timer(void)
{
	getrusage(RUSAGE_SELF, &tp1);
}

static long end_timer(void)
{
	getrusage(RUSAGE_SELF, &tp2);
#if 0
	printf("tp1 = %ld.%05ld, tp2 = %ld.%05ld\n",
	       (long) tp1.ru_utime.tv_sec, (long) tp1.ru_utime.tv_usec,
	       (long) tp2.ru_utime.tv_sec, (long) tp2.ru_utime.tv_usec);
#endif
	return ((tp2.ru_utime.tv_sec - tp1.ru_utime.tv_sec) * 1000000 +
		(tp2.ru_utime.tv_usec - tp1.ru_utime.tv_usec));
}
/*
 * By Ingo Molnar and Doug Ledford; hacked up to remove
 * kernel-specific stuff like saving/restoring float registers.
 *
 * http://people.redhat.com/mingo/mmx-patches/mmx-2.3.99-A0
 */
void *
memcpy_movusb(void *to, const void *from, size_t n)
{
	size_t size;

#define STEP 0x20
#define ALIGN 0x10

	if ((unsigned long)to & (ALIGN-1)) {
		size = ALIGN - ((unsigned long)to & (ALIGN-1));
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     : : "r" (from), "r" (to));
		n -= size;
		from += size;
		to += size;
	}

	/*
	 * If the copy would have tailings, take care of them
	 * now instead of later.
	 */
	if (n & (ALIGN-1)) {
		size = n - ALIGN;
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     : : "r" (from + size), "r" (to + size));
		n &= ~(ALIGN-1);
	}

	/*
	 * Prefetch the first two cachelines now.
	 */
	__asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
			     "prefetchnta 0x20(%0)\n\t"
			     : : "r" (from));

	while (n >= STEP) {
		__asm__ __volatile__(
			"movups 0x00(%0),%%xmm0\n\t"
			"movups 0x10(%0),%%xmm1\n\t"
			"movntps %%xmm0,0x00(%1)\n\t"
			"movntps %%xmm1,0x10(%1)\n\t"
			: : "r" (from), "r" (to) : "memory");
		from += STEP;
		/*
		 * Note: Intermixing the prefetch at *exactly* this point
		 * in time has been shown to be the fastest possible.
		 * Timing these prefetch instructions is a complete black
		 * art with nothing but trial and error showing the way.
		 * To that extent, this optimum version was found by using
		 * a userland version of this routine that we clocked for
		 * lots of runs.  We then fiddled with ordering until we
		 * settled on our highest speed routines.  So, the long
		 * and short of this is, don't mess with instruction
		 * ordering here or suffer performance penalties you will.
		 */
		__asm__ __volatile__("prefetchnta 0x20(%0)\n\t"
				     : : "r" (from));
		to += STEP;
		n -= STEP;
	}
	return to;
}

void *
memcpy_simple_movusb(void *to, const void *from, size_t n)
{
	size_t size;

	if ((unsigned long)to & (ALIGN-1)) {
		size = ALIGN - ((unsigned long)to & (ALIGN-1));
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     : : "r" (from), "r" (to));
		n -= size;
		from += size;
		to += size;
	}

	/*
	 * If the copy would have tailings, take care of them
	 * now instead of later.
	 */
	if (n & (ALIGN-1)) {
		size = n - ALIGN;
		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
				     "movups %%xmm0,(%1)\n\t"
				     : : "r" (from + size), "r" (to + size));
		n &= ~(ALIGN-1);
	}

	while (n >= STEP) {
		__asm__ __volatile__(
			"movups 0x00(%0),%%xmm0\n\t"
			"movups 0x10(%0),%%xmm1\n\t"
			"movups %%xmm0,0x00(%1)\n\t"
			"movups %%xmm1,0x10(%1)\n\t"
			: : "r" (from), "r" (to) : "memory");
		from += STEP;
		to += STEP;
		n -= STEP;
	}
	return to;
}
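[Editorial note: both MOVUSB routines rely on an overlap trick for the
unaligned head and tail: they copy one full 16-byte chunk with an unaligned
MOVUPS, then advance only far enough to align the destination, so the
aligned main loop rewrites a few of the same bytes. A standalone C sketch
of the same idea, not from the original mail (the name overlap_copy is
hypothetical, and plain memcpy stands in for the SSE loop):

#include <string.h>

#define CHUNK 16

static void *
overlap_copy(void *to, const void *from, size_t n)
{
	char *d = to;
	const char *s = from;

	if (n < CHUNK) {		/* too small for the trick */
		memcpy(d, s, n);
		return to;
	}
	if ((unsigned long)d & (CHUNK - 1)) {
		/*
		 * Unaligned head: copy one full chunk, then advance only
		 * as far as needed to align the destination, so the next
		 * write overlaps this one.  Rewriting the same source
		 * bytes twice is harmless.
		 */
		size_t head = CHUNK - ((unsigned long)d & (CHUNK - 1));
		memcpy(d, s, CHUNK);
		d += head;
		s += head;
		n -= head;
	}
	if (n < CHUNK) {		/* remainder fits in one chunk */
		memcpy(d, s, n);
		return to;
	}
	if (n & (CHUNK - 1)) {
		/*
		 * Unaligned tail: copy the last full chunk, overlapping
		 * the end of the aligned middle region.
		 */
		size_t off = n - CHUNK;
		memcpy(d + off, s + off, CHUNK);
		n &= ~(size_t)(CHUNK - 1);
	}
	/* Aligned middle -- the MOVUPS/MOVNTPS loop in the real code. */
	memcpy(d, s, n);
	return to;
}
]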
/*
 * From Linux 2.4.8.  I think this must be aligned.
 */
void *
memcpy_mmx(void *to, const void *from, size_t len)
{
	int i;

	for (i = 0; i < len / 64; i++) {
		__asm__ __volatile__ (
			"movq (%0), %%mm0\n"
			"\tmovq 8(%0), %%mm1\n"
			"\tmovq 16(%0), %%mm2\n"
			"\tmovq 24(%0), %%mm3\n"
			"\tmovq %%mm0, (%1)\n"
			"\tmovq %%mm1, 8(%1)\n"
			"\tmovq %%mm2, 16(%1)\n"
			"\tmovq %%mm3, 24(%1)\n"
			"\tmovq 32(%0), %%mm0\n"
			"\tmovq 40(%0), %%mm1\n"
			"\tmovq 48(%0), %%mm2\n"
			"\tmovq 56(%0), %%mm3\n"
			"\tmovq %%mm0, 32(%1)\n"
			"\tmovq %%mm1, 40(%1)\n"
			"\tmovq %%mm2, 48(%1)\n"
			"\tmovq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	if (len & 63)
		memcpy(to, from, len & 63);
	return to;
}

static void
print_time(char const *msg, long long loops, long t)
{
	printf("    %-50s %ld.%06ld s\n", msg, t / 1000000, t % 1000000);
}
void *
memcpy_arjanv(void *to, const void *from, size_t len)
{
	int i;

	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		"   prefetchnta 256(%0)\n"
		: : "r" (from));

	for (i = 0; i < len / 64; i++) {
		__asm__ __volatile__ (
			"1: prefetchnta 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/* Now do the tail of the block. */
	if (len & 63)
		memcpy(to, from, len & 63);
	return to;
}

void *
memcpy_arjanv_movntq(void *to, const void *from, size_t len)
{
	int i;

	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (from));

	for (i = 0; i < len / 64; i++) {
		__asm__ __volatile__ (
			"   prefetchnta 200(%0)\n"
			"   movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq 32(%0), %%mm4\n"
			"   movq 40(%0), %%mm5\n"
			"   movq 48(%0), %%mm6\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm0, (%1)\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/* Now do the tail of the block. */
	if (len & 63)
		memcpy(to, from, len & 63);
	return to;
}

void *
memcpy_arjanv_interleave(void *to, const void *from, size_t len)
{
	int i;

	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		: : "r" (from));

	for (i = 0; i < len / 64; i++) {
		__asm__ __volatile__ (
			"   prefetchnta 168(%0)\n"
			"   movq (%0), %%mm0\n"
			"   movntq %%mm0, (%1)\n"
			"   movq 8(%0), %%mm1\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movq 16(%0), %%mm2\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movq 24(%0), %%mm3\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm4\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movq 40(%0), %%mm5\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movq 48(%0), %%mm6\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/* Now do the tail of the block. */
	if (len & 63)
		memcpy(to, from, len & 63);
	return to;
}

static void
wrap(char *p1, char *p2, size_t size, long loops,
     void *(*bfn)(void *, const void *, size_t), const char *msg)
{
	long t;
	int i;

	memset(p2, 42, size);
	start_timer();
	for (i = 0; i < loops; i++)
		bfn(p1, p2, size);
	t = end_timer();
	print_time(msg, loops, t);
}
static void
memcpy_test(size_t size)
{
	long loops = 1024*1024*1024 / size;
	/*
	 * We need to make sure the blocks are *VERY* aligned, because
	 * MMX is potentially pretty fussy.
	 */
	char *p1 = (char *) malloc(size + 64);
	char *p2 = (char *) malloc(size + 64);

	printf("addr1=%p addr2=%p\n", p1, p2);
	if (size > 2048)
		printf("memcpy %dkB -- %ld loops\n", size >> 10, loops);
	else
		printf("memcpy %dB -- %ld loops\n", size, loops);

	printf("  aligned blocks\n");
	wrap(p1, p2, size, loops, memcpy, "libc memcpy");
	wrap(p1, p2, size, loops, memcpy_mmx, "MMX memcpy using MOVQ");
	wrap(p1, p2, size, loops, memcpy_movusb,
	     "with mingo's MOVUSB (prefetch, non-temporal)");
	wrap(p1, p2, size, loops, memcpy_simple_movusb,
	     "with simple MOVUSB (no prefetch)");
	wrap(p1, p2, size, loops, memcpy_arjanv,
	     "arjanv's MOVQ (with prefetch)");
	wrap(p1, p2, size, loops, memcpy_arjanv_movntq,
	     "arjanv's MOVNTQ (with prefetch, for Athlon)");
	wrap(p1, p2, size, loops, memcpy_arjanv_interleave,
	     "arjanv's interleaved MOVQ/MOVNTQ with prefetchNTA");

	printf("  +0/+4 moderately unaligned blocks\n");
	wrap(p1, p2+4, size, loops, memcpy, "libc memcpy");
	wrap(p1, p2+4, size, loops, memcpy_mmx, "MMX memcpy using MOVQ");
	wrap(p1, p2+4, size, loops, memcpy_movusb,
	     "with mingo's MOVUSB (prefetch, non-temporal)");
	wrap(p1, p2+4, size, loops, memcpy_simple_movusb,
	     "with simple MOVUSB (no prefetch)");

	printf("  +10/+13 cruelly unaligned blocks\n");
	wrap(p1+10, p2+13, size, loops, memcpy, "libc memcpy");
	wrap(p1+10, p2+13, size, loops, memcpy_mmx, "MMX memcpy using MOVQ");
	wrap(p1+10, p2+13, size, loops, memcpy_movusb,
	     "with mingo's MOVUSB (prefetch, non-temporal)");
	wrap(p1+10, p2+13, size, loops, memcpy_simple_movusb,
	     "with simple MOVUSB (no prefetch)");

	puts("");
	free(p1);
	free(p2);
}

int
main(void)
{
	memcpy_test(1<<6);
	memcpy_test(1<<7);
	memcpy_test(1<<8);
	memcpy_test(1<<9);
	memcpy_test(1<<10);
	memcpy_test(1<<11);
	memcpy_test(1<<12);
	memcpy_test(1<<13);
	memcpy_test(1<<14);
	memcpy_test(1<<15);
	memcpy_test(1<<16);
	memcpy_test(1<<17);
	memcpy_test(1<<18);
	memcpy_test(1<<19);
	memcpy_test(1<<20);
	return 0;
}
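[Editorial note: the harness above only measures speed. The following
sanity check, not part of the original mail, verifies that a routine
produces the same bytes as libc memcpy; check_copy is a hypothetical helper
meant to be pasted into memcpy_bench.c so the routines above are in scope:

static int
check_copy(void *(*fn)(void *, const void *, size_t),
	   const char *name, size_t size)
{
	char *src = malloc(size + 64);
	char *dst = malloc(size + 64);
	size_t i;
	int ok;

	/* Fill the source with a recognizable byte pattern. */
	for (i = 0; i < size + 64; i++)
		src[i] = (char)(i * 131 + 7);
	memset(dst, 0, size + 64);

	/* The "+10/+13 cruelly unaligned" case from the harness. */
	fn(dst + 13, src + 10, size);
	ok = memcmp(dst + 13, src + 10, size) == 0;

	printf("%-24s %6luB: %s\n", name, (unsigned long)size,
	       ok ? "ok" : "MISMATCH");
	free(src);
	free(dst);
	return ok;
}

Calling, e.g., check_copy(memcpy_mmx, "memcpy_mmx", 4096) exercises the
same unaligned case that the tables above time.]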
[attachment: locore.s.diff]

Index: locore.s
===================================================================
RCS file: /usr/local/cvs/moguawin/sys/arch/i386/i386/locore.s,v
retrieving revision 1.5
diff -u -r1.5 locore.s
--- locore.s	2002/10/10 03:59:38	1.5
+++ locore.s	2002/10/15 18:55:38
@@ -951,7 +951,7 @@
 #define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
 #elif defined(I686_CPU)
 #define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
-#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
+#define	DEFAULT_COPYIN		_C_LABEL(i686_copyin)	/* XXX */
 #endif
 
 	.data
@@ -1159,6 +1159,103 @@
 	xorl	%eax,%eax
 	ret
 #endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
+
+#if defined(I686_CPU)
+/* LINTSTUB: Func: int i686_copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(i686_copyin)
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebx
+	GET_CURPCB(%eax)
+	movl	$_C_LABEL(i686_copy_fault),PCB_ONFAULT(%eax)
+
+	movl	16(%esp),%eax
+	movl	20(%esp),%ecx
+	movl	24(%esp),%esi
+
+	/*
+	 * We check that the end of the destination buffer is not past
+	 * the end of the user's address space.  If it's not, then we
+	 * only need to check that each page is readable, and the CPU
+	 * will do that for us.
+	 */
+	movl	%eax,%edx
+	addl	%esi,%edx
+	jc	_C_LABEL(i686_copy_efault)
+	cmpl	$VM_MAXUSER_ADDRESS,%edx
+	ja	_C_LABEL(i686_copy_efault)
+
+	xorl	%ebx,%ebx
+	movl	%esi,%edx
+	shrl	$6,%edx
+	cmpl	%edx,%ebx
+	jae	2f
+
+1:
+	movq	(%eax),%mm0
+	movq	8(%eax),%mm1
+	movq	16(%eax),%mm2
+	movq	24(%eax),%mm3
+	movq	%mm0,(%ecx)
+	movq	%mm1,8(%ecx)
+	movq	%mm2,16(%ecx)
+	movq	%mm3,24(%ecx)
+	movq	32(%eax),%mm0
+	movq	40(%eax),%mm1
+	movq	48(%eax),%mm2
+	movq	56(%eax),%mm3
+	movq	%mm0,32(%ecx)
+	movq	%mm1,40(%ecx)
+	movq	%mm2,48(%ecx)
+	movq	%mm3,56(%ecx)
+
+	addl	$64,%eax
+	addl	$64,%ecx
+	incl	%ebx
+	cmpl	%edx,%ebx
+	jb	1b
+
+2:
+	movl	%esi,%edx
+	andl	$63,%edx
+	je	3f
+
+	movl	%eax,%esi
+	movl	%edx,%eax
+	movl	%ecx,%edi
+
+	/* bcopy(%esi, %edi, %eax); */
+	cld
+	movl	%eax,%ecx
+	shrl	$2,%ecx
+	rep
+	movsl
+	movb	%al,%cl
+	andb	$3,%cl
+	rep
+	movsb
+
+3:
+	GET_CURPCB(%edx)
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	movl	%eax,PCB_ONFAULT(%edx)
+	ret
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_efault)
+	movl	$EFAULT,%eax
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_fault)
+	GET_CURPCB(%edx)
+	movl	%eax,PCB_ONFAULT(%edx)
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	ret
+#endif /* I686_CPU */
 
 /* LINTSTUB: Ignore */
 NENTRY(copy_efault)
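[Editorial note: for readers who don't read i386 assembly, the new
i686_copyin is roughly the following C. This is an editorial sketch, not
code from the patch: VM_MAXUSER_ADDRESS comes from <machine/vmparam.h> in
the kernel (the value below is only a stand-in so the sketch compiles on
its own), and the real routine also arms PCB_ONFAULT so that a faulting
user access returns EFAULT via i686_copy_fault instead of crashing the
kernel:

#include <errno.h>
#include <stdint.h>
#include <string.h>

#ifndef VM_MAXUSER_ADDRESS
#define VM_MAXUSER_ADDRESS 0xbfbfe000UL		/* stand-in value */
#endif

int
i686_copyin_sketch(const void *uaddr, void *kaddr, size_t len)
{
	const char *u = uaddr;
	char *k = kaddr;
	size_t i, blocks = len / 64;

	/* Reject buffers that wrap around or end past the user VA range. */
	if ((uintptr_t)uaddr + len < (uintptr_t)uaddr ||
	    (uintptr_t)uaddr + len > VM_MAXUSER_ADDRESS)
		return EFAULT;

	/*
	 * Main loop: 64-byte blocks, done with four pairs of MMX MOVQs
	 * in the real routine.
	 */
	for (i = 0; i < blocks; i++) {
		memcpy(k, u, 64);	/* stands in for the MOVQ block */
		u += 64;
		k += 64;
	}

	/* Tail of less than 64 bytes: REP MOVSL / REP MOVSB in the assembly. */
	memcpy(k, u, len & 63);
	return 0;
}
]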