|
|
/* memtest.c, Robert Nix, December, 1993
* nix@vliw.enet.dec.com * based on: * cbash.c * kirk johnson @ MIT * february 1993 * * RCS $Id: cbash.c,v 1.2 1993/08/12 15:30:17 tuna Exp $ * * Usage: memtest <machname> <iterations> <max-mem> * machname - a short identifier for the machine being tested. * iterations - target number of iterations to run for stable timing. * max-mem - maximum working set size to test. * * Iterations and max-mem can be specified with a "k" or "m" suffix * for kilo or mega iterations/mem. * * Example: Test of a Gateway 60 MHz Pentium system * Command Line: memtest gp560 8m 4m * Output: * -------------------------------------------------------------------------------- * 4k 8k 16k 32k 64k 128k 256k 512k 1m 2m 4m * L gp560 4 68 68 86 86 86 93 104 111 111 111 122 * L gp560 8 68 68 107 107 107 114 139 165 154 154 154 * L gp560 16 89 68 143 143 143 161 204 232 240 243 243 * L gp560 32 68 68 172 168 168 207 290 347 365 365 365 * L gp560 64 68 72 168 168 168 207 290 350 368 368 368 * L gp560 128 72 75 168 168 168 211 293 358 379 418 379 * L gp560 256 75 79 168 168 168 207 293 379 397 401 401 * L gp560 512 86 86 172 168 168 215 297 418 440 443 494 * L gp560 1k 100 104 175 172 168 218 304 501 522 529 529 * L gp560 2k 136 139 179 172 172 222 322 665 687 755 701 * L gp560 4k 132 243 232 225 222 286 401 991 1016 1094 1048 * L gp560 8k 132 136 243 232 225 290 350 923 973 1034 1109 * L gp560 16k 132 136 132 243 232 225 333 937 908 994 1041 * L gp560 32k 136 132 136 136 243 232 304 833 919 930 1012 * -------------------------------------------------------------------------------- * Explanation of output. * * There are three kinds of tests. * * L - Load latency test. * Measures the average repetition rate, in ns, of a latency-oriented load * loop. The two main variables are: * * (1) working set, or the amount of memory touched by the loop. This * varies across the columns in the output above, from a low of 4k * bytes to a high of max-mem (4m bytes in the example above; 16m bytes * by default).
* * (2) stride, or the number of bytes separating successive loads. * This is the number in the 3rd column of each of the "L" rows * in the output above, and varies from 4 bytes to 32k bytes. * * Interpreting the results. This is easiest on a 3d chart in Excel. * Two strides are always particularly interesting: * * - The cache line or block size stride (32 bytes above). * Big changes in latencies across the columns show the sizes * and basic performance of the load side of the cache hierarchy. * * If you don't know the cache line size: (a) look across the first row * for the first column that takes a big jump up in latency (the jump * from 68ns to 86ns between the 8k column and 16k column above), then (b) * scan down the rows of that column for the first relatively stable value * (172ns in the 32 byte stride row above). The stride of the row containing * that stable value is probably the cache line size. * * Look across the cache line size row. Access time jumps at 16K -- * so the L1 cache is 8K -- and then jumps again at 512K -- so the L2 * cache is 256K. The slope between 64K and 512K could be caused * by a thrash in the L2 cache; page coloring could remove this thrash. * * - The page size stride (4k above). * Big changes in latencies across the columns expose the TB size and the cost * of a TB refill. * * Scan the 4k line. It takes a big jump in latency at the 512K working * set (and actually starts to thrash at the 256K working set). This test says the TB * can map somewhere in the neighborhood of 64 4K pages. The TB fill time * looks to be somewhere around 650-700 ns (subtract large working set entries * in the 32-byte stride line from corresponding entries in the 4k stride line). * * The output always contains a little noise: * * - Boost the "iterations" command line parameter to remove timing jitter. * * - All entries contain some loop overhead.
It's fair to normalize results by subtracting * out the difference between the reported times and the known latency to the fastest level * of the memory hierarchy. * * - The entries in the lower left-hand corner of the table (large * strides in small memory) are dominated by loop overhead; ignore them. * * - Implement a good page coloring algorithm to remove jitter caused by cache * thrashing. Look at the cache-line sized stride to see the frequency of thrashing. * */
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Working-set ceiling (bytes) used when <max-mem> is not given: 16 MiB. */
#define DEF_MAXMEM 16777216
/* Smallest working set tested (bytes). */
#define MINMEM 4096
/* Integer type main() uses for sizes, strides and iteration counts. */
#define ITYPE signed int

/* Largest working set to test (bytes); assigned in main(). */
signed long max_mem;
/* Machine identifier printed at the start of every result row. */
char *mach_name;

/*
 * Unsigned integer type used to round the test region's address up to an
 * alignment boundary.  Each preprocessor directive has to begin its own
 * line; a "#else" or "#endif" that trails other tokens is not recognized
 * as a directive and leaves the #if unterminated.
 * NOTE(review): the non-Win64 branch assumes unsigned long is at least as
 * wide as a pointer -- confirm for every target.
 */
#if defined(_WIN64)
typedef unsigned __int64 ULONG_PTR;
#else
typedef unsigned long ULONG_PTR;
#endif

/* Largest and smallest load strides tested (bytes). */
#define MAXSTRIDE 32768
#define MINSTRIDE 4

/* Version text reported by the "-v" option. */
char *version_string = "1.0 (20 Dec 1993)";

/* Parse a decimal argument with an optional k/K or m/M suffix. */
extern ITYPE arg_to_int(char *arg);
/* Time one (working set, stride) cell; returns nanoseconds per load. */
extern double bash(char *region, long nbytes, long stride, long iters);
/* The strided load loop itself; returns the XOR of everything it loaded. */
extern int bash_loop(char *region, long count, long stride, long reps);
/* Write to the region at MIN_PAGESIZE intervals. */
extern void allocate_memory(char *region, long nbytes);
/* Print the usage message and exit with status 1. */
extern void usage(char *progname);
/*
 * Program entry point.
 *
 * Usage: memtest <machname> <iters> [<maxmem>]   (or: memtest -v)
 *
 * Parses the command line, allocates a test region of max_mem bytes aligned
 * to 128 KiB, prints a header row of working-set sizes, and then prints one
 * "L" row per stride (MINSTRIDE..MAXSTRIDE, doubling) holding the value
 * returned by bash() for every working-set size (MINMEM..max_mem, doubling).
 *
 * Exits with status 0 after the table is printed and 1 on "-v", on a usage
 * error, or when the allocation fails.
 */
int __cdecl main(int argc, char *argv[])
{
    ITYPE nbytes;
    ITYPE stride;
    ITYPE iters;
    char *raw;
    char *region;

    if ((argc > 1) && (strcmp(argv[1], "-v") == 0)) {
        fprintf(stderr, "This is memtest version %s.\n", version_string);
        exit(1);
    }
    if (argc < 3)
        usage(argv[0]);

    mach_name = argv[1];
    iters = arg_to_int(argv[2]);
    if (argc < 4) {
        max_mem = DEF_MAXMEM;
    } else {
        max_mem = arg_to_int(argv[3]);
    }

    /* Over-allocate by 128 KiB so the start can be rounded up to a 128 KiB
     * boundary.  Test the malloc result itself, before any address
     * arithmetic is applied to it. */
    raw = (char *) malloc(max_mem + (128*1024));
    if (raw == NULL) {
        perror("malloc failed");
        exit(1);
    }
    /* Build the mask in ULONG_PTR so it is pointer-wide by construction. */
    region = (char *) ((((ULONG_PTR) raw) + (128*1024-1))
                       & ~(((ULONG_PTR) (128*1024)) - 1));

    /* Header row: blank label columns, then one column per working set. */
    printf(" %8s", "");
    printf("%8s", "");
    for (nbytes = MINMEM; nbytes <= max_mem; nbytes += nbytes) {
        if (nbytes >= (1024 * 1024))
            printf("%4dm", nbytes / (1024 * 1024));
        else if (nbytes >= 1024)
            printf("%4dk", nbytes / 1024);
        else
            printf("%5d", nbytes);
        /* Stop before a doubling that would exceed max_mem; this also keeps
         * nbytes from overflowing when max_mem is 1 GiB or more. */
        if (nbytes > max_mem - nbytes)
            break;
    }
    printf("\n");

    /* One row per stride, one timed cell per working-set size. */
    for (stride = MINSTRIDE; stride <= MAXSTRIDE; stride += stride) {
        printf("L %-8s", mach_name);
        if (stride >= (1024 * 1024))
            printf("%7dm", stride / (1024 * 1024));
        else if (stride >= 1024)
            printf("%7dk", stride / 1024);
        else
            printf("%8d", stride);

        for (nbytes = MINMEM; nbytes <= max_mem; nbytes += nbytes) {
            double ns_ref = bash(region, nbytes, stride, iters);
            printf("%5.0f", ns_ref);
            /* Flush per cell so progress is visible during long runs. */
            fflush(stdout);
            if (nbytes > max_mem - nbytes)
                break;
        }
        printf("\n");
    }

    free(raw);
    exit(0);
    return 0;
}
ITYPE arg_to_int(char *arg) { ITYPE rslt = 0; ITYPE mult = 1;
switch (arg[strlen(arg) - 1]) { case 'k': case 'K': mult = 1024; break;
case 'm': case 'M': mult = 1024 * 1024; break;
default: mult = 1; break; } if (!((arg[0] >= '0') && arg[0] <= '9')) { fprintf(stderr, "Argument %s not a number\n", arg); usage("memtest"); exit(1); } if (sscanf(arg, "%ld", &rslt) != 1) { fprintf(stderr, "Argument %s not a number\n", arg); usage("memtest"); exit(1); } rslt *= mult; return rslt; }
/*
 * Time strided loads over a working set and return the average time per
 * load in nanoseconds.
 *
 *   region  start of the memory to read; at least nbytes bytes must be
 *           writable, because the region is cleared before timing
 *   nbytes  size of region to bash (bytes)
 *   stride  stride through region (bytes)
 *   iters   target # of loop iterations; rounded up to a whole number of
 *           passes over the region, with a minimum of one pass
 *
 * Timing uses clock(), i.e. processor time at CLOCKS_PER_SEC resolution.
 * Writes "trip count problem" to stderr and exits with status 1 when no
 * valid trip count exists (stride <= 0 or nbytes smaller than one int).
 */
double bash(
    char *region,
    long nbytes,   /* size of region to bash (bytes) */
    long stride,   /* stride through region (bytes) */
    long iters     /* target # of loop iterations */
)
{
    long count;
    long reps;
    clock_t start, stop;
    double elapsed;
    /* The loop results are stored here so the compiler has to keep the
     * loads; a discarded result would let it delete the timed loop. */
    volatile int sink;

    /* Guard the division below and keep the arithmetic signed: subtracting
     * sizeof(int) directly would convert a small nbytes to a huge unsigned
     * value. */
    if (stride <= 0 || nbytes < (long) sizeof(int)) {
        fprintf(stderr, "trip count problem\n");
        exit(1);
    }

    /* Number of int-sized loads that fit when spaced stride bytes apart.
     * Floor division guarantees the last load ends within nbytes. */
    count = ((nbytes - (long) sizeof(int)) / stride) + 1;

    reps = (iters + count - 1) / count;
    if (reps <= 0)
        reps = 1;

    /* make sure the memory is allocated */
    memset(region, 0, nbytes);
    memset(region, 1, nbytes);
    allocate_memory(region, nbytes);
    memset(region, 0, nbytes);

    /* warm up the cache */
    sink = bash_loop(region, count, stride, 1L);

    /* run the bash loop */
    start = clock();
    sink = bash_loop(region, count, stride, reps);
    stop = clock();
    (void) sink;

    elapsed = (double) (stop - start) / CLOCKS_PER_SEC;

    /* Divide by the number of loads actually performed; forming the product
     * in double avoids overflowing long. */
    return 1e9 * (elapsed / ((double) reps * (double) count));
}
/* Your virtual memory pagesize must be at least this big */
#define MIN_PAGESIZE 256

/*
 * Write zeros into the region at MIN_PAGESIZE-byte intervals.
 *
 *   region  memory region to be bashed
 *   nbytes  size of region (bytes)
 *
 * Each write covers sizeof(int) bytes, except that the final one is
 * shortened when fewer bytes remain, so nothing at or beyond
 * region + nbytes is modified.  Writing with memset() also removes any
 * alignment requirement on region.
 */
void allocate_memory(
    char *region,  /* memory region to be bashed */
    long nbytes    /* size of region (bytes) */
)
{
    long offset;

    for (offset = 0; offset < nbytes; offset += MIN_PAGESIZE) {
        long room = nbytes - offset;
        size_t width = sizeof(int);

        /* Clamp the last write to the bytes that are really there. */
        if (room < (long) sizeof(int))
            width = (size_t) room;

        memset(region + offset, 0, width);
    }
}
/*
 * Read `count` ints spaced `stride` bytes apart, starting at `region`, and
 * repeat the whole pass `reps` times.
 *
 *   region  memory region to be bashed
 *   count   number of locations to bash
 *   stride  stride between locations (bytes)
 *   reps    number of passes through region
 *
 * Returns the XOR of every value read; 0 when count or reps is not
 * positive.
 *
 * Each load goes through a volatile-qualified pointer so the compiler must
 * perform every access and cannot hoist, combine or vectorize them.  The
 * address is formed from an integer offset, so no pointer beyond the last
 * location read is ever computed.
 */
int bash_loop(
    char *region,  /* memory region to be bashed */
    long count,    /* number of locations to bash */
    long stride,   /* stride between locations (bytes) */
    long reps      /* number of passes through region */
)
{
    int acc = 0;

    for (; reps > 0; reps--) {
        long offset = 0;
        long left;

        for (left = count; left > 0; left--) {
            acc ^= *(const volatile int *) (region + offset);
            offset += stride;
        }
    }

    return acc;
}
/*
 * Write the command-line synopsis to stderr and terminate the process with
 * exit status 1.
 *
 *   progname  program name inserted into the synopsis line
 */
void usage(char *progname)
{
    /* Per-argument help lines; they carry no conversions, so they are
     * emitted verbatim. */
    static const char *const option_help[] = {
        " <machname> machine name\n",
        " <iters> target # of accesses\n",
        " <maxmem> maximum amount of mem to touch (def 16 Mb)\n"
    };
    size_t line;

    fprintf(stderr, "usage: %s <machname> <iters> [<maxmem>]\n", progname);
    for (line = 0; line < sizeof option_help / sizeof option_help[0]; line++)
        fputs(option_help[line], stderr);

    exit(1);
}
|