windows-server-2003/sdktools/vmtests/color.c


								/* memtest.c, Robert Nix, December, 1993

								 *            [email protected]

								 * based on:

								 *      cbash.c

								 *      kirk johnson @ MIT

								 *      february 1993

								 *

								 *      RCS $Id: cbash.c,v 1.2 1993/08/12 15:30:17 tuna Exp $

								 *

								 * Usage: memtest <machname> <iterations> <max-mem>

								 *    machname   - a short identifier for the machine being tested.

								 *    iterations - target number of iterations to run for stable timing.

								 *    max-mem    - maximum working set size to test.

								 *

								 *    Iterations and max-mem can be specified with a "k" or "m" suffix

								 *    for kilo or mega iterations/mem.

								 *

								 * Example: Test of a Gateway 60 Mhz Pentium system

								 * Command Line: memtest gp560 8m 4m

								 * Output:

								 *

								--------------------------------------------------------------------------------

								 *                      4k   8k  16k  32k  64k 128k 256k 512k   1m   2m   4m

								 * L gp560          4   68   68   86   86   86   93  104  111  111  111  122

								 * L gp560          8   68   68  107  107  107  114  139  165  154  154  154

								 * L gp560         16   89   68  143  143  143  161  204  232  240  243  243

								 * L gp560         32   68   68  172  168  168  207  290  347  365  365  365

								 * L gp560         64   68   72  168  168  168  207  290  350  368  368  368

								 * L gp560        128   72   75  168  168  168  211  293  358  379  418  379

								 * L gp560        256   75   79  168  168  168  207  293  379  397  401  401

								 * L gp560        512   86   86  172  168  168  215  297  418  440  443  494

								 * L gp560         1k  100  104  175  172  168  218  304  501  522  529  529

								 * L gp560         2k  136  139  179  172  172  222  322  665  687  755  701

								 * L gp560         4k  132  243  232  225  222  286  401  991 1016 1094 1048

								 * L gp560         8k  132  136  243  232  225  290  350  923  973 1034 1109

								 * L gp560        16k  132  136  132  243  232  225  333  937  908  994 1041

								 * L gp560        32k  136  132  136  136  243  232  304  833  919  930 1012

								 *

								--------------------------------------------------------------------------------

								 * Explanation of output.

								 *

								 * There are three kinds of tests.

								 *

								 * L - Load latency test.

								 *     Measures the average repetition rate, in ns, of a latency-oriented load

								 *     loop.  The two main variables are:

								 *

								 *        (1) working set, or the amount of memory touched by the loop.  This

								 *            varies across the columns in the output above, from a low of 4k

								 *            bytes to a high of max-mem, or 16m bytes.

								 *

								 *        (2) stride, or the number of bytes separating successive loads.

								 *            This is the number in the 3rd column of each of the "L" rows

								 *            in the output above, and varies from 4 bytes to 32k bytes.

								 *

								 * Interpreting the results. This is easiest on a 3d chart in Excel.

								 * Two strides are always particularly interesting:

								 *

								 *      - The cache line or block size stride (32 bytes above).

								 *        Big changes in latencies across the columns show the sizes

								 *        and basic performance of the load side of the cache hierarchy.

								 *

								 *        If you don't know the cache line size: (a) look across the first row

								 *        for the first column that takes a big jump up in latency (the jump

								 *        from 68ns to 86ns between the 8k column and 16k column above), then (b)

								 *        scan down the rows of that column for the first relatively stable value

								 *        (172ns in the 32 byte stride row above).  The row containing

								 *        that stable value is probably the cache line size.

								 *

								 *        Look across the cache line size row.  Access time jumps at 16K --

								 *        so the L1 cache is 8K -- and then jumps again at 512K -- so the L2

								 *        cache is 256K.  The slope between 64K and 512K could be caused

								 *        by a thrash in the L2 cache; page coloring could remove this thrash.

								 *

								 *      - The page size stride (4k above).

								 *        Big changes in latencies across the columns expose the TB size and the cost

								 *        of a tb refill.

								 *

								 *        Scan the 4k line. It takes a big jump in latency at the 512K working

								 *        set (and actually starts to thrash at the 256K working set). This test says the TB

								 *        can map somewhere in the neighborhood of 64 4K pages.  The TB fill time

								 *        looks to be somewhere around 650-700 ns (subtract large working set entries

								 *        in the 32-byte stride line from corresponding entries in the 4k stride line).

								 *

								 * The output always contains a little noise:

								 *

								 *      - Boost the "iterations" command line parameter to remove timing jitter.

								 *

								 *      - All entries contain some loop overhead.  It's fair to normalize results by subtracting

								 *        out the difference between the reported times and the known latency to the fastest level

								 *        of the memory hierarchy.

								 *

								 *      - The entries in the lower-left hand corner of the table (large

								 *        strides in small memory) are dominated by loop overhead; ignore them.

								 *

								 *      - Implement a good page coloring algorithm to remove jitter caused by cache

								 *        thrashing.  Look at the cache-line sized stride to see the frequency of thrashing.

								 *

								 */


								#include <assert.h>

								#include <errno.h>

								#include <limits.h>

								#include <malloc.h>

								#include <stdio.h>

								#include <stdlib.h>

								#include <string.h>

								#include <time.h>

								#define DEF_MAXMEM 16777216

								#define MINMEM 4096

								#define ITYPE signed int

								signed long     max_mem;

								char           *mach_name;


								#if defined(_WIN64)

								typedef unsigned __int64 ULONG_PTR;

								#else

								typedef unsigned long ULONG_PTR;

								#endif


								#define MAXSTRIDE 32768

								#define MINSTRIDE 4


								char           *version_string = "1.0 (20 Dec 1993)";

								extern ITYPE  arg_to_int(char *);

								extern double bash(char *, long, long, long);

								extern int bash_loop(char *, long, long, long);

								extern void allocate_memory(char *, long);

								extern void usage(char *);


								/*
								 * main - parse the command line, allocate the test region and print the
								 * load-latency table.
								 *
								 * Usage: memtest <machname> <iters> [<maxmem>]   (or "memtest -v")
								 *
								 * The region is over-allocated by 128K and rounded up to a 128K boundary.
								 * A header row of working-set sizes is printed first, then one "L" row per
								 * stride (MINSTRIDE..MAXSTRIDE, doubling); every cell is the value that
								 * bash() returns for that working-set size and stride.
								 *
								 * Exits with status 1 on "-v", on a usage error or when malloc fails;
								 * returns 0 otherwise.
								 */
								int __cdecl main(
								     int argc,
								     char *argv[]
								)
								{
								ITYPE           nbytes;
								ITYPE           stride;
								ITYPE           iters;
								char           *raw;            /* pointer exactly as returned by malloc */
								char           *region;         /* raw rounded up to a 128K boundary     */

								    if ((argc > 1) && (strcmp(argv[1], "-v") == 0)) {
								        fprintf(stderr, "This is memtest version %s.\n", version_string);
								        exit(1);
								    }
								    if (argc < 3) {
								        usage(argv[0]);         /* does not return */
								    }
								    mach_name = argv[1];
								    iters = arg_to_int(argv[2]);
								    if (argc < 4) {
								        max_mem = DEF_MAXMEM;
								    } else {
								        max_mem = arg_to_int(argv[3]);
								    }

								    /*
								     * Test the malloc result before doing any arithmetic on it; the
								     * aligned pointer must never be what decides success or failure.
								     */
								    raw = (char *) malloc((size_t) max_mem + (128*1024));
								    if (raw == NULL) {
								        perror("malloc failed");
								        exit(1);
								    }
								    region = (char *) ((((ULONG_PTR) raw) + (128*1024-1)) & ~((128*1024)-1));

								    /* header row: one column label per working-set size */
								    printf("  %8s", "");
								    printf("%8s", "");
								    for (nbytes = MINMEM; nbytes <= max_mem; nbytes += nbytes) {
								        if (nbytes >= (1024 * 1024))
								            printf("%4dm", nbytes / (1024 * 1024));
								        else if (nbytes >= 1024)
								            printf("%4dk", nbytes / 1024);
								        else
								            printf("%5d", nbytes);
								        /*
								         * Stop before doubling if 2*nbytes would exceed max_mem.  The
								         * loop condition would end the loop anyway, but leaving here
								         * keeps the doubling from overflowing ITYPE for huge max_mem.
								         */
								        if (nbytes > max_mem - nbytes)
								            break;
								    }
								    printf("\n");

								    /* one "L" row per stride, one cell per working-set size */
								    for (stride = MINSTRIDE; stride <= MAXSTRIDE; stride += stride) {
								        printf("L %-8s", mach_name);
								        if (stride >= (1024 * 1024))
								            printf("%7dm", stride / (1024 * 1024));
								        else if (stride >= 1024)
								            printf("%7dk", stride / 1024);
								        else
								            printf("%8d", stride);
								        for (nbytes = MINMEM; nbytes <= max_mem; nbytes += nbytes) {
								            double ns_ref = bash(region, nbytes, stride, iters);

								            printf("%5.0f", ns_ref);
								            fflush(stdout);     /* show progress cell by cell */
								            if (nbytes > max_mem - nbytes)
								                break;          /* same overflow guard as the header loop */
								        }
								        printf("\n");
								    }
								    return 0;
								}


								ITYPE

								arg_to_int(char *arg)

								{

								ITYPE           rslt = 0;

								ITYPE           mult = 1;


								    switch (arg[strlen(arg) - 1]) {

								    case 'k':

								    case 'K':

								        mult = 1024;

								        break;


								    case 'm':

								    case 'M':

								        mult = 1024 * 1024;

								        break;


								    default:

								        mult = 1;

								        break;

								    }

								    if (!((arg[0] >= '0') && arg[0] <= '9')) {

								        fprintf(stderr, "Argument %s not a number\n", arg);

								        usage("memtest");

								        exit(1);

								    }

								    if (sscanf(arg, "%ld", &rslt) != 1) {

								        fprintf(stderr, "Argument %s not a number\n", arg);

								        usage("memtest");

								        exit(1);

								    }

								    rslt *= mult;

								    return rslt;

								}


								/*
								 * bash - time the load loop over one working set and stride.
								 *
								 * Computes how many int-sized loads, spaced `stride` bytes apart, fit in
								 * the first `nbytes` bytes of `region`, rounds `iters` up to a whole number
								 * of passes over those locations, writes to the region, runs one untimed
								 * warm-up pass and then times the remaining passes with clock().
								 *
								 * Returns the elapsed clock() time divided by the number of loads actually
								 * performed, in nanoseconds.  Prints "trip count problem" and exits with
								 * status 1 when stride is not positive or nbytes cannot hold one int.
								 */
								double
								bash(
								     char *region,
								     long nbytes,       /* size of region to bash (bytes) */
								     long stride,       /* stride through region (bytes)  */
								     long iters         /* target # of loop iterations    */
								)
								{
								const long      word = (long) sizeof(int);
								signed long     count;
								signed long     reps;
								clock_t         start, stop;
								double          elapsed;
								volatile int    sink;   /* consumes bash_loop()'s result, see below */

								    /*
								     * Reject arguments that would divide by zero or make the trip count
								     * wrap.  Once these hold, (count - 1) * stride + word <= nbytes is
								     * guaranteed, so every load stays inside the region.
								     */
								    if ((stride <= 0) || (nbytes < word)) {
								        fprintf(stderr, "trip count problem\n");
								        exit(1);
								    }
								    count = ((nbytes - word) / stride) + 1;

								    /* round iters up to a whole number of passes, at least one */
								    reps = (iters + count - 1) / count;
								    if (reps <= 0)
								        reps = 1;
								    iters = reps * count;

								    /* make sure the memory is allocated */
								    memset(region, 0, nbytes);
								    memset(region, 1, nbytes);
								    allocate_memory(region, nbytes);
								    memset(region, 0, nbytes);

								    /*
								     * Storing the result in a volatile object keeps the optimiser from
								     * discarding the calls (and the loads being timed) as dead code.
								     */

								    /* warm up the cache */
								    sink = bash_loop(region, count, stride, 1L);

								    /* run the bash loop */
								    start = clock();
								    sink = bash_loop(region, count, stride, reps);
								    stop = clock();
								    (void) sink;

								    elapsed = (double) (stop - start) / CLOCKS_PER_SEC;

								    return 1e9 * (elapsed / iters);
								}


								/* Your virtual memory pagesize must be at least this big */
								#define MIN_PAGESIZE    256

								/*
								 * allocate_memory - write to a region at MIN_PAGESIZE intervals.
								 *
								 * Zeroes the first sizeof(int) bytes of every MIN_PAGESIZE-byte chunk of
								 * the region.  When fewer than sizeof(int) bytes remain in the final
								 * chunk, only the remaining bytes are written, so nothing at or beyond
								 * region + nbytes is ever touched.  Does nothing when nbytes <= 0.
								 */
								void
								allocate_memory(
								                char *region,   /* memory region to be bashed       */
								                long nbytes)    /* size of region (bytes)           */
								{
								long            i;

								    for (i = 0; i < nbytes; i += MIN_PAGESIZE) {
								        long    left = nbytes - i;
								        size_t  span = (left < (long) sizeof(int)) ? (size_t) left
								                                                   : sizeof(int);

								        /* memset rather than an int store: no alignment or aliasing assumptions */
								        memset(region + i, 0, span);
								    }
								}


								/*
								 * bash_loop - the load loop being measured.
								 *
								 * Makes `reps` passes over the region; each pass reads `count` ints,
								 * starting at `region` and advancing `stride` bytes after every read.
								 * Returns the XOR of every value read (0 when reps or count is <= 0).
								 *
								 * Each load goes through a volatile-qualified pointer so the compiler
								 * must perform it even when the caller ignores the return value;
								 * otherwise an optimising build is free to remove the whole loop.
								 */
								int
								bash_loop(
								          char *region, /* memory region to be bashed       */
								          long count,   /* number of locations to bash      */
								          long stride,  /* stride between locations (bytes) */
								          long reps     /* number of passes through region  */
								)
								{
								long            i;
								int             rslt;
								char           *tmp;

								    rslt = 0;
								    for (; reps > 0; reps--) {
								        tmp = region;
								        for (i = count; i > 0; i--) {
								            rslt ^= *((volatile int *) tmp);
								            tmp += stride;
								        }
								    }

								    return rslt;
								}


								/*
								 * usage - write the command-line synopsis to stderr and terminate.
								 *
								 * progname is substituted into the first line.  This function never
								 * returns: it always ends the process with exit status 1.
								 */
								void
								usage(char *progname)
								{
								    /* one entry per argument description, printed in order */
								    static const char *const arg_help[] = {
								        "  <machname>   machine name\n",
								        "  <iters>      target # of accesses\n",
								        "  <maxmem>     maximum amount of mem to touch (def 16 Mb)\n"
								    };
								    size_t          line;

								    fprintf(stderr, "usage: %s <machname> <iters> [<maxmem>]\n", progname);
								    for (line = 0; line < sizeof arg_help / sizeof arg_help[0]; line++) {
								        fputs(arg_help[line], stderr);
								    }
								    exit(1);
								}