NUMA-aware process placement for the MPICH 1.2.6 ch_shmem device.
Everything is guarded by NUMA_LINUX, which this patch adds to DEFS in Makefile.in.

  Makefile.in  - appends -DNUMA_LINUX to DEFS.
  p2pprocs.c   - each forked child restricts its CPU affinity mask to the single
                 CPU (nextId % ncpus), with ncpus taken from
                 sysconf(_SC_NPROCESSORS_CONF), using the raw sched_setaffinity
                 syscall, then calls sched_yield().
  shdef.h      - doubles MPID_MAX_SHMEM in two branches; under NUMA_LINUX inserts
                 a pad[GLOBMEM_PAD] member ahead of pool and dimensions pool with
                 PART_SIZE_PER_PROC.
  shmeminit.c  - MPID_CH_InitMsgPass sets the starting process's affinity mask
                 to 1 (CPU 0) before allocating the device.
  shmempriv.c  - under NUMA_LINUX the parent no longer links the per-process
                 packet lists; each process links its own slice of the pool
                 while holding globlock, so that (per the patch's own comment)
                 the memory is first touched on the node it should live on.

Review changes applied to this copy of the patch:
  1. shdef.h, non-NUMA branch: removed a stray "*/" after
     "pool[MPID_SHMEM_MAX_PKTS];" that made the declaration a syntax error
     whenever NUMA_LINUX is not defined.
  2. p2pprocs.c: the mask is built with 1UL instead of int 1, and the bit index
     is reduced modulo the width of the mask, so the shift is defined for CPU
     indices >= 31. A sysconf() result below 1 now falls back to 1, which also
     rules out a modulo by zero.
  3. Text that had been swallowed as if it were markup was restored where the
     surrounding code determines it: the two "for (j=0; j<pkts_per_proc; j++) {"
     loop headers, <stdlib.h> under HAVE_STDLIB_H, and the header names on the
     ADDED #include lines (picked to declare sched_yield, syscall/sysconf,
     perror and __NR_sched_setaffinity).

NOTE(review): the header names on the unchanged context "#include" lines in
p2pprocs.c and shmeminit.c are still missing, and the original line layout
(including blank context lines) is collapsed; both must be restored from a
pristine mpich-1.2.6 tree before patch(1) can apply this file.
NOTE(review): PART_SIZE_PER_PROC as written reduces to PAGE_SIZE
(A + (PAGE_SIZE - A)); a "% PAGE_SIZE" round-up looks intended, and the value
is a byte count used as an element count for pool[]. Left unchanged - confirm
the intended pool size with the patch author.
NOTE(review): the restored loop bound "j<pkts_per_proc" is inferred from the
following "pool[cnt+pkts_per_proc-1].head.next = 0" line - verify against the
pristine shmempriv.c.

diff -uprN mpich-1.2.6/mpid/ch_shmem/Makefile.in mpich-1.2.6-patched/mpid/ch_shmem/Makefile.in --- mpich-1.2.6/mpid/ch_shmem/Makefile.in 2002-05-13 20:06:52.000000000 +0200 +++ mpich-1.2.6-patched/mpid/ch_shmem/Makefile.in 2004-11-02 17:10:59.000000000 +0100 @@ -21,7 +21,7 @@ top_srcdir = @top_srcdir@ srcdir = @srcdir@ libbuild_dir = @libbuild_dir@ DEFS = @DEFS@ -I. -I${srcdir} @mpich_includes@ -DHAVE_MPICH_MPID_H \ - -DMPID_DEVICE_CODE @GETNAME_DEFS@ @DEVCFLAGS@ @DEV_DEFS@ + -DMPID_DEVICE_CODE @GETNAME_DEFS@ @DEVCFLAGS@ @DEV_DEFS@ -DNUMA_LINUX @VPATH@ diff -uprN mpich-1.2.6/mpid/ch_shmem/p2pprocs.c mpich-1.2.6-patched/mpid/ch_shmem/p2pprocs.c --- mpich-1.2.6/mpid/ch_shmem/p2pprocs.c 2002-09-09 17:27:52.000000000 +0200 +++ mpich-1.2.6-patched/mpid/ch_shmem/p2pprocs.c 2004-11-02 17:10:59.000000000 +0100 @@ -227,6 +227,15 @@ static int MPID_numprocs = 0; /* Numbe #include #include #include + +#ifdef NUMA_LINUX +#define __USE_GNU +#include <sched.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/syscall.h> +#endif + /* Set SIGCHLD handler */ #ifdef DYNAMIC_CHILDREM static int MPID_child_status = 0; @@ -408,7 +417,10 @@ int argc; char **argv; { int i, rc; - +#ifdef NUMA_LINUX + unsigned long affinity_mask; + long ncpus; +#endif /* set signal handler */ #if defined(MPID_DEBUG_SPECIAL) SIGNAL_HAND_SET( SIGINT, MPID_dump_internals ); @@ -511,6 +523,18 @@ char **argv; else if (rc == 0) { MPID_myid = nextId; +#ifdef NUMA_LINUX + + /* CPU_SET(nextId, &affinity_mask); */ + if ( (ncpus = sysconf(_SC_NPROCESSORS_CONF)) < 1) { + perror("Failed to get number of processors available"); + ncpus = 1; + } + affinity_mask = 1UL << ((nextId % ncpus) % (8 * sizeof affinity_mask)); + if (syscall(__NR_sched_setaffinity, 0, sizeof affinity_mask, &affinity_mask) < 0) + perror("Failed to set child processor affinity"); + sched_yield(); +#endif /* NUMA_LINUX */ SIGNAL_UNBLOCK(); /* Should we close stdin (fd==0)? 
*/ return; diff -uprN mpich-1.2.6/mpid/ch_shmem/shdef.h mpich-1.2.6-patched/mpid/ch_shmem/shdef.h --- mpich-1.2.6/mpid/ch_shmem/shdef.h 2003-01-09 21:28:23.000000000 +0100 +++ mpich-1.2.6-patched/mpid/ch_shmem/shdef.h 2004-11-02 17:10:59.000000000 +0100 @@ -36,11 +36,11 @@ #define MPID_MAX_SHMEM 4194304*(PROCESSOR_COUNT/8) #else #define MPID_MAX_PROCS 256 -#define MPID_MAX_SHMEM 4194304 +#define MPID_MAX_SHMEM 4194304*2 #endif /* PROCESSOR_COUNT > 256 */ #else #define MPID_MAX_PROCS 256 -#define MPID_MAX_SHMEM 4194304 +#define MPID_MAX_SHMEM 4194304*2 #endif /* PROCESSOR_COUNT */ #endif /* MPI_cspp */ @@ -141,6 +141,13 @@ typedef struct { This is the global area of memory; when this structure is allocated, we have the initial shared memory */ +#ifdef NUMA_LINUX +#define PAGE_SIZE 4096 +#define GLOBMEM_PAD (PAGE_SIZE - (((sizeof (p2p_lock_t)) * (MPID_MAX_PROCS*2 + 1) + (sizeof (MPID_SHMEM_Queue)) * MPID_MAX_PROCS + (sizeof (MPID_SHMEM_Stack)) * MPID_MAX_PROCS)) % PAGE_SIZE) + +#define PART_SIZE_PER_PROC ((MPID_SHMEM_MAX_PKTS / MPID_MAX_PROCS) * (sizeof (MPID_PKT_T)) + (PAGE_SIZE - ((MPID_SHMEM_MAX_PKTS / MPID_MAX_PROCS) * (sizeof (MPID_PKT_T))))) +#define POOL_SIZE (PART_SIZE_PER_PROC * MPID_MAX_PROCS) +#endif typedef struct { /* locks may need to be aligned, so keep at front (p2p_shmalloc provides 16-byte alignment for each allocated block). 
*/ @@ -149,9 +156,13 @@ typedef struct { p2p_lock_t globlock; MPID_SHMEM_Queue incoming[MPID_MAX_PROCS]; /* Incoming messages */ MPID_SHMEM_Stack avail[MPID_MAX_PROCS]; /* Avail pkts */ - - MPID_PKT_T pool[MPID_SHMEM_MAX_PKTS]; /* Preallocated pkts */ - +#ifdef NUMA_LINUX + char pad[GLOBMEM_PAD]; /* Pad to page-align pool + */ + MPID_PKT_T pool[PART_SIZE_PER_PROC]; /* Preallocated pkts */ +#else + MPID_PKT_T pool[MPID_SHMEM_MAX_PKTS]; /* Preallocated pkts */ +#endif /* We put globid last because it may otherwise upset the cache alignment of the arrays */ #if defined(MPI_cspp) diff -uprN mpich-1.2.6/mpid/ch_shmem/shmeminit.c mpich-1.2.6-patched/mpid/ch_shmem/shmeminit.c --- mpich-1.2.6/mpid/ch_shmem/shmeminit.c 2002-05-13 20:06:53.000000000 +0200 +++ mpich-1.2.6-patched/mpid/ch_shmem/shmeminit.c 2004-11-02 17:10:59.000000000 +0100 @@ -16,7 +16,12 @@ #include "flow.h" #include "chpackflow.h" #include - +#ifdef NUMA_LINUX +#define __USE_GNU +#include <sched.h> +#include <unistd.h> +#include <sys/syscall.h> +#endif /* #define DEBUG(a) {a} */ #define DEBUG(a) @@ -45,6 +50,17 @@ MPID_Device *MPID_CH_InitMsgPass( int *a { MPID_Device *dev; +#ifdef NUMA_LINUX + /* Set hard affinity for this process to node 0 */ + unsigned long affinity_mask = 1; + + /* CPU_SET(0, &affinity_mask);*/ + if (syscall(__NR_sched_setaffinity, 0, sizeof affinity_mask, &affinity_mask) < 0) { + perror("Failed to set start process affinity"); + } + sched_yield(); +#endif + dev = (MPID_Device *)MALLOC( sizeof(MPID_Device) ); if (!dev) return 0; /* The short protocol MUST be for messages no longer than diff -uprN mpich-1.2.6/mpid/ch_shmem/shmempriv.c mpich-1.2.6-patched/mpid/ch_shmem/shmempriv.c --- mpich-1.2.6/mpid/ch_shmem/shmempriv.c 2002-04-05 19:09:45.000000000 +0200 +++ mpich-1.2.6-patched/mpid/ch_shmem/shmempriv.c 2004-11-02 17:10:59.000000000 +0100 @@ -9,7 +9,7 @@ #ifdef HAVE_STDLIB_H #include <stdlib.h> #endif - +#include <sched.h> /* MPID_shmem is not volatile but its contents are */ MPID_SHMEM_globmem *MPID_shmem = 0; /* LOCAL copy of some of 
MPID_shmem */ @@ -197,18 +197,19 @@ void MPID_SHMEM_init( int *argc, char ** MPID_shmem->incoming[i].tail = 0; /* Setup the avail list of packets */ - MPID_shmem->avail[i].head = (MPID_PKT_T * VOLATILE) - &MPID_shmem->pool[cnt]; +#ifndef NUMA_LINUX + MPID_shmem->avail[i].head = (MPID_PKT_T * VOLATILE) + &MPID_shmem->pool[cnt]; + /* This should be done after fork */ for (j=0; j<pkts_per_proc; j++) { MPID_shmem->pool[cnt+j].head.next = ((MPID_PKT_T *)MPID_shmem->pool) + cnt + j + 1; -/* MPID_shmem->pool[cnt+j].head.src = i; */ MPID_shmem->pool[cnt+j].head.owner = i; } /* Clear the last "next" pointer */ MPID_shmem->pool[cnt+pkts_per_proc-1].head.next = 0; +#endif /* NUMA_LINUX */ cnt += pkts_per_proc; - p2p_lock_init( MPID_shmem->availlock + i ); p2p_lock_init( MPID_shmem->incominglock + i ); } @@ -264,6 +265,24 @@ void MPID_SHMEM_init( int *argc, char ** #endif MPID_MyWorldRank = MPID_myid; +#ifdef NUMA_LINUX + sched_yield(); + /* Setup the packets for this process, do it here to + first touch the memory on the node it should live on + (in case the OS does not support page migration). */ + cnt = MPID_myid * pkts_per_proc; + p2p_lock( &MPID_shmem->globlock ); + MPID_shmem->avail[MPID_myid].head = (MPID_PKT_T * VOLATILE) + &MPID_shmem->pool[cnt]; + for (j=0; j<pkts_per_proc; j++) { + MPID_shmem->pool[cnt+j].head.next = + ((MPID_PKT_T *)MPID_shmem->pool) + cnt + j + 1; + MPID_shmem->pool[cnt+j].head.owner = MPID_myid; + } + /* Clear the last "next" pointer */ + MPID_shmem->pool[cnt+pkts_per_proc-1].head.next = 0; + p2p_unlock( &MPID_shmem->globlock ); +#endif /* NUMA_LINUX */ MPID_SHMEM_FreeSetup(); MPID_incoming = &MPID_shmem->incoming[MPID_myid].head;