Andrew Petersen
2014-07-18 14:28:51 UTC
Hello
Let's say my heterogeneous cluster has
n001 with 12 cores
n002 with 20 cores
How do I get Slurm to run a job on 12 cores of node 1 and 20 cores of node
2? If I use -N 2 --hint=compute_bound, it will only run n001x12 and
n002x12 when BatchHost=n001 (and if BatchHost is n002, it will try to run 20
cores on n001, causing oversubscription).
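To make the failing case concrete, this is roughly the batch script I am
submitting (my_app is a stand-in for the real MPI binary; the intent is
12 tasks on n001 plus 20 tasks on n002, i.e. 32 in total):

#!/bin/bash
#SBATCH -N 2                    # two nodes: n001 (12 cores) and n002 (20 cores)
#SBATCH --hint=compute_bound    # one task per core, as I understood the option
# Intended placement: 12 tasks on n001 and 20 tasks on n002 (32 total)
mpirun ./my_app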
I can do it with the low-level mpirun -machinefile command, where the
machinefile contains
n008:20
n001:12
However, Slurm seems to overrule this information.
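Concretely, the working low-level invocation looks something like this
(32 = 20 + 12 processes; my_app and the file name machinefile again stand
in for the real binary and file):

mpirun -machinefile machinefile -np 32 ./my_app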
Regards
Andrew Petersen
P.S. The output of scontrol show config is:
Configuration data as of 2014-07-17T18:51:39
AccountingStorageBackupHost = (null)
AccountingStorageEnforce = none
AccountingStorageHost = localhost
AccountingStorageLoc = N/A
AccountingStoragePort = 6819
AccountingStorageType = accounting_storage/slurmdbd
AccountingStorageUser = N/A
AccountingStoreJobComment = YES
AuthType = auth/munge
BackupAddr = (null)
BackupController = (null)
BatchStartTimeout = 10 sec
BOOT_TIME = 2014-06-19T10:53:14
CacheGroups = 0
CheckpointType = checkpoint/none
ClusterName = slurm_cluster
CompleteWait = 0 sec
ControlAddr = fission
ControlMachine = fission
CryptoType = crypto/munge
DebugFlags = (null)
DefMemPerNode = UNLIMITED
DisableRootJobs = NO
EnforcePartLimits = NO
Epilog = (null)
EpilogMsgTime = 2000 usec
EpilogSlurmctld = (null)
FastSchedule = 0
FirstJobId = 1
GetEnvTimeout = 2 sec
GresTypes = gpu
GroupUpdateForce = 0
GroupUpdateTime = 600 sec
HASH_VAL = Different Ours=0x2e2a4b6a Slurmctld=0xd9296c09
HealthCheckInterval = 0 sec
HealthCheckProgram = (null)
InactiveLimit = 0 sec
JobAcctGatherFrequency = 30 sec
JobAcctGatherType = jobacct_gather/linux
JobCheckpointDir = /var/slurm/checkpoint
JobCompHost = localhost
JobCompLoc = /var/log/slurm_jobcomp.log
JobCompPort = 0
JobCompType = jobcomp/none
JobCompUser = root
JobCredentialPrivateKey = (null)
JobCredentialPublicCertificate = (null)
JobFileAppend = 0
JobRequeue = 1
JobSubmitPlugins = (null)
KillOnBadExit = 0
KillWait = 30 sec
Licenses = (null)
MailProg = /bin/mail
MaxJobCount = 10000
MaxJobId = 4294901760
MaxMemPerNode = UNLIMITED
MaxStepCount = 40000
MaxTasksPerNode = 128
MessageTimeout = 10 sec
MinJobAge = 300 sec
MpiDefault = none
MpiParams = (null)
NEXT_JOB_ID = 45294
OverTimeLimit = 0 min
PluginDir = /cm/shared/apps/slurm/2.3.4/lib64/slurm
PlugStackConfig = /etc/slurm/plugstack.conf
PreemptMode = OFF
PreemptType = preempt/none
PriorityType = priority/basic
PrivateData = none
ProctrackType = proctrack/pgid
Prolog = (null)
PrologSlurmctld = /cm/local/apps/cmd/scripts/prolog
PropagatePrioProcess = 0
PropagateResourceLimits = ALL
PropagateResourceLimitsExcept = (null)
ResumeProgram = (null)
ResumeRate = 300 nodes/min
ResumeTimeout = 60 sec
ResvOverRun = 0 min
ReturnToService = 2
SallocDefaultCommand = (null)
SchedulerParameters = (null)
SchedulerPort = 7321
SchedulerRootFilter = 1
SchedulerTimeSlice = 30 sec
SchedulerType = sched/backfill
SelectType = select/linear
SelectTypeParameters = CR_CPU
SlurmUser = slurm(117)
SlurmctldDebug = 3
SlurmctldLogFile = /var/log/slurmctld
SlurmSchedLogFile = (null)
SlurmctldPort = 6817
SlurmctldTimeout = 600 sec
SlurmdDebug = 3
SlurmdLogFile = /var/log/slurmd
SlurmdPidFile = /var/run/slurmd.pid
SlurmdPort = 6818
SlurmdSpoolDir = /cm/local/apps/slurm/var/spool
SlurmdTimeout = 600 sec
SlurmdUser = root(0)
SlurmSchedLogLevel = 0
SlurmctldPidFile = /var/run/slurmctld.pid
SLURM_CONF = /etc/slurm/slurm.conf
SLURM_VERSION = 2.3.4
SrunEpilog = (null)
SrunProlog = (null)
StateSaveLocation = /cm/shared/apps/slurm/var/cm/statesave
SuspendExcNodes = (null)
SuspendExcParts = (null)
SuspendProgram = (null)
SuspendRate = 60 nodes/min
SuspendTime = NONE
SuspendTimeout = 30 sec
SwitchType = switch/none
TaskEpilog = (null)
TaskPlugin = task/none
TaskPluginParam = (null type)
TaskProlog = (null)
TmpFS = /tmp
TopologyPlugin = topology/none
TrackWCKey = 0
TreeWidth = 50
UsePam = 0
UnkillableStepProgram = (null)
UnkillableStepTimeout = 60 sec
VSizeFactor = 0 percent
WaitTime = 0 sec
Slurmctld(primary/backup) at fission/(NULL) are UP/DOWN