Some notes about PBS
References
Job submission
Commands
Installing and checking
Manual:
Troubleshoot
Initial error
acl_users
(listed in the pbs_server_attributes man page) does not seem to be used.
authorized_users
is used instead in the config file:
[root@itbv-pbs ~]# qmgr -c 'set server authorized_users+=uc3@itb*.mwt2.org'
[root@itbv-pbs ~]# qmgr -c 'set server authorized_users+=uc3@vtb*.mwt2.org'
[root@itbv-pbs ~]# qmgr -c 'set server authorized_users+=uc3@vtb*.uchicago.edu'
[root@itbv-pbs ~]# qmgr -c 'set server authorized_users+=uc3@itb*.uchicago.edu'
wrong syntax (illegal -W value)
Known bug (now fixed in blahp), caused by the -W stagein syntax differing between PBS versions:
Correct syntax (to use in qsub script) is:
-W stagein=\'file1@host1:src1,file2@host2:src2\'
qstat examples
Checked PBS info in manual and online
Check also my troubleshooting page from earlier campus grid jobs
Examples:
***** Start *****
*** qstat
Job id Name User Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
279529.itbv-pbs job1-notr.sub marco 00:00:00 C short
279530.itbv-pbs bl_c22fc32fb6c8 marco 0 Q short
279531.itbv-pbs job1-notr.sub marco 0 Q short
279532.itbv-pbs bl_f60ab6e8d142 marco 0 Q short
*** qstat -q
server: itbv-pbs.mwt2.org
Queue Memory CPU Time Walltime Node Run Que Lm State
---------------- ------ -------- -------- ---- --- --- -- -----
sl6 -- -- 96:00:00 -- 0 0 16 E ^@
htpc -- -- 48:00:00 -- 0 0 2 E R
long -- -- 96:00:00 -- 0 0 -- E R
short -- -- 96:00:00 -- 0 3 -- E R
default -- -- -- -- 0 0 -- E R
----- -----
0 3
*** qstat -Q
Queue Max Tot Ena Str Que Run Hld Wat Trn Ext T
---------------- --- --- --- --- --- --- --- --- --- --- -
sl6 16 0 yes no 0 0 0 0 0 0 E
htpc 2 0 yes yes 0 0 0 0 0 0 E
long 0 0 yes yes 0 0 0 0 0 0 E
short 0 4 yes yes 3 0 0 0 0 0 E
default 0 0 yes yes 0 0 0 0 0 0 R
*** qstat -B
Server Max Tot Que Run Hld Wat Trn Ext Status
---------------- --- --- --- --- --- --- --- --- ----------
itbv-pbs.mwt2.or 0 4 3 0 0 0 0 0 Idle
*** qstat -a
itbv-pbs.mwt2.org:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
-------------------- -------- -------- ---------------- ------ ----- --- ------ ----- - -----
279529.itbv-pbs. marco short job1-notr.sub 14819 1 1 -- 72:00 C 00:00
279530.itbv-pbs. marco short bl_c22fc32fb6c8 -- 1 1 -- 72:00 Q --
279531.itbv-pbs. marco short job1-notr.sub -- 1 1 -- 72:00 Q --
279532.itbv-pbs. marco short bl_f60ab6e8d142 -- 1 1 -- 72:00 Q --
*** qstat -R
itbv-pbs.mwt2.org:
Req'd Req'd Elap
Job ID Username Queue NDS TSK Memory Time S Time BIG FAST PFS
-------------------- -------- -------- ----- --- ------ ----- - ----- ----- ----- -----
279529.itbv-pbs. marco short 1 1 -- 72:00 C 00:00 -- -- --
279530.itbv-pbs. marco short 1 1 -- 72:00 Q -- -- -- --
279531.itbv-pbs. marco short 1 1 -- 72:00 Q -- -- -- --
279532.itbv-pbs. marco short 1 1 -- 72:00 Q -- -- -- --
*** qstat -al
itbv-pbs.mwt2.org:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
-------------------- -------- -------- ---------------- ------ ----- --- ------ ----- - -----
279529.itbv-pbs. marco short job1-notr.sub 14819 1 1 -- 72:00 C 00:00
279530.itbv-pbs. marco short bl_c22fc32fb6c8 -- 1 1 -- 72:00 Q --
279531.itbv-pbs. marco short job1-notr.sub -- 1 1 -- 72:00 Q --
279532.itbv-pbs. marco short bl_f60ab6e8d142 -- 1 1 -- 72:00 Q --
*** qstat -f1
Job Id: 279529.itbv-pbs.mwt2.org
Job_Name = job1-notr.sub
Job_Owner = marco@itbv-ce-pbs.uchicago.edu
resources_used.cput = 00:00:00
resources_used.mem = 0kb
resources_used.vmem = 0kb
resources_used.walltime = 00:00:00
job_state = C
queue = short
server = itbv-pbs.mwt2.org
Checkpoint = u
ctime = Mon Mar 5 16:45:11 2012
Error_Path = itbv-ce-pbs.uchicago.edu:/share/home/marco/job1-notr.sub.e279529
exec_host = itb-cloud.mwt2.org/0
exec_port = 15003
Hold_Types = n
Join_Path = n
Keep_Files = n
Mail_Points = a
mtime = Mon Mar 5 16:45:19 2012
Output_Path = itbv-ce-pbs.uchicago.edu:/share/home/marco/job1-notr.sub.o279529
Priority = 0
qtime = Mon Mar 5 16:45:11 2012
Rerunable = True
Resource_List.ncpus = 1
Resource_List.nodect = 1
Resource_List.nodes = 1
Resource_List.walltime = 72:00:00
session_id = 14819
Variable_List = PBS_O_QUEUE=default,PBS_O_HOST=itbv-ce-pbs.uchicago.edu,PBS_O_HOME=/home/marco,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/mail/marco,PBS_O_SHELL=/bin/bash,PBS_SERVER=itbv-pbs.mwt2.org,PBS_O_WORKDIR=/share/home/marco
etime = Mon Mar 5 16:45:11 2012
exit_status = 0
submit_args = /home/marco/pbstest/job1-notr.sub
start_time = Mon Mar 5 16:45:19 2012
start_count = 1
fault_tolerant = False
comp_time = Mon Mar 5 16:45:19 2012
submit_host = itbv-ce-pbs.uchicago.edu
init_work_dir = /share/home/marco
Job Id: 279530.itbv-pbs.mwt2.org
Job_Name = bl_c22fc32fb6c8
Job_Owner = marco@itbv-ce-pbs.uchicago.edu
job_state = Q
queue = short
server = itbv-pbs.mwt2.org
Checkpoint = u
ctime = Mon Mar 5 16:45:20 2012
Error_Path = itbv-ce-pbs.uchicago.edu:/dev/null
Hold_Types = n
Join_Path = n
Keep_Files = n
Mail_Points = n
mtime = Mon Mar 5 16:45:20 2012
Output_Path = itbv-ce-pbs.uchicago.edu:/dev/null
Priority = 0
qtime = Mon Mar 5 16:45:20 2012
Rerunable = True
Resource_List.ncpus = 1
Resource_List.nodect = 1
Resource_List.nodes = 1
Resource_List.walltime = 72:00:00
Shell_Path_List = /bin/bash
stageout = err_bl_c22fc32fb6c8_pbs-hostname3.err.7-0@itbv-ce-pbs.uchicago.edu:/share/home/marco/condor-tutorial/pbs-hostname3.err.7-0
Variable_List = PBS_O_QUEUE=default,PBS_O_HOST=itbv-ce-pbs.uchicago.edu,PBS_O_HOME=/home/marco,PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/spool/mail/marco,PBS_O_SHELL=/bin/bash,PBS_SERVER=itbv-pbs.mwt2.org,PBS_O_WORKDIR=/share/home/marco/condor-tutorial
etime = Mon Mar 5 16:45:20 2012
submit_args = /tmp/bl_c22fc32fb6c8
fault_tolerant = False
submit_host = itbv-ce-pbs.uchicago.edu
init_work_dir = /share/home/marco/condor-tutorial
Job Id: 279531.itbv-pbs.mwt2.org
Job_Name = job1-notr.sub
Job_Owner = marco@itbv-ce-pbs.uchicago.edu
job_state = Q
queue = short
server = itbv-pbs.mwt2.org
Checkpoint = u
ctime = Mon Mar 5 16:45:21 2012
Error_Path = itbv-ce-pbs.uchicago.edu:/share/home/marco/job1-notr.sub.e279531
Hold_Types = n
Join_Path = n
Keep_Files = n
Mail_Points = a
mtime = Mon Mar 5 16:45:21 2012
Output_Path = itbv-ce-pbs.uchicago.edu:/share/home/marco/job1-notr.sub.o279531
Priority = 0
qtime = Mon Mar 5 16:45:21 2012
Rerunable = True
Resource_List.ncpus = 1
Resource_List.nodect = 1
Resource_List.nodes = 1
Resource_List.walltime = 72:00:00
Variable_List = PBS_O_QUEUE=default,PBS_O_HOST=itbv-ce-pbs.uchicago.edu,PBS_O_HOME=/home/marco,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/mail/marco,PBS_O_SHELL=/bin/bash,PBS_SERVER=itbv-pbs.mwt2.org,PBS_O_WORKDIR=/share/home/marco
etime = Mon Mar 5 16:45:21 2012
submit_args = /home/marco/pbstest/job1-notr.sub
fault_tolerant = False
submit_host = itbv-ce-pbs.uchicago.edu
init_work_dir = /share/home/marco
Job Id: 279532.itbv-pbs.mwt2.org
Job_Name = bl_f60ab6e8d142
Job_Owner = marco@itbv-ce-pbs.uchicago.edu
job_state = Q
queue = short
server = itbv-pbs.mwt2.org
Checkpoint = u
ctime = Mon Mar 5 16:45:25 2012
Error_Path = itbv-ce-pbs.uchicago.edu:/dev/null
Hold_Types = n
Join_Path = n
Keep_Files = n
Mail_Points = n
mtime = Mon Mar 5 16:45:25 2012
Output_Path = itbv-ce-pbs.uchicago.edu:/dev/null
Priority = 0
qtime = Mon Mar 5 16:45:25 2012
Rerunable = True
Resource_List.ncpus = 1
Resource_List.nodect = 1
Resource_List.nodes = 1
Resource_List.walltime = 72:00:00
Shell_Path_List = /bin/bash
stageout = err_bl_f60ab6e8d142_pbs-hostname3.err.8-0@itbv-ce-pbs.uchicago.edu:/share/home/marco/condor-tutorial/pbs-hostname3.err.8-0
Variable_List = PBS_O_QUEUE=default,PBS_O_HOST=itbv-ce-pbs.uchicago.edu,PBS_O_HOME=/home/marco,PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/spool/mail/marco,PBS_O_SHELL=/bin/bash,PBS_SERVER=itbv-pbs.mwt2.org,PBS_O_WORKDIR=/share/home/marco/condor-tutorial
etime = Mon Mar 5 16:45:25 2012
submit_args = /tmp/bl_f60ab6e8d142
fault_tolerant = False
submit_host = itbv-ce-pbs.uchicago.edu
init_work_dir = /share/home/marco/condor-tutorial
Example 2:
qstat -Qf
Queue: sl6
queue_type = Execution
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
max_running = 16
from_route_only = False
resources_max.ncpus = 16
resources_max.walltime = 96:00:00
resources_min.ncpus = 1
resources_min.nodect = 1
resources_default.walltime = 24:00:00
mtime = 1330956846
enabled = True
Queue: htpc
queue_type = Execution
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
max_running = 2
from_route_only = True
resources_max.ncpus = 16
resources_max.walltime = 48:00:00
resources_min.ncpus = 4
resources_min.nodect = 1
resources_default.walltime = 24:00:00
mtime = 1330956846
enabled = True
started = True
Queue: long
queue_type = Execution
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
from_route_only = True
resources_max.walltime = 96:00:00
resources_min.ncpus = 1
resources_min.nodect = 1
resources_min.walltime = 12:01:00
resources_default.ncpus = 1
resources_default.nodes = 1
mtime = 1330956846
enabled = True
started = True
Queue: short
queue_type = Execution
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
from_route_only = True
resources_max.walltime = 96:00:00
resources_min.ncpus = 1
resources_min.nodect = 1
resources_default.ncpus = 1
resources_default.nodes = 1
resources_default.walltime = 72:00:00
mtime = 1330956846
resources_assigned.ncpus = 0
resources_assigned.nodect = 0
enabled = True
started = True
Queue: default
queue_type = Route
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
acl_host_enable = True
acl_hosts = itbv-pbs.*,itbv-ce-pbs.*,vtbv-ce.*,itb4.*,itb3.*,itb2.*
mtime = 1330956846
route_destinations = short,long,htpc
enabled = True
started = True
All Job attributes:
Job Id: 279532.itbv-pbs.mwt2.org
Job_Name = bl_f60ab6e8d142
Job_Owner = marco@itbv-ce-pbs.uchicago.edu
job_state = Q
queue = short
server = itbv-pbs.mwt2.org
Checkpoint = u
ctime = Mon Mar 5 16:45:25 2012
Error_Path = itbv-ce-pbs.uchicago.edu:/dev/null
Hold_Types = n
Join_Path = n
Keep_Files = n
Mail_Points = n
mtime = Mon Mar 5 16:45:25 2012
Output_Path = itbv-ce-pbs.uchicago.edu:/dev/null
Priority = 0
qtime = Mon Mar 5 16:45:25 2012
Rerunable = True
Resource_List.ncpus = 1
Resource_List.nodect = 1
Resource_List.nodes = 1
Resource_List.walltime = 72:00:00
Shell_Path_List = /bin/bash
stageout = err_bl_f60ab6e8d142_pbs-hostname3.err.8-0@itbv-ce-pbs.uchicago.edu:/share/home/marco/condor-tutorial/pbs-hostname3.err.8-0
Variable_List = PBS_O_QUEUE=default,PBS_O_HOST=itbv-ce-pbs.uchicago.edu,PBS_O_HOME=/home/marco,PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/spool/mail/marco,PBS_O_SHELL=/bin/bash,PBS_SERVER=itbv-pbs.mwt2.org,PBS_O_WORKDIR=/share/home/marco/condor-tutorial
etime = Mon Mar 5 16:45:25 2012
submit_args = /tmp/bl_f60ab6e8d142
fault_tolerant = False
submit_host = itbv-ce-pbs.uchicago.edu
init_work_dir = /share/home/marco/condor-tutorial
Queue attributes:
Queue: short / default
queue_type = Execution / Route
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
mtime = 1330956846
enabled = True
started = True
# for execution:
from_route_only = True
resources_max.walltime = 96:00:00
resources_min.ncpus = 1
resources_min.nodect = 1
resources_default.ncpus = 1
resources_default.nodes = 1
resources_default.walltime = 72:00:00
resources_assigned.ncpus = 0
resources_assigned.nodect = 0
# for route:
acl_host_enable = True
acl_hosts = itbv-pbs.*,itbv-ce-pbs.*,vtbv-ce.*,itb4.*,itb3.*,itb2.*
route_destinations = short,long,htpc
Summary:
--
MarcoMambelli - 05 Mar 2012