Some notes about PBS


Job submission Commands Installing and checking Manual:


User unable to submit from a specific node (not the headnode)

Initial error: acl_users (listed in the pbs_server_attributes man page) does not seem to be used. authorized_users is used instead in the config file:
[root@itbv-pbs ~]# qmgr -c 'set server authorized_users+=uc3@itb*'
[root@itbv-pbs ~]# qmgr -c 'set server authorized_users+=uc3@vtb*'
[root@itbv-pbs ~]# qmgr -c 'set server authorized_users+=uc3@vtb*'
[root@itbv-pbs ~]# qmgr -c 'set server authorized_users+=uc3@itb*'

wrong syntax (illegal -W value)

Known bug fixed in blahp due to different syntax in different PBS versions: Correct syntax (to use in qsub script) is:
-W stagein=\'file1@host1:src1,file2@host2:src2\'

qstat examples

Checked PBS info in manual and online

Check also my troubleshooting page from earlier campus grid jobs

***** Start *****
*** qstat
Job id                    Name             User            Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
279529.itbv-pbs            job1-notr.sub    marco           00:00:00 C short          
279530.itbv-pbs            bl_c22fc32fb6c8  marco                  0 Q short          
279531.itbv-pbs            job1-notr.sub    marco                  0 Q short          
279532.itbv-pbs            bl_f60ab6e8d142  marco                  0 Q short          
*** qstat -q


Queue            Memory CPU Time Walltime Node  Run Que Lm  State
---------------- ------ -------- -------- ----  --- --- --  -----
sl6                --      --    96:00:00   --    0   0 16   E S
htpc               --      --    48:00:00   --    0   0  2   E R
long               --      --    96:00:00   --    0   0 --   E R
short              --      --    96:00:00   --    0   3 --   E R
default            --      --       --      --    0   0 --   E R
                                               ----- -----
                                                   0     3
*** qstat -Q
Queue              Max   Tot   Ena   Str   Que   Run   Hld   Wat   Trn   Ext T         
----------------   ---   ---   ---   ---   ---   ---   ---   ---   ---   --- -         
sl6                 16     0   yes    no     0     0     0     0     0     0 E         
htpc                 2     0   yes   yes     0     0     0     0     0     0 E         
long                 0     0   yes   yes     0     0     0     0     0     0 E         
short                0     4   yes   yes     3     0     0     0     0     0 E         
default              0     0   yes   yes     0     0     0     0     0     0 R         
*** qstat -B
Server             Max   Tot   Que   Run   Hld   Wat   Trn   Ext Status    
----------------   ---   ---   ---   ---   ---   ---   ---   --- ----------
itbv-pbs.mwt2.or     0     4     3     0     0     0     0     0 Idle      
*** qstat -a 
                                                                         Req'd  Req'd   Elap
Job ID               Username Queue    Jobname          SessID NDS   TSK Memory Time  S Time
-------------------- -------- -------- ---------------- ------ ----- --- ------ ----- - -----
279529.itbv-pbs.     marco    short    job1-notr.sub     14819     1   1    --  72:00 C 00:00
279530.itbv-pbs.     marco    short    bl_c22fc32fb6c8     --      1   1    --  72:00 Q   -- 
279531.itbv-pbs.     marco    short    job1-notr.sub       --      1   1    --  72:00 Q   -- 
279532.itbv-pbs.     marco    short    bl_f60ab6e8d142     --      1   1    --  72:00 Q   -- 
*** qstat -R 
                                                 Req'd  Req'd   Elap 
Job ID               Username Queue    NDS   TSK Memory Time  S Time   BIG  FAST   PFS
-------------------- -------- -------- ----- --- ------ ----- - ----- ----- ----- -----
279529.itbv-pbs.     marco    short        1   1    --  72:00 C 00:00   --    --    -- 
279530.itbv-pbs.     marco    short        1   1    --  72:00 Q   --    --    --    -- 
279531.itbv-pbs.     marco    short        1   1    --  72:00 Q   --    --    --    -- 
279532.itbv-pbs.     marco    short        1   1    --  72:00 Q   --    --    --    -- 
*** qstat -al 
                                                                         Req'd  Req'd   Elap
Job ID               Username Queue    Jobname          SessID NDS   TSK Memory Time  S Time
-------------------- -------- -------- ---------------- ------ ----- --- ------ ----- - -----
279529.itbv-pbs.     marco    short    job1-notr.sub     14819     1   1    --  72:00 C 00:00
279530.itbv-pbs.     marco    short    bl_c22fc32fb6c8     --      1   1    --  72:00 Q   -- 
279531.itbv-pbs.     marco    short    job1-notr.sub       --      1   1    --  72:00 Q   -- 
279532.itbv-pbs.     marco    short    bl_f60ab6e8d142     --      1   1    --  72:00 Q   -- 
*** qstat -f1
Job Id:
    Job_Name = job1-notr.sub
    Job_Owner =
    resources_used.cput = 00:00:00
    resources_used.mem = 0kb
    resources_used.vmem = 0kb
    resources_used.walltime = 00:00:00
    job_state = C
    queue = short
    server =
    Checkpoint = u
    ctime = Mon Mar  5 16:45:11 2012
    Error_Path =
    exec_host =
    exec_port = 15003
    Hold_Types = n
    Join_Path = n
    Keep_Files = n
    Mail_Points = a
    mtime = Mon Mar  5 16:45:19 2012
    Output_Path =
    Priority = 0
    qtime = Mon Mar  5 16:45:11 2012
    Rerunable = True
    Resource_List.ncpus = 1
    Resource_List.nodect = 1
    Resource_List.nodes = 1
    Resource_List.walltime = 72:00:00
    session_id = 14819
    Variable_List = PBS_O_QUEUE=default,,PBS_O_HOME=/home/marco,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/mail/marco,PBS_O_SHELL=/bin/bash,,PBS_O_WORKDIR=/share/home/marco
    etime = Mon Mar  5 16:45:11 2012
    exit_status = 0
    submit_args = /home/marco/pbstest/job1-notr.sub
    start_time = Mon Mar  5 16:45:19 2012
    start_count = 1
    fault_tolerant = False
    comp_time = Mon Mar  5 16:45:19 2012
    submit_host =
    init_work_dir = /share/home/marco

Job Id:
    Job_Name = bl_c22fc32fb6c8
    Job_Owner =
    job_state = Q
    queue = short
    server =
    Checkpoint = u
    ctime = Mon Mar  5 16:45:20 2012
    Error_Path =
    Hold_Types = n
    Join_Path = n
    Keep_Files = n
    Mail_Points = n
    mtime = Mon Mar  5 16:45:20 2012
    Output_Path =
    Priority = 0
    qtime = Mon Mar  5 16:45:20 2012
    Rerunable = True
    Resource_List.ncpus = 1
    Resource_List.nodect = 1
    Resource_List.nodes = 1
    Resource_List.walltime = 72:00:00
    Shell_Path_List = /bin/bash
    stageout =
    Variable_List = PBS_O_QUEUE=default,,PBS_O_HOME=/home/marco,PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/spool/mail/marco,PBS_O_SHELL=/bin/bash,,PBS_O_WORKDIR=/share/home/marco/condor-tutorial
    etime = Mon Mar  5 16:45:20 2012
    submit_args = /tmp/bl_c22fc32fb6c8
    fault_tolerant = False
    submit_host =
    init_work_dir = /share/home/marco/condor-tutorial

Job Id:
    Job_Name = job1-notr.sub
    Job_Owner =
    job_state = Q
    queue = short
    server =
    Checkpoint = u
    ctime = Mon Mar  5 16:45:21 2012
    Error_Path =
    Hold_Types = n
    Join_Path = n
    Keep_Files = n
    Mail_Points = a
    mtime = Mon Mar  5 16:45:21 2012
    Output_Path =
    Priority = 0
    qtime = Mon Mar  5 16:45:21 2012
    Rerunable = True
    Resource_List.ncpus = 1
    Resource_List.nodect = 1
    Resource_List.nodes = 1
    Resource_List.walltime = 72:00:00
    Variable_List = PBS_O_QUEUE=default,,PBS_O_HOME=/home/marco,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/mail/marco,PBS_O_SHELL=/bin/bash,,PBS_O_WORKDIR=/share/home/marco
    etime = Mon Mar  5 16:45:21 2012
    submit_args = /home/marco/pbstest/job1-notr.sub
    fault_tolerant = False
    submit_host =
    init_work_dir = /share/home/marco

Job Id:
    Job_Name = bl_f60ab6e8d142
    Job_Owner =
    job_state = Q
    queue = short
    server =
    Checkpoint = u
    ctime = Mon Mar  5 16:45:25 2012
    Error_Path =
    Hold_Types = n
    Join_Path = n
    Keep_Files = n
    Mail_Points = n
    mtime = Mon Mar  5 16:45:25 2012
    Output_Path =
    Priority = 0
    qtime = Mon Mar  5 16:45:25 2012
    Rerunable = True
    Resource_List.ncpus = 1
    Resource_List.nodect = 1
    Resource_List.nodes = 1
    Resource_List.walltime = 72:00:00
    Shell_Path_List = /bin/bash
    stageout =
    Variable_List = PBS_O_QUEUE=default,,PBS_O_HOME=/home/marco,PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/spool/mail/marco,PBS_O_SHELL=/bin/bash,,PBS_O_WORKDIR=/share/home/marco/condor-tutorial
    etime = Mon Mar  5 16:45:25 2012
    submit_args = /tmp/bl_f60ab6e8d142
    fault_tolerant = False
    submit_host =
    init_work_dir = /share/home/marco/condor-tutorial

Example 2:
qstat -Qf
Queue: sl6
    queue_type = Execution
    total_jobs = 0
    state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 
    max_running = 16
    from_route_only = False
    resources_max.ncpus = 16
    resources_max.walltime = 96:00:00
    resources_min.ncpus = 1
    resources_min.nodect = 1
    resources_default.walltime = 24:00:00
    mtime = 1330956846
    enabled = True

Queue: htpc
    queue_type = Execution
    total_jobs = 0
    state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 
    max_running = 2
    from_route_only = True
    resources_max.ncpus = 16
    resources_max.walltime = 48:00:00
    resources_min.ncpus = 4
    resources_min.nodect = 1
    resources_default.walltime = 24:00:00
    mtime = 1330956846
    enabled = True
    started = True

Queue: long
    queue_type = Execution
    total_jobs = 0
    state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 
    from_route_only = True
    resources_max.walltime = 96:00:00
    resources_min.ncpus = 1
    resources_min.nodect = 1
    resources_min.walltime = 12:01:00
    resources_default.ncpus = 1
    resources_default.nodes = 1
    mtime = 1330956846
    enabled = True
    started = True

Queue: short
    queue_type = Execution
    total_jobs = 0
    state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 
    from_route_only = True
    resources_max.walltime = 96:00:00
    resources_min.ncpus = 1
    resources_min.nodect = 1
    resources_default.ncpus = 1
    resources_default.nodes = 1
    resources_default.walltime = 72:00:00
    mtime = 1330956846
    resources_assigned.ncpus = 0
    resources_assigned.nodect = 0
    enabled = True
    started = True

Queue: default
    queue_type = Route
    total_jobs = 0
    state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 
    acl_host_enable = True
    acl_hosts = itbv-pbs.*,itbv-ce-pbs.*,vtbv-ce.*,itb4.*,itb3.*,itb2.*
    mtime = 1330956846
    route_destinations = short,long,htpc
    enabled = True
    started = True

All Job attributes:
Job Id:
    Job_Name = bl_f60ab6e8d142
    Job_Owner =
    job_state = Q
    queue = short
    server =
    Checkpoint = u
    ctime = Mon Mar  5 16:45:25 2012
    Error_Path =
    Hold_Types = n
    Join_Path = n
    Keep_Files = n
    Mail_Points = n
    mtime = Mon Mar  5 16:45:25 2012
    Output_Path =
    Priority = 0
    qtime = Mon Mar  5 16:45:25 2012
    Rerunable = True
    Resource_List.ncpus = 1
    Resource_List.nodect = 1
    Resource_List.nodes = 1
    Resource_List.walltime = 72:00:00
    Shell_Path_List = /bin/bash
    stageout =
    Variable_List = PBS_O_QUEUE=default,,PBS_O_HOME=/home/marco,PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=marco,PBS_O_PATH=/usr/local/bin:/bin:/usr/bin,PBS_O_MAIL=/var/spool/mail/marco,PBS_O_SHELL=/bin/bash,,PBS_O_WORKDIR=/share/home/marco/condor-tutorial
    etime = Mon Mar  5 16:45:25 2012
    submit_args = /tmp/bl_f60ab6e8d142
    fault_tolerant = False
    submit_host =
    init_work_dir = /share/home/marco/condor-tutorial

Queue attributes:
Queue: short / default
    queue_type = Execution / Route
    total_jobs = 0
    state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 
    mtime = 1330956846
    enabled = True
    started = True
# for execution:
    from_route_only = True
    resources_max.walltime = 96:00:00
    resources_min.ncpus = 1
    resources_min.nodect = 1
    resources_default.ncpus = 1
    resources_default.nodes = 1
    resources_default.walltime = 72:00:00
    resources_assigned.ncpus = 0
    resources_assigned.nodect = 0
# for route:
    acl_host_enable = True
    acl_hosts = itbv-pbs.*,itbv-ce-pbs.*,vtbv-ce.*,itb4.*,itb3.*,itb2.*
    route_destinations = short,long,htpc


-- MarcoMambelli - 05 Mar 2012

This topic: Main > WikiUsers > MarcoMambelli > MarcoWorkPages > MarcoOSG > MarcoBoscoNotes > MarcoPbsNotes
Topic revision: 14 Mar 2012, MarcoMambelli
This site is powered by Foswiki. Copyright © by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding Foswiki? Send feedback