1
/*****************************************************************************\
2
* sacct_stat.c - stat slurmd for precise job information
4
* $Id: options.c 7541 2006-03-18 01:44:58Z da $
5
*****************************************************************************
6
* Copyright (C) 2006 The Regents of the University of California.
7
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8
* Written by Danny Auble <da@llnl.gov>.
11
* This file is part of SLURM, a resource management program.
12
* For details, see <http://www.llnl.gov/linux/slurm/>.
14
* SLURM is free software; you can redistribute it and/or modify it under
15
* the terms of the GNU General Public License as published by the Free
16
* Software Foundation; either version 2 of the License, or (at your option)
19
* In addition, as a special exception, the copyright holders give permission
20
* to link the code of portions of this program with the OpenSSL library under
21
* certain conditions as described in each individual source file, and
22
* distribute linked combinations including the two. You must obey the GNU
23
* General Public License in all respects for all of the code used other than
24
* OpenSSL. If you modify file(s) with this exception, you may extend this
25
* exception to your version of the file(s), but you are not obligated to do
26
* so. If you do not wish to do so, delete this exception statement from your
27
* version. If you delete this exception statement from all source files in
28
* the program, then also delete it here.
30
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
31
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
35
* You should have received a copy of the GNU General Public License along
36
* with SLURM; if not, write to the Free Software Foundation, Inc.,
37
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
38
\*****************************************************************************/
42
#include "src/common/forward.h"
43
#include "src/common/slurm_auth.h"
45
/* File-level accounting record for the step being queried: _sacct_query()
 * fills it in and _process_results() hands it to print_fields(). */
jobacct_step_rec_t step;
49
/* NOTE(review): no definition of _stat_thread() is visible in this view of
 * the file; confirm the declaration is still needed. */
void *_stat_thread(void *args);
50
/* Forward declarations for the helpers defined further down.
 * NOTE(review): the continuation of the _sacct_query() prototype is not
 * visible in this view. */
int _sacct_query(slurm_step_layout_t *step_layout, uint32_t job_id,
52
int _process_results();
55
/*
 * _sacct_query - send a MESSAGE_STAT_JOBACCT request to the nodes named in
 * step_layout->node_list and fold every MESSAGE_STAT_JOBACCT reply into the
 * file-level `step` record (step.sacct).
 *
 * step_layout - layout of the step; supplies node_list and node_cnt
 * job_id      - job being queried (used in the debug message)
 * step_id     - stored in step.stepid; NOTE(review): its declaration in the
 *               parameter list is not visible in this view
 *
 * The only return visible here is SLURM_SUCCESS; the per-reply `rc` values
 * are logged with error() but not returned.
 *
 * NOTE(review): this listing has gaps (braces, several local declarations
 * and some statements are not visible), so the comments below describe only
 * the lines that are shown.
 */
int _sacct_query(slurm_step_layout_t *step_layout, uint32_t job_id,
60
stat_jobacct_msg_t *jobacct_msg = NULL;
64
ret_data_info_t *ret_data_info = NULL;
65
int rc = SLURM_SUCCESS;
68
/* NOTE(review): job_id is a uint32_t but is printed with %d; %u would match
 * the type (sacct_stat() already uses %u for the same value). */
debug("getting the stat of job %d on %d nodes",
69
job_id, step_layout->node_cnt);
71
/* Reset both the per-reply scratch record and the aggregate. min_cpu is
 * seeded with NO_VAL instead of 0 — presumably so the first real sample
 * replaces it when minima are combined; confirm in aggregate_sacct(). */
memset(&temp_sacct, 0, sizeof(sacct_t));
72
temp_sacct.min_cpu = (float)NO_VAL;
73
memset(&step.sacct, 0, sizeof(sacct_t));
74
step.sacct.min_cpu = (float)NO_VAL;
76
/* Describe the step in the global record. step.nodes aliases the layout's
 * node_list string; no copy is made here. */
step.stepid = step_id;
77
step.nodes = step_layout->node_list;
79
step.state = JOB_RUNNING;
80
slurm_msg_t_init(&msg);
81
/* Common message contents */
84
/* r.jobacct is created here and released with jobacct_gather_g_destroy()
 * just before the function returns. */
r.jobacct = jobacct_gather_g_create(NULL);
85
msg.msg_type = MESSAGE_STAT_JOBACCT;
89
/* Send the request to the step's node list and collect the replies into
 * ret_list. */
ret_list = slurm_send_recv_msgs(step_layout->node_list, &msg, 0, false);
91
/* NOTE(review): the condition guarding this error() call is not visible in
 * this view. */
error("got an error no list returned");
95
/* Walk the replies and dispatch on the message type of each one. */
itr = list_iterator_create(ret_list);
96
while((ret_data_info = list_next(itr))) {
97
switch (ret_data_info->type) {
98
/* Accounting data came back: convert it into temp_sacct, count its tasks
 * and merge it into the step aggregate. */
case MESSAGE_STAT_JOBACCT:
99
jobacct_msg = (stat_jobacct_msg_t *)
102
debug2("got it back for job %d",
103
jobacct_msg->job_id);
104
jobacct_gather_g_2_sacct(
106
jobacct_msg->jobacct);
107
ntasks += jobacct_msg->num_tasks;
108
aggregate_sacct(&step.sacct, &temp_sacct);
111
/* The reply was a plain return code: log it.
 * NOTE(review): no `break` is visible between the cases in this view;
 * confirm against the full file. */
case RESPONSE_SLURM_RC:
112
rc = slurm_get_return_code(ret_data_info->type,
113
ret_data_info->data);
114
error("there was an error with the request rc = %s",
118
/* Any other reply type: log the type together with its return code.
 * NOTE(review): the `default:` label is not visible in this view. */
rc = slurm_get_return_code(ret_data_info->type,
119
ret_data_info->data);
120
error("unknown return given %d rc = %s",
121
ret_data_info->type, slurm_strerror(rc));
125
list_iterator_destroy(itr);
126
list_destroy(ret_list);
132
/* Scale the memory figures by 1024 — presumably KB to bytes; TODO confirm
 * the units used by the accounting plugin. */
step.sacct.ave_rss *= 1024;
133
step.sacct.max_rss *= 1024;
134
step.sacct.ave_vsize *= 1024;
135
step.sacct.max_vsize *= 1024;
137
/* Turn the accumulated values into per-task averages, and divide the CPU
 * figures by 100 (presumably hundredths of a second to seconds — confirm).
 * NOTE(review): the assignment of tot_tasks and any guard against it being
 * zero are not visible in this view; confirm these divisions are protected. */
step.sacct.ave_cpu /= tot_tasks;
138
step.sacct.ave_cpu /= 100;
139
step.sacct.min_cpu /= 100;
140
step.sacct.ave_rss /= tot_tasks;
141
step.sacct.ave_vsize /= tot_tasks;
142
step.sacct.ave_pages /= tot_tasks;
144
jobacct_gather_g_destroy(r.jobacct);
145
return SLURM_SUCCESS;
148
/*
 * _process_results - print the file-level `step` record as a JOBSTEP entry
 * via print_fields(). Always returns SLURM_SUCCESS.
 */
int _process_results()
150
print_fields(JOBSTEP, &step);
151
return SLURM_SUCCESS;
154
/*
 * sacct_stat - ask the controller for the layout of step jobid.stepid
 * (REQUEST_STEP_LAYOUT) and, once a layout is in hand, pass it to
 * _sacct_query() to collect the step's statistics.
 *
 * jobid  - job to look up
 * stepid - step within that job
 *
 * NOTE(review): this listing has gaps (braces, the req_msg declaration, the
 * final return and some statements are not visible), so the comments below
 * describe only the lines that are shown.
 */
int sacct_stat(uint32_t jobid, uint32_t stepid)
157
slurm_msg_t resp_msg;
158
job_step_id_msg_t req;
159
slurm_step_layout_t *step_layout = NULL;
160
int rc = SLURM_SUCCESS;
162
slurm_msg_t_init(&req_msg);
163
slurm_msg_t_init(&resp_msg);
164
debug("requesting info for job %u.%u", jobid, stepid);
166
/* Build the layout request.
 * NOTE(review): only the step_id assignment is visible here; confirm that
 * req.job_id and req_msg.data are also set in the full file. */
req.step_id = stepid;
167
req_msg.msg_type = REQUEST_STEP_LAYOUT;
170
/* A negative result from the controller round-trip is treated as failure. */
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) {
174
/* Dispatch on the kind of reply the controller sent. */
switch (resp_msg.msg_type) {
175
/* Success: the payload is the step layout, which this function now holds
 * and destroys at the end. */
case RESPONSE_STEP_LAYOUT:
176
step_layout = (slurm_step_layout_t *)resp_msg.data;
178
/* The controller answered with a return code instead of a layout: extract
 * it, free the message, report it and bail out.
 * NOTE(review): this message goes to stdout via printf() while the other
 * failures in this file use error(); confirm that is intended. */
case RESPONSE_SLURM_RC:
179
rc = ((return_code_msg_t *) resp_msg.data)->return_code;
180
slurm_free_return_code_msg(resp_msg.data);
181
printf("problem getting job: %s\n", slurm_strerror(rc));
182
/* slurm_seterrno_ret() presumably sets the slurm errno and returns from the
 * function — confirm against its definition. */
slurm_seterrno_ret(rc);
185
slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
190
/* NOTE(review): the condition guarding this error() call is not visible in
 * this view. */
error("didn't get the job record rc = %s", slurm_strerror(rc));
194
/* NOTE(review): the return value of _sacct_query() is ignored here. */
_sacct_query(step_layout, jobid, stepid);
198
slurm_step_layout_destroy(step_layout);