source: branches/branch-1.8/zoo-project/zoo-services/utils/hpc/service.c @ 976

Last change on this file since 976 was 907, checked in by djay, 6 years ago

Fix typo. Define StdErr from the FinalizeHPC1 service. Make sure to call removeShmLock in case it has been created by the GetStatus service.

  • Property svn:keywords set to Id
File size: 14.0 KB
Line 
1/**
2 * Author : Gérald FENOY
3 *
4 * Copyright 2017 GeoLabs SARL. All rights reserved.
5 *
6 * This work was supported by public funds received in the framework of GEOSUD,
7 * a project (ANR-10-EQPX-20) of the program "Investissements d'Avenir" managed
8 * by the French National Research Agency
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29
30#include "service.h"
31#include "service_internal.h"
32#include "sshapi.h"
33#include "server_internal.h"
34
35#include <sys/socket.h>
36#include <sys/un.h>
37
38#include <libxml/tree.h>
39#include <libxml/parser.h>
40#include <libxml/xpath.h>
41#include <libxml/xpathInternals.h>
42
43#include <libxslt/xslt.h>
44#include <libxslt/xsltInternals.h>
45#include <libxslt/transform.h>
46#include <libxslt/xsltutils.h>
47
48#include <dirent.h>
49extern "C" {
50
51  /**
52   * FinalizeHPC ZOO Service :
53   * This service is used to inform a ZOO-Kernel waiting for the end of the
54   * execution of a HPC service
55   */
56  ZOO_DLL_EXPORT int FinalizeHPC(maps*& conf,maps*& inputs,maps*& outputs){
57    // Retrieve the jobid corresponding to the identifier generated by SLURM
58    // by reading the file generated when running the SBATCH file
59    map* jobid=getMapFromMaps(inputs,"jobid","value");
60    struct sockaddr_un addr;
61    char buf[100]="3";
62    int fd,rc=NULL;
63    int i=0;
64    map* usid=getMapFromMaps(conf,"lenv","usid");
65    map* tmpPath=getMapFromMaps(conf,"main","tmpPath");
66
67    char *flenv =
68      (char *) malloc ((strlen (tmpPath->value) + 
69                        strlen (jobid->value) + 12) * sizeof (char));
70    sprintf (flenv, "%s/%s_lenv.cfg", tmpPath->value, jobid->value);
71    maps* m = (maps *) malloc (MAPS_SIZE);
72    m->child=NULL;
73    m->next=NULL;
74    map* configId=NULL;
75
76   
77    if(conf_read(flenv, m) != 2){
78      configId=getMapFromMaps(m,"lenv","configId");
79      setMapInMaps(conf,"lenv","configId",configId->value);
80    }else{
81      setMapInMaps(conf,"lenv","message",_("Unable to read the lenv section file of the requested jobid"));
82      return SERVICE_FAILED;
83    }
84   
85    SSHCON *test=ssh_connect(conf);
86    /*if(test==NULL){
87      setMapInMaps(conf,"lenv","message",_("Unable to connect using ssh."));
88      return SERVICE_FAILED;
89      }*/
90
91    char *logPath=(char*)malloc((strlen(tmpPath->value)+strlen(jobid->value)+12)*sizeof(char));
92    sprintf(logPath,"%s/exec_out_%s",tmpPath->value,jobid->value);
93    struct stat f_status;
94    int ts=stat(logPath, &f_status);
95    char* fcontent = NULL;
96    if(ts==0) {
97      fcontent=(char*)malloc(sizeof(char)*(f_status.st_size+1));
98      FILE* f=fopen(logPath,"rb");
99      fread(fcontent,f_status.st_size,1,f);
100      int fsize=f_status.st_size;
101      fcontent[fsize]=0;
102      fclose(f);
103    }else{
104      setMapInMaps(conf,"lenv","message",_("No service with this jobid can be found"));
105      return SERVICE_FAILED;
106    }
107    free(logPath);
108    // Run scontrol to check if the service execution ended.
109    // Store all the informations returned by scontrol command as a cfg file to
110    // be parsed back by the ZOO-Kernel waiting for the execution of the remote
111    // service
112    maps* tmpMaps=createMaps("henv");
113    char* command=(char*)malloc((126)*sizeof(char));
114    sprintf(command,"scontrol show jobid | grep -A24 JobId=%s",fcontent);   
115    if(ssh_exec(conf,command,ssh_get_cnt(conf))==0){
116      free(command);
117      setMapInMaps(conf,"lenv","message",_("Failed to run scontrol remotely"));
118      // TODO: check status in db and if available continue in other case return SERVICE_FAILED
119      return SERVICE_FAILED;
120    }else{
121      free(command);
122      logPath=(char*)malloc((strlen(tmpPath->value)+strlen(usid->value)+11)*sizeof(char));
123      sprintf(logPath,"%s/exec_out_%s",tmpPath->value,usid->value);
124      int ts=stat(logPath, &f_status);
125      if(ts==0) {
126        fcontent=(char*)malloc(sizeof(char)*(f_status.st_size+1));
127        FILE* f=fopen(logPath,"rb");
128        fread(fcontent,f_status.st_size,1,f);
129        int fsize=f_status.st_size;
130        fcontent[fsize]=0;
131        fclose(f);
132        free(logPath);
133        char *token, *saveptr;
134        token = strtok_r (fcontent, " ", &saveptr);
135        while (token != NULL)
136          {
137            char *token1, *saveptr1;
138            char *tmpToken=strdup(token);
139            token1 = strtok_r (tmpToken, "=", &saveptr1);
140            int isNext=-1;
141            int hasTwoElements=0;
142            char *name=NULL;
143            while (token1 != NULL)
144              {
145                if(hasTwoElements==0)
146                  name=strdup(token1);
147                if(hasTwoElements<1)
148                  hasTwoElements+=1;
149                else{
150                  char *value=strdup(token1);
151                  if(value[strlen(value)-1]=='\n')
152                    value[strlen(value)-1]=0;
153                  if(strlen(name)>0 && strlen(value)>0){
154                    if(tmpMaps->content==NULL)
155                      tmpMaps->content=createMap(name,value);
156                    else
157                      addToMap(tmpMaps->content,name,value);
158                    free(value);
159                  }
160                  free(name);
161                  hasTwoElements=0;
162                }
163                token1 = strtok_r (NULL, "=", &saveptr1);
164              }
165            free(tmpToken);
166            token = strtok_r (NULL, " ", &saveptr);
167          }
168      }else{
169        setMapInMaps(conf,"lenv","message",_("Unable to access the downloaded execution log file"));
170        return SERVICE_FAILED;
171      }
172    }
173    logPath=(char*)malloc((strlen(tmpPath->value)+strlen(jobid->value)+15)*sizeof(char));
174    sprintf(logPath,"%s/exec_status_%s",tmpPath->value,jobid->value);
175    dumpMapsToFile(tmpMaps,logPath,0);
176    char *sname=(char*)malloc((strlen(tmpPath->value)+strlen(jobid->value)+21));
177    sprintf(sname,"%s/.wait_socket_%s.sock",tmpPath->value,jobid->value);
178    if ( (fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
179      perror("socket error");
180      setMapInMaps(conf,"lenv","message",_("Socket error"));
181      return SERVICE_FAILED;
182    }
183    memset(&addr, 0, sizeof(addr));
184    addr.sun_family = AF_UNIX;
185    strncpy(addr.sun_path, sname, sizeof(addr.sun_path)-1);
186    if (connect(fd, (struct sockaddr*)&addr, sizeof(addr)) == -1) {
187      perror("connect error");
188      setMapInMaps(conf,"lenv","message",_("Unable to connect"));
189      return SERVICE_FAILED;
190    }
191    if (write(fd, "3", 1) != rc) {
192      if (rc < 0) {
193        perror("write error");
194        setMapInMaps(conf,"lenv","message",_("Unable to announce the successful execution of the HPC service"));
195        close(fd);
196        return SERVICE_FAILED;
197      }
198    }
199    close(fd);
200    setOutputValue(outputs,"Result",(char*)"\"FinalizeHPC run successfully\"",32);
201    unlink(flenv);
202    free(flenv);
203
204    return SERVICE_SUCCEEDED;
205  }
206
207
208  /**
209   * FinalizeHPC1 ZOO Service :
210   * This service is used to inform a ZOO-Kernel waiting for the end of the
211   * execution of a HPC service
212   *
213   * format="AllocCPUS"; for i in $(sacct -e) ; do format="$format,$i"; done; format="$(echo $format | sed "s:AllocCPUS,::")" ; echo $format; sacct --format=$format -p | grep "997f-11e8-9f78-0050569320d2"
214   *
215   * AllocCPUS,AllocGRES,AllocNodes,AllocTRES,Account,AssocID,AveCPU,AveCPUFreq,AveDiskRead,AveDiskWrite,AvePages,AveRSS,AveVMSize,BlockID,Cluster,Comment,ConsumedEnergy,ConsumedEnergyRaw,CPUTime,CPUTimeRAW,DerivedExitCode,Elapsed,Eligible,End,ExitCode,GID,Group,JobID,JobIDRaw,JobName,Layout,MaxDiskRead,MaxDiskReadNode,MaxDiskReadTask,MaxDiskWrite,MaxDiskWriteNode,MaxDiskWriteTask,MaxPages,MaxPagesNode,MaxPagesTask,MaxRSS,MaxRSSNode,MaxRSSTask,MaxVMSize,MaxVMSizeNode,MaxVMSizeTask,MinCPU,MinCPUNode,MinCPUTask,NCPUS,NNodes,NodeList,NTasks,Priority,Partition,QOS,QOSRAW,ReqCPUFreq,ReqCPUFreqMin,ReqCPUFreqMax,ReqCPUFreqGov,ReqCPUS,ReqGRES,ReqMem,ReqNodes,ReqTRES,Reservation,ReservationId,Reserved,ResvCPU,ResvCPURAW,Start,State,Submit,Suspended,SystemCPU,Timelimit,TotalCPU,UID,User,UserCPU,WCKey,WCKeyID
216   * 28||1|cpu=28,node=1|geosud|258|||||||||cluster||||00:00:56|56|0:0|00:00:02|2018-08-06T15:48:13|2018-08-06T15:48:16|0:0|1019|geosud|883299|883299|ZOO-Project_5bd1c32b-997f-11e8-9f78-0050569320d2_GSDBandMath_6_2_005||||||||||||||||||||28|1|muse044||4294360886|defq|qos_geosud|20|Unknown|Unknown|Unknown|Unknown|1||0n|1|cpu=1,node=1|||00:00:01|00:00:01|1|2018-08-06T15:48:14|COMPLETED|2018-08-06T15:48:13|00:00:00||UNLIMITED|00:00:00|1229|geosudwps|||0|
217   *
218   */
219  ZOO_DLL_EXPORT int FinalizeHPC1(maps*& conf,maps*& inputs,maps*& outputs){
220    // Retrieve the jobid corresponding to the identifier generated by SLURM
221    // by reading the file generated when running the SBATCH file
222    map* jobid=getMapFromMaps(inputs,"jobid","value");
223    struct sockaddr_un addr;
224    char buf[100]="3";
225    int fd,rc=NULL;
226    int i=0;
227    map* usid=getMapFromMaps(conf,"lenv","usid");
228    map* tmpPath=getMapFromMaps(conf,"main","tmpPath");
229
230    char *flenv =
231      (char *) malloc ((strlen (tmpPath->value) + 
232                        strlen (jobid->value) + 12) * sizeof (char));
233    sprintf (flenv, "%s/%s_lenv.cfg", tmpPath->value, jobid->value);
234    maps* m = (maps *) malloc (MAPS_SIZE);
235    m->child=NULL;
236    m->next=NULL;
237    map* configId=NULL;
238
239   
240    if(conf_read(flenv, m) != 2){
241      configId=getMapFromMaps(m,"lenv","configId");
242      setMapInMaps(conf,"lenv","configId",configId->value);
243    }else{
244      setMapInMaps(conf,"lenv","message",_("Unable to read the lenv section file of the requested jobid"));
245      return SERVICE_FAILED;
246    }
247
248    SSHCON *test=ssh_connect(conf);
249    /*if(test==NULL){
250      setMapInMaps(conf,"lenv","message",_("Unable to connect using ssh."));
251      return SERVICE_FAILED;
252      }*/
253
254    char *logPath=(char*)malloc((strlen(tmpPath->value)+strlen(jobid->value)+12)*sizeof(char));
255    sprintf(logPath,"%s/exec_out_%s",tmpPath->value,jobid->value);
256    struct stat f_status;
257    int ts=stat(logPath, &f_status);
258    char* fcontent = NULL;
259    if(ts==0) {
260      fcontent=(char*)malloc(sizeof(char)*(f_status.st_size+1));
261      FILE* f=fopen(logPath,"rb");
262      fread(fcontent,f_status.st_size,1,f);
263      int fsize=f_status.st_size;
264      fcontent[fsize]=0;
265      fclose(f);
266    }else{
267      setMapInMaps(conf,"lenv","message",_("No service with this jobid can be found"));
268      return SERVICE_FAILED;
269    }
270    free(logPath);
271    // Run sacct to check if the service execution ended.
272    // Store all the informations returned by scontrol command as a cfg file to
273    // be parsed back by the ZOO-Kernel waiting for the execution of the remote
274    // service
275    maps* tmpMaps=createMaps("henv");
276   
277    map* tmpMap=getMapFromMaps(conf,configId->value,"remote_command_opt");
278    char* command=(char*)malloc((126+strlen(tmpMap->value))*sizeof(char));
279    sprintf(command,"sacct --format=%s -p | grep \"%s\" | sed \"s:||:|None|:g;s:||:|None|:g\"",tmpMap->value,jobid->value);   
280    if(ssh_exec(conf,command,ssh_get_cnt(conf))==0){
281      free(command);
282      setMapInMaps(conf,"lenv","message",_("Failed to run sacct remotely"));
283      // TODO: check status in db and if available continue in other case return SERVICE_FAILED
284      return SERVICE_FAILED;
285    }else{
286      free(command);
287      logPath=(char*)malloc((strlen(tmpPath->value)+strlen(usid->value)+11)*sizeof(char));
288      sprintf(logPath,"%s/exec_out_%s",tmpPath->value,usid->value);
289      int ts=stat(logPath, &f_status);
290      if(ts==0) {
291        fcontent=(char*)malloc(sizeof(char)*(f_status.st_size+1));
292        FILE* f=fopen(logPath,"rb");
293        fread(fcontent,f_status.st_size,1,f);
294        int fsize=f_status.st_size;
295        fcontent[fsize]=0;
296        fclose(f);
297        free(logPath);
298        char *token, *saveptr;
299        char *token1, *saveptr1;
300        token = strtok_r (tmpMap->value, ",", &saveptr);
301        token1 = strtok_r (fcontent, "|", &saveptr1);
302        while (token != NULL) {
303          fprintf(stderr,"%s %d %s \n",__FILE__,__LINE__,token);
304          fflush(stderr);
305          fprintf(stderr,"%s %d %s %s \n",__FILE__,__LINE__,token,token1);
306          fflush(stderr);
307          if(token1 != NULL){
308            if(tmpMaps->content==NULL)
309              tmpMaps->content=createMap(token,token1);
310            else
311              addToMap(tmpMaps->content,token,token1);
312          }
313          token = strtok_r (NULL, ",", &saveptr);
314          token1 = strtok_r (NULL, "|", &saveptr1);
315        }
316      }else{
317        free(logPath);
318        setMapInMaps(conf,"lenv","message",_("Unable to access the downloaded execution log file"));
319        return SERVICE_FAILED;
320      }
321    }
322    tmpMap=getMapFromMaps(tmpMaps,"henv","JobId");
323    if(tmpMap!=NULL){
324      char* tmpStr=(char*)malloc((32)*sizeof(char));
325      sprintf(tmpStr,"slurm-%s.out",tmpMap->value);
326      addToMap(tmpMaps->content,"StdErr",tmpStr); 
327      free(tmpStr);
328    }
329    logPath=(char*)malloc((strlen(tmpPath->value)+strlen(jobid->value)+15)*sizeof(char));
330    sprintf(logPath,"%s/exec_status_%s",tmpPath->value,jobid->value);
331    dumpMapsToFile(tmpMaps,logPath,0);
332    char *sname=(char*)malloc((strlen(tmpPath->value)+strlen(jobid->value)+21));
333    sprintf(sname,"%s/.wait_socket_%s.sock",tmpPath->value,jobid->value);
334    if ( (fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
335      perror("socket error");
336      setMapInMaps(conf,"lenv","message",_("Socket error"));
337      return SERVICE_FAILED;
338    }
339    memset(&addr, 0, sizeof(addr));
340    addr.sun_family = AF_UNIX;
341    strncpy(addr.sun_path, sname, sizeof(addr.sun_path)-1);
342    if (connect(fd, (struct sockaddr*)&addr, sizeof(addr)) == -1) {
343      perror("connect error");
344      setMapInMaps(conf,"lenv","message",_("Unable to connect"));
345      return SERVICE_FAILED;
346    }
347    if (write(fd, "3", 1) != rc) {
348      if (rc < 0) {
349        perror("write error");
350        setMapInMaps(conf,"lenv","message",_("Unable to announce the successful execution of the HPC service"));
351        close(fd);
352        return SERVICE_FAILED;
353      }
354    }
355    close(fd);
356    unlink(flenv);
357    free(flenv);
358    setOutputValue(outputs,"Result",(char*)"\"FinalizeHPC run successfully\"",32);
359
360    return SERVICE_SUCCEEDED;
361  }
362 
363}
Note: See TracBrowser for help on using the repository browser.

Search

Context Navigation

ZOO Sponsors

http://www.zoo-project.org/trac/chrome/site/img/geolabs-logo.png http://www.zoo-project.org/trac/chrome/site/img/neogeo-logo.png http://www.zoo-project.org/trac/chrome/site/img/apptech-logo.png http://www.zoo-project.org/trac/chrome/site/img/3liz-logo.png http://www.zoo-project.org/trac/chrome/site/img/gateway-logo.png

Become a sponsor !

Knowledge partners

http://www.zoo-project.org/trac/chrome/site/img/ocu-logo.png http://www.zoo-project.org/trac/chrome/site/img/gucas-logo.png http://www.zoo-project.org/trac/chrome/site/img/polimi-logo.png http://www.zoo-project.org/trac/chrome/site/img/fem-logo.png http://www.zoo-project.org/trac/chrome/site/img/supsi-logo.png http://www.zoo-project.org/trac/chrome/site/img/cumtb-logo.png

Become a knowledge partner

Related links

http://zoo-project.org/img/ogclogo.png http://zoo-project.org/img/osgeologo.png