source: XIOS/trunk/src/event_scheduler.hpp @ 492

Last change on this file since 492 was 492, checked in by ymipsl, 7 years ago

Add event scheduler functionality in order to schedule events coming from different contexts, which otherwise cause deadlocks or crashes when collective MPI communication is used in the netcdf/hdf5 library.

YM

File size: 7.6 KB
Line 
1#ifndef __EVENT_SCHEDULER_HPP__
2#define __EVENT_SCHEDULER_HPP__
3
4#include "xmlioserver_spl.hpp"
5#include "mpi.hpp"
6
7namespace xios
8{
9
10    //!  Event scheduling class. An instance of this class is used to order the event providing from different context to avoid dead lock.
11    /*!
12     *   Event are ordered in a same context using the timeLine id, so each server will process the same event. But between different
13     *   context, events are not scheduled and servers may choose to process different events and deadlock or MPI crash may occurs if
14     *   collective MPI communication are involved by the events.
15     *   This class solve the problem by scheduling the event and choose which event must be process by each server to insure correct
16     *   synchronisation. Information is send by asynchronous MPI communication to the root process that order the different events
17     *   (First In First Out) and brodcast the information to the other servers. To avoid to much incoming communication for the root
18     *   process, and hierachical tree is used for communicating from a limited number of child processes to the parent. 
19     */
20   
21    class CEventScheduler
22    {
23       public:
24       //!  Constructor
25       /*! A new communicator is created by duplicate comm. The communicating tree hierarchy is created.
26        *  @param[in] comm : MPI communicator du duplicate for internal use
27        */
28       CEventScheduler(const MPI_Comm& comm) ;
29
30
31       //! Destructor
32       ~CEventScheduler() ;
33
34
35
36       //! public interface for registring an event from the server
37       /*!
38        *  @param[in] timeLine : Time line id of the event
39        *  @param[in] contextHashId : Hashed id of the context
40        */
41       void registerEvent(const size_t timeLine, const size_t contextHashId) ;
42
43
44
45       //! public interface for query if the event defined by timeLine and hashId is sheduled next
46       /*!
47        *  @param[in] timeLine : Time line id of the event
48        *  @param[in] contextHasId : Hashed id of the context
49        *  @return  : boolean value, true is the event is scheduled next
50        *
51        *  If the event is scheduled next, it is remove from the `eventStack` queue list 
52        */   
53       bool queryEvent(const size_t timeLine, const size_t contextHashId) ;
54
55
56       //! Public interface to give the hand to the instance to check pending or incoming message.
57       /*!
58        * Must be called periodicaly. Call `checkParentRequest` and `checkChildRequest` private method.
59        */
60       void checkEvent(void) ;
61
62       private:
63
64
65       //! Send an event to the parent of level `lev+1`
66       /*!
67        *  @param[in] timeLine : Time line id of the event
68        *  @param[in] contextHasId : Hashed id of the context
69        *  @param[in] lev : actual level of the child in the hierarchy
70        *  The event is sent by an asynchrounous MPI_ISend
71        */
72       void registerEvent(const size_t timeLine, const size_t contextHashId, const size_t lev) ;
73
74
75
76       //! Children side. Check potential incoming message and if pending request are completed
77       /*!
78        *  - Check by `MPI_Test` if pending request sent to parents are complete.
79        *  - Probe incoming message from parent by using `MPI_Probe`. If yes, post an asynchronous reception by `MPI_IRecv`
80        *  - Check by `MPI_Test` if pending received requests are complete. if yes :
81        *    + Broadcast the event to the childrens if is also a parent
82        *    + Otherwise : push the incomming event in the `eventStack` queue.
83        */
84       void checkParentRequest(void) ;
85
86
87
88       //! Parent side. Check potential incoming message and if pending request are completed
89       /*!
90        *  - Probe incoming message from chidren by using `MPI_Probe`. If yes, post an asynchronous reception by `MPI_IRecv`.
91        *  - Check pending received event request from children using `MPI_Probe`. If and event is received, it is incerted in the
92        *    map `recvEvent` which is increased by 1. If the number of request received from children for this event is equal to the number
93        *    of children then :
94        *    + if the event level is 0, bcast the event to the children.
95        *    + else send the event to the parent.
96        *  - Check pending sent event request to children using `MPI_TEST` and if complete release the corresponding buffer
97        */
98       void checkChildRequest(void) ;
99
100
101
102       //! Parent side. Broadcast a received event from the parent to the children.
103       /*!
104        *  @param[in] timeLine : Time line id of the event
105        *  @param[in] contextHasId : Hashed id of the context
106        *  @param[in] lev : actual level of the child in the hierarchy
107        * Asynchronus MPI_ISend is used.
108        */
109       void bcastEvent(const size_t timeLine, const size_t contextHashId, const size_t lev) ;
110       
111
112
113
114       //! Structure defining an event, composed of the timeLine, the context hashId and the hierachical level of the communication.
115       struct SEvent
116       {
117         size_t timeLine ; /*!< Time line id of the event in the context */
118         size_t hashId ; /*!< hassh id of the context */
119         size_t level ;  /*!<hierarchical level of the communication*/
120
121         //! Definition of the == operator : needed to order the object in a map container
122         /*!
123            @param[in] e : object to compare with
124            @return : boolean result of the comparison
125         */
126         bool operator==(const SEvent& e) const
127         { 
128           if (timeLine == e.timeLine && hashId == e.hashId && level==e.level) return true ;
129           else return false ;
130         } ;
131       
132
133         //! Definition of the < operator : needed to order the object in a map container
134         /*!
135            @param[in] e : object to compare with
136            @return : boolean result of the comparison
137         */
138
139         bool operator<(const SEvent& e) const
140         { 
141           if (timeLine < e.timeLine) return true ;
142           else if (timeLine == e.timeLine && hashId < e.hashId) return true ;
143           else if (timeLine == e.timeLine && hashId == e.hashId && level<e.level) return true ;
144           else return false ;
145         } ;
146       } ;       
147       
148
149       //! Pending request struture. It keep send or receive buffer from asynchronous communication while the request is not complete.
150       struct SPendingRequest
151       {
152         size_t buffer[3] ;      /*!< communication buffer : timeLine, hashId, level */
153         MPI_Request request ;   /*!< pending MPI request */ 
154       } ;
155       
156       MPI_Comm communicator ;  /*!< Internal MPI communicator */ 
157       int mpiRank ;            /*!< Rank in the communicator */
158       int mpiSize ;            /*!< Size of the communicator */
159 
160       queue< pair<size_t, size_t> > eventStack ;         
161       queue<SPendingRequest* > pendingSentParentRequest ;   /*!< Pending request sent to parent   */
162       queue<SPendingRequest*>  pendingRecvParentRequest ;   /*!< Pending request recv from parent */   
163       list<SPendingRequest* >  pendingRecvChildRequest ;    /*!< Pending request recv from child  */
164       list<SPendingRequest*>   pendingSentChildRequest ;    /*!< Pending request sent to child    */
165       map< SEvent, int > recvEvent ;                        /*!< list of event received from children. Contains the currnet number children that have already post the same event */
166       
167       
168       int level ;                   /*!< Number of hierachical level for communication */
169       vector<int> parent ;          /*!< Parent rank for each level */ 
170       vector<vector<int> >  child ; /*!< List of child rank for each level */
171       vector<int> nbChild ;         /*!< Number of child for each level */   
172
173    } ;
174}
175
176#endif
Note: See TracBrowser for help on using the repository browser.