source: XIOS/trunk/src/event_scheduler.hpp @ 492

Last change on this file since 492 was 492, checked in by ymipsl, 10 years ago

Add event scheduler functionality in order to schedule events from different contexts, which otherwise cause deadlocks or crashes when using collective MPI communication in the netcdf/hdf5 library.

YM

File size: 7.6 KB
Line 
1#ifndef __EVENT_SCHEDULER_HPP__
2#define __EVENT_SCHEDULER_HPP__
3
4#include "xmlioserver_spl.hpp"
5#include "mpi.hpp"
6
7namespace xios
8{
9
10    //!  Event scheduling class. An instance of this class is used to order the events coming from different contexts in order to avoid deadlocks.
11    /*!
12     *   Events are ordered within a same context using the timeLine id, so each server will process the same event. But between different
13     *   contexts, events are not scheduled: servers may choose to process different events, and a deadlock or an MPI crash may occur if
14     *   collective MPI communications are involved in the events.
15     *   This class solves the problem by scheduling the events and choosing which event must be processed by each server to ensure correct
16     *   synchronisation. Information is sent by asynchronous MPI communication to the root process, which orders the different events
17     *   (First In First Out) and broadcasts the information to the other servers. To avoid too many incoming communications for the root
18     *   process, a hierarchical tree is used for communicating from a limited number of child processes to the parent.
19     */
20   
21    class CEventScheduler
22    {
23       public:
24       //!  Constructor
25       /*! A new communicator is created by duplicating comm, and the hierarchical communication tree is built.
26        *  @param[in] comm : MPI communicator to duplicate for internal use
27        */
28       CEventScheduler(const MPI_Comm& comm) ;
29
30
31       //! Destructor
32       ~CEventScheduler() ;
33
34
35
36       //! Public interface for registering an event from the server
37       /*!
38        *  @param[in] timeLine : Time line id of the event
39        *  @param[in] contextHashId : Hashed id of the context
40        */
41       void registerEvent(const size_t timeLine, const size_t contextHashId) ;
42
43
44
45       //! Public interface to query whether the event defined by timeLine and contextHashId is scheduled next
46       /*!
47        *  @param[in] timeLine : Time line id of the event
48        *  @param[in] contextHashId : Hashed id of the context
49        *  @return  : boolean value, true if the event is scheduled next
50        *
51        *  If the event is scheduled next, it is removed from the `eventStack` queue.
52        */   
53       bool queryEvent(const size_t timeLine, const size_t contextHashId) ;
54
55
56       //! Public interface handing control to the instance so that it can check pending or incoming messages.
57       /*!
58        * Must be called periodically. Calls the private methods `checkParentRequest` and `checkChildRequest`.
59        */
60       void checkEvent(void) ;
61
62       private:
63
64
65       //! Send an event to the parent of level `lev+1`
66       /*!
67        *  @param[in] timeLine : Time line id of the event
68        *  @param[in] contextHashId : Hashed id of the context
69        *  @param[in] lev : current level of the child in the hierarchy
70        *  The event is sent by an asynchronous MPI_Isend
71        */
72       void registerEvent(const size_t timeLine, const size_t contextHashId, const size_t lev) ;
73
74
75
76       //! Child side. Check for potential incoming messages and whether pending requests are completed
77       /*!
78        *  - Check by `MPI_Test` if pending requests sent to parents are complete.
79        *  - Probe incoming messages from the parent by using `MPI_Probe`. If there is one, post an asynchronous reception by `MPI_Irecv`
80        *  - Check by `MPI_Test` if pending received requests are complete. If yes :
81        *    + Broadcast the event to the children if this process is also a parent
82        *    + Otherwise : push the incoming event in the `eventStack` queue.
83        */
84       void checkParentRequest(void) ;
85
86
87
88       //! Parent side. Check for potential incoming messages and whether pending requests are completed
89       /*!
90        *  - Probe incoming messages from children by using `MPI_Probe`. If there is one, post an asynchronous reception by `MPI_Irecv`.
91        *  - Check pending received event requests from children using `MPI_Probe`. If an event is received, it is inserted in the
92        *    map `recvEvent`, whose counter for this event is increased by 1. If the number of requests received from children for this event is equal to the number
93        *    of children then :
94        *    + if the event level is 0, bcast the event to the children.
95        *    + else send the event to the parent.
96        *  - Check pending sent event requests to children using `MPI_Test` and, if complete, release the corresponding buffer
        *  NOTE(review): completion of an already posted receive is normally checked with `MPI_Test`, not `MPI_Probe` ; the
        *  second item above may be a wording slip - confirm against the implementation.
97        */
98       void checkChildRequest(void) ;
99
100
101
102       //! Parent side. Broadcast a received event from the parent to the children.
103       /*!
104        *  @param[in] timeLine : Time line id of the event
105        *  @param[in] contextHashId : Hashed id of the context
106        *  @param[in] lev : current level of the child in the hierarchy
107        * Asynchronous MPI_Isend is used.
108        */
109       void bcastEvent(const size_t timeLine, const size_t contextHashId, const size_t lev) ;
110       
111
112
113
114       //! Structure defining an event, composed of the timeLine, the context hashId and the hierarchical level of the communication.
115       struct SEvent
116       {
117         size_t timeLine ; /*!< Time line id of the event in the context */
118         size_t hashId ; /*!< Hash id of the context */
119         size_t level ;  /*!< Hierarchical level of the communication */
120
121         //! Definition of the == operator : two events are equal when timeLine, hashId and level are all equal
122         /*!
123            @param[in] e : object to compare with
124            @return : boolean result of the comparison
125         */
126         bool operator==(const SEvent& e) const
127         { 
128           if (timeLine == e.timeLine && hashId == e.hashId && level==e.level) return true ;
129           else return false ;
130         } ;
131       
132
133         //! Definition of the < operator : needed to order the objects used as keys of the `recvEvent` map
134         /*!
135            Lexicographic order on (timeLine, hashId, level).
136            @param[in] e : object to compare with
137            @return : boolean result of the comparison
137         */
138
139         bool operator<(const SEvent& e) const
140         { 
141           if (timeLine < e.timeLine) return true ;                                                // first key : timeLine
142           else if (timeLine == e.timeLine && hashId < e.hashId) return true ;                     // tie on timeLine : compare hashId
143           else if (timeLine == e.timeLine && hashId == e.hashId && level<e.level) return true ;   // tie on both : compare level
144           else return false ;
145         } ;
146       } ;       
147       
148
149       //! Pending request structure. It keeps the send or receive buffer of an asynchronous communication while the request is not complete.
150       struct SPendingRequest
151       {
152         size_t buffer[3] ;      /*!< communication buffer : timeLine, hashId, level */
153         MPI_Request request ;   /*!< pending MPI request */ 
154       } ;
155       
156       MPI_Comm communicator ;  /*!< Internal MPI communicator */ 
157       int mpiRank ;            /*!< Rank in the communicator */
158       int mpiSize ;            /*!< Size of the communicator */
159 
160       queue< pair<size_t, size_t> > eventStack ;            /*!< Queue of scheduled events ; presumably (timeLine, contextHashId) pairs - to be confirmed against the implementation */
161       queue<SPendingRequest* > pendingSentParentRequest ;   /*!< Pending request sent to parent   */
162       queue<SPendingRequest*>  pendingRecvParentRequest ;   /*!< Pending request recv from parent */   
163       list<SPendingRequest* >  pendingRecvChildRequest ;    /*!< Pending request recv from child  */
164       list<SPendingRequest*>   pendingSentChildRequest ;    /*!< Pending request sent to child    */
165       map< SEvent, int > recvEvent ;                        /*!< List of events received from children. Contains the current number of children that have already posted the same event */
166       
167       
168       int level ;                   /*!< Number of hierarchical levels for communication */
169       vector<int> parent ;          /*!< Parent rank for each level */ 
170       vector<vector<int> >  child ; /*!< List of child ranks for each level */
171       vector<int> nbChild ;         /*!< Number of children for each level */   
172
173    } ;
174}
175
176#endif
Note: See TracBrowser for help on using the repository browser.