source: XIOS/dev/dev_ym/XIOS_COUPLING/src/context_client.cpp @ 2258

Last change on this file since 2258 was 2258, checked in by ymipsl, 3 years ago

One-sided protocol improvement.
YM

  • Property copyright set to
    Software name : XIOS (Xml I/O Server)
    http://forge.ipsl.jussieu.fr/ioserver
    Creation date : January 2009
    Licence : CeCILL version 2
    see license file in root directory : Licence_CeCILL_V2-en.txt
    or http://www.cecill.info/licences/Licence_CeCILL_V2-en.html
    Holder : CEA/LSCE (Laboratoire des Sciences du CLimat et de l'Environnement)
    CNRS/IPSL (Institut Pierre Simon Laplace)
    Project Manager : Yann Meurdesoif
    yann.meurdesoif@cea.fr
  • Property svn:eol-style set to native
File size: 18.4 KB
Line 
1#include "xios_spl.hpp"
2#include "context_client.hpp"
3#include "context_server.hpp"
4#include "event_client.hpp"
5#include "buffer_out.hpp"
6#include "buffer_client.hpp"
7#include "type.hpp"
8#include "event_client.hpp"
9#include "context.hpp"
10#include "mpi.hpp"
11#include "timer.hpp"
12#include "cxios.hpp"
13#include "server.hpp"
14#include "services.hpp"
15#include <boost/functional/hash.hpp>
16#include <random>
17#include <chrono>
18
19namespace xios
20{
21    /*!
22    \param [in] parent Pointer to context on client side
23    \param [in] intraComm_ communicator of group client
24    \param [in] interComm_ communicator of group server
25    \cxtSer [in] cxtSer Pointer to context of server side. (It is only used in case of attached mode).
26    */
27    CContextClient::CContextClient(CContext* parent, MPI_Comm intraComm_, MPI_Comm interComm_, CContext* cxtSer)
28     : mapBufferSize_(), parentServer(cxtSer), maxBufferedEvents(4), associatedServer_(nullptr)
29    {
30     
31      context_ = parent;
32      intraComm = intraComm_;
33      interComm = interComm_;
34      MPI_Comm_rank(intraComm, &clientRank);
35      MPI_Comm_size(intraComm, &clientSize);
36
37      int flag;
38      MPI_Comm_test_inter(interComm, &flag);
39      if (flag) isAttached_=false ;
40      else  isAttached_=true ;
41
42      pureOneSided=CXios::getin<bool>("pure_one_sided",false); // pure one sided communication (for test)
43      if (isAttachedModeEnabled()) pureOneSided=false ; // no one sided in attach mode
44     
45
46
47      if (flag) MPI_Comm_remote_size(interComm, &serverSize);
48      else  MPI_Comm_size(interComm, &serverSize);
49
50      computeLeader(clientRank, clientSize, serverSize, ranksServerLeader, ranksServerNotLeader);
51
52      if (flag) MPI_Intercomm_merge(interComm_,false, &interCommMerged) ;
53     
54      if (!isAttachedModeEnabled())
55      { 
56
57        CTimer::get("create Windows").resume() ;
58
59        // We create dummy pair of intercommunicator between clients and server
60        // Why ? Just because on openMPI, it reduce the creation time of windows otherwhise which increase quadratically
61        // We don't know the reason
62        double time ;
63        MPI_Comm commSelf ;
64        MPI_Comm_split(intraComm_,clientRank,clientRank, &commSelf) ;
65        MPI_Comm interComm ;
66        winComm_.resize(serverSize) ;
67        windows_.resize(serverSize) ;
68        for(int rank=0; rank<serverSize; rank++) 
69        {
70          time=MPI_Wtime() ;
71          MPI_Intercomm_create(commSelf, 0, interCommMerged, clientSize+rank, 0, &interComm) ;
72          MPI_Intercomm_merge(interComm, false, &winComm_[rank]) ;
73          windows_[rank].resize(2) ;
74          MPI_Win_create_dynamic(MPI_INFO_NULL, winComm_[rank], &windows_[rank][0]);
75          MPI_Win_create_dynamic(MPI_INFO_NULL, winComm_[rank], &windows_[rank][1]); 
76          time=MPI_Wtime()-time ;
77          info(100)<< "MPI_Win_create_dynamic : client to server rank "<<rank<<" => "<<time/1e-6<<" us"<<endl ;
78        }
79        MPI_Comm_free(&commSelf) ;
80        CTimer::get("create Windows").resume() ;
81     }
82
83      auto time=chrono::system_clock::now().time_since_epoch().count() ;
84      std::default_random_engine rd(time); // not reproducible from a run to another
85      std::uniform_int_distribution<size_t> dist;
86      hashId_=dist(rd) ;
87      MPI_Bcast(&hashId_,1,MPI_SIZE_T,0,intraComm) ; // Bcast to all server of the context
88
89      timeLine = 1;
90    }
91
92    void CContextClient::computeLeader(int clientRank, int clientSize, int serverSize,
93                                       std::list<int>& rankRecvLeader,
94                                       std::list<int>& rankRecvNotLeader)
95    {
96      if ((0 == clientSize) || (0 == serverSize)) return;
97
98      if (clientSize < serverSize)
99      {
100        int serverByClient = serverSize / clientSize;
101        int remain = serverSize % clientSize;
102        int rankStart = serverByClient * clientRank;
103
104        if (clientRank < remain)
105        {
106          serverByClient++;
107          rankStart += clientRank;
108        }
109        else
110          rankStart += remain;
111
112        for (int i = 0; i < serverByClient; i++)
113          rankRecvLeader.push_back(rankStart + i);
114
115        rankRecvNotLeader.resize(0);
116      }
117      else
118      {
119        int clientByServer = clientSize / serverSize;
120        int remain = clientSize % serverSize;
121
122        if (clientRank < (clientByServer + 1) * remain)
123        {
124          if (clientRank % (clientByServer + 1) == 0)
125            rankRecvLeader.push_back(clientRank / (clientByServer + 1));
126          else
127            rankRecvNotLeader.push_back(clientRank / (clientByServer + 1));
128        }
129        else
130        {
131          int rank = clientRank - (clientByServer + 1) * remain;
132          if (rank % clientByServer == 0)
133            rankRecvLeader.push_back(remain + rank / clientByServer);
134          else
135            rankRecvNotLeader.push_back(remain + rank / clientByServer);
136        }
137      }
138    }
139
140    /*!
141    In case of attached mode, the current context must be reset to context for client
142    \param [in] event Event sent to server
143    */
144    void CContextClient::sendEvent(CEventClient& event)
145    {
146      list<int> ranks = event.getRanks();
147      info(100)<<"Event "<<timeLine<<" of context "<<context_->getId()<<endl ;
148      if (CXios::checkEventSync)
149      {
150        int typeId, classId, typeId_in, classId_in;
151        long long timeLine_out;
152        long long timeLine_in( timeLine );
153        typeId_in=event.getTypeId() ;
154        classId_in=event.getClassId() ;
155//        MPI_Allreduce(&timeLine,&timeLine_out, 1, MPI_UINT64_T, MPI_SUM, intraComm) ; // MPI_UINT64_T standardized by MPI 3
156        MPI_Allreduce(&timeLine_in,&timeLine_out, 1, MPI_LONG_LONG_INT, MPI_SUM, intraComm) ; 
157        MPI_Allreduce(&typeId_in,&typeId, 1, MPI_INT, MPI_SUM, intraComm) ;
158        MPI_Allreduce(&classId_in,&classId, 1, MPI_INT, MPI_SUM, intraComm) ;
159        if (typeId/clientSize!=event.getTypeId() || classId/clientSize!=event.getClassId() || timeLine_out/clientSize!=timeLine)
160        {
161           ERROR("void CContextClient::sendEvent(CEventClient& event)",
162               << "Event are not coherent between client for timeline = "<<timeLine);
163        }
164       
165        vector<int> servers(serverSize,0) ;
166        auto ranks=event.getRanks() ;
167        for(auto& rank : ranks) servers[rank]=1 ;
168        MPI_Allreduce(MPI_IN_PLACE, servers.data(), serverSize,MPI_INT,MPI_SUM,intraComm) ;
169        ostringstream osstr ;
170        for(int i=0;i<serverSize;i++)  if (servers[i]==0) osstr<<i<<" , " ;
171        if (!osstr.str().empty())
172        {
173          ERROR("void CContextClient::sendEvent(CEventClient& event)",
174                 <<" Some servers will not receive the message for timeline = "<<timeLine<<endl
175                 <<"Servers are : "<<osstr.str()) ;
176        }
177
178
179      }
180
181      if (!event.isEmpty())
182      {
183        list<int> sizes = event.getSizes();
184
185         // We force the getBuffers call to be non-blocking on classical servers
186        list<CBufferOut*> buffList;
187        getBuffers(timeLine, ranks, sizes, buffList) ;
188
189        event.send(timeLine, sizes, buffList);
190       
191        //for (auto itRank = ranks.begin(); itRank != ranks.end(); itRank++) buffers[*itRank]->infoBuffer() ;
192
193        unlockBuffers(ranks) ;
194        info(100)<<"Event "<<timeLine<<" of context "<<context_->getId()<<"  sent"<<endl ;
195         
196        checkBuffers(ranks);
197      }
198     
199      if (isAttachedModeEnabled()) // couldBuffer is always true in attached mode
200      {
201        while (checkBuffers(ranks)) context_->globalEventLoop() ;
202     
203        CXios::getDaemonsManager()->scheduleContext(hashId_) ;
204        while (CXios::getDaemonsManager()->isScheduledContext(hashId_)) context_->globalEventLoop() ;
205      }
206     
207      timeLine++;
208    }
209
210    /*!
211    If client is also server (attached mode), after sending event, it should process right away
212    the incoming event.
213    \param [in] ranks list rank of server connected this client
214    */
215    void CContextClient::waitEvent(list<int>& ranks)
216    {
217      while (checkBuffers(ranks))
218      {
219        context_->eventLoop() ;
220      }
221
222      MPI_Request req ;
223      MPI_Status status ;
224
225      MPI_Ibarrier(intraComm,&req) ;
226      int flag=false ;
227
228      do 
229      {
230        CXios::getDaemonsManager()->eventLoop() ;
231        MPI_Test(&req,&flag,&status) ;
232      } while (!flag) ;
233
234
235    }
236
237
238    void CContextClient::waitEvent_old(list<int>& ranks)
239    {
240      parentServer->server->setPendingEvent();
241      while (checkBuffers(ranks))
242      {
243        parentServer->server->listen();
244        parentServer->server->checkPendingRequest();
245      }
246
247      while (parentServer->server->hasPendingEvent())
248      {
249       parentServer->server->eventLoop();
250      }
251    }
252
253    /*!
254     * Get buffers for each connection to the servers. This function blocks until there is enough room in the buffers unless
255     * it is explicitly requested to be non-blocking.
256     *
257     *
258     * \param [in] timeLine time line of the event which will be sent to servers
259     * \param [in] serverList list of rank of connected server
260     * \param [in] sizeList size of message corresponding to each connection
261     * \param [out] retBuffers list of buffers that can be used to store an event
262     * \param [in] nonBlocking whether this function should be non-blocking
263     * \return whether the already allocated buffers could be used
264    */
265    bool CContextClient::getBuffers(const size_t timeLine, const list<int>& serverList, const list<int>& sizeList, list<CBufferOut*>& retBuffers,
266                                    bool nonBlocking /*= false*/)
267    {
268      list<int>::const_iterator itServer, itSize;
269      list<CClientBuffer*> bufferList;
270      map<int,CClientBuffer*>::const_iterator it;
271      list<CClientBuffer*>::iterator itBuffer;
272      bool areBuffersFree;
273
274      for (itServer = serverList.begin(); itServer != serverList.end(); itServer++)
275      {
276        it = buffers.find(*itServer);
277        if (it == buffers.end())
278        {
279          newBuffer(*itServer);
280          it = buffers.find(*itServer);
281        }
282        bufferList.push_back(it->second);
283      }
284
285      double lastTimeBuffersNotFree=0. ;
286      double time ;
287      bool doUnlockBuffers ;
288      CTimer::get("Blocking time").resume();
289      do
290      {
291        areBuffersFree = true;
292        doUnlockBuffers=false ;
293        time=MPI_Wtime() ;
294        if (time-lastTimeBuffersNotFree > latency_)
295        {
296          for (itBuffer = bufferList.begin(), itSize = sizeList.begin(); itBuffer != bufferList.end(); itBuffer++, itSize++)
297          {
298            areBuffersFree &= (*itBuffer)->isBufferFree(*itSize);
299          }
300          if (!areBuffersFree)
301          {
302            lastTimeBuffersNotFree = time ;
303            doUnlockBuffers=true ;
304          }         
305        }
306        else areBuffersFree = false ;
307
308        if (!areBuffersFree)
309        {
310          if (doUnlockBuffers) for (itBuffer = bufferList.begin(); itBuffer != bufferList.end(); itBuffer++) (*itBuffer)->unlockBuffer();
311          checkBuffers();
312
313          context_->globalEventLoop() ;
314        }
315
316      } while (!areBuffersFree && !nonBlocking);
317      CTimer::get("Blocking time").suspend();
318
319      if (areBuffersFree)
320      {
321        for (itBuffer = bufferList.begin(), itSize = sizeList.begin(); itBuffer != bufferList.end(); itBuffer++, itSize++)
322          retBuffers.push_back((*itBuffer)->getBuffer(timeLine, *itSize));
323      }
324      return areBuffersFree;
325   }
326
327   /*!
328   Make a new buffer for a certain connection to server with specific rank
329   \param [in] rank rank of connected server
330   */
331   void CContextClient::newBuffer(int rank)
332   {
333      if (!mapBufferSize_.count(rank))
334      {
335        error(0) << "WARNING: Unexpected request for buffer to communicate with server " << rank << std::endl;
336        mapBufferSize_[rank] = CXios::minBufferSize;
337        maxEventSizes[rank] = CXios::minBufferSize;
338      }
339     
340      vector<MPI_Win> Wins(2,MPI_WIN_NULL) ;
341      if (!isAttachedModeEnabled()) Wins=windows_[rank] ;
342 
343      CClientBuffer* buffer = buffers[rank] = new CClientBuffer(interComm, Wins, clientRank, rank, mapBufferSize_[rank], maxEventSizes[rank]);
344      if (isGrowableBuffer_) buffer->setGrowableBuffer(1.2) ;
345      else buffer->fixBuffer() ;
346      // Notify the server
347      CBufferOut* bufOut = buffer->getBuffer(0, 4*sizeof(MPI_Aint));
348      MPI_Aint sendBuff[4] ;
349      sendBuff[0]=hashId_;
350      sendBuff[1]=mapBufferSize_[rank];
351      sendBuff[2]=buffers[rank]->getWinAddress(0); 
352      sendBuff[3]=buffers[rank]->getWinAddress(1); 
353      info(100)<<"CContextClient::newBuffer : rank "<<rank<<" winAdress[0] "<<buffers[rank]->getWinAddress(0)<<" winAdress[1] "<<buffers[rank]->getWinAddress(1)<<endl;
354      bufOut->put(sendBuff, 4); 
355      buffer->checkBuffer(true);
356
357   }
358
359   /*!
360   Verify state of buffers. Buffer is under pending state if there is no message on it
361   \return state of buffers, pending(true), ready(false)
362   */
363   bool CContextClient::checkBuffers(void)
364   {
365      map<int,CClientBuffer*>::iterator itBuff;
366      bool pending = false;
367      for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
368        pending |= itBuff->second->checkBuffer(!pureOneSided);
369      return pending;
370   }
371
372   //! Release all buffers
373   void CContextClient::releaseBuffers()
374   {
375      map<int,CClientBuffer*>::iterator itBuff;
376      for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
377      {
378         delete itBuff->second;
379      }
380      buffers.clear();
381
382// don't know when release windows
383
384      if (!isAttachedModeEnabled())
385      { 
386        for(int rank=0; rank<serverSize; rank++)
387        {
388          MPI_Win_free(&windows_[rank][0]);
389          MPI_Win_free(&windows_[rank][1]);
390          MPI_Comm_free(&winComm_[rank]) ;
391        }
392      } 
393   }
394
395     
396  /*!
397   Lock the buffers for one sided communications
398   \param [in] ranks list rank of server to which client connects to
399   */
400   void CContextClient::lockBuffers(list<int>& ranks)
401   {
402      list<int>::iterator it;
403      for (it = ranks.begin(); it != ranks.end(); it++) buffers[*it]->lockBuffer();
404   }
405
406  /*!
407   Unlock the buffers for one sided communications
408   \param [in] ranks list rank of server to which client connects to
409   */
410   void CContextClient::unlockBuffers(list<int>& ranks)
411   {
412      list<int>::iterator it;
413      for (it = ranks.begin(); it != ranks.end(); it++) buffers[*it]->unlockBuffer();
414   }
415     
416   /*!
417   Verify state of buffers corresponding to a connection
418   \param [in] ranks list rank of server to which client connects to
419   \return state of buffers, pending(true), ready(false)
420   */
421   bool CContextClient::checkBuffers(list<int>& ranks)
422   {
423      list<int>::iterator it;
424      bool pending = false;
425      for (it = ranks.begin(); it != ranks.end(); it++) pending |= buffers[*it]->checkBuffer(!pureOneSided);
426      return pending;
427   }
428
429   /*!
430    * Set the buffer size for each connection. Warning: This function is collective.
431    *
432    * \param [in] mapSize maps the rank of the connected servers to the size of the correspoinding buffer
433    * \param [in] maxEventSize maps the rank of the connected servers to the size of the biggest event
434   */
435   void CContextClient::setBufferSize(const std::map<int,StdSize>& mapSize)
436   {
437     for(auto& it : mapSize) 
438      buffers[it.first]->fixBufferSize(std::max(CXios::minBufferSize*1.0,std::min(it.second*CXios::bufferSizeFactor*1.01,CXios::maxBufferSize*1.0)));
439   }
440
441  /*!
442  Get leading server in the group of connected server
443  \return ranks of leading servers
444  */
445  const std::list<int>& CContextClient::getRanksServerNotLeader(void) const
446  {
447    return ranksServerNotLeader;
448  }
449
450  /*!
451  Check if client connects to leading server
452  \return connected(true), not connected (false)
453  */
454  bool CContextClient::isServerNotLeader(void) const
455  {
456    return !ranksServerNotLeader.empty();
457  }
458
459  /*!
460  Get leading server in the group of connected server
461  \return ranks of leading servers
462  */
463  const std::list<int>& CContextClient::getRanksServerLeader(void) const
464  {
465    return ranksServerLeader;
466  }
467
468  /*!
469  Check if client connects to leading server
470  \return connected(true), not connected (false)
471  */
472  bool CContextClient::isServerLeader(void) const
473  {
474    return !ranksServerLeader.empty();
475  }
476
477   /*!
478   * Finalize context client and do some reports. Function is non-blocking.
479   */
480  void CContextClient::finalize(void)
481  {
482    map<int,CClientBuffer*>::iterator itBuff;
483    std::list<int>::iterator ItServerLeader; 
484   
485    bool stop = false;
486
487    int* nbServerConnectionLocal  = new int[serverSize] ;
488    int* nbServerConnectionGlobal  = new int[serverSize] ;
489    for(int i=0;i<serverSize;++i) nbServerConnectionLocal[i]=0 ;
490    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)  nbServerConnectionLocal[itBuff->first]=1 ;
491    for (ItServerLeader = ranksServerLeader.begin(); ItServerLeader != ranksServerLeader.end(); ItServerLeader++)  nbServerConnectionLocal[*ItServerLeader]=1 ;
492   
493    MPI_Allreduce(nbServerConnectionLocal, nbServerConnectionGlobal, serverSize, MPI_INT, MPI_SUM, intraComm);
494   
495    CEventClient event(CContext::GetType(), CContext::EVENT_ID_CONTEXT_FINALIZE);
496    CMessage msg;
497
498    for (int i=0;i<serverSize;++i) if (nbServerConnectionLocal[i]==1) event.push(i, nbServerConnectionGlobal[i], msg) ;
499    sendEvent(event);
500
501    delete[] nbServerConnectionLocal ;
502    delete[] nbServerConnectionGlobal ;
503
504
505    CTimer::get("Blocking time").resume();
506    checkBuffers();
507    CTimer::get("Blocking time").suspend();
508
509    std::map<int,StdSize>::const_iterator itbMap = mapBufferSize_.begin(),
510                                          iteMap = mapBufferSize_.end(), itMap;
511
512    StdSize totalBuf = 0;
513    for (itMap = itbMap; itMap != iteMap; ++itMap)
514    {
515      report(10) << " Memory report : Context <" << context_->getId() << "> : client side : memory used for buffer of each connection to server" << endl
516                 << "  +) To server with rank " << itMap->first << " : " << itMap->second << " bytes " << endl;
517      totalBuf += itMap->second;
518    }
519    report(0) << " Memory report : Context <" << context_->getId() << "> : client side : total memory used for buffer " << totalBuf << " bytes" << endl;
520
521  }
522
523
524  /*!
525  */
526  bool CContextClient::havePendingRequests(void)
527  {
528    bool pending = false;
529    map<int,CClientBuffer*>::iterator itBuff;
530    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
531      pending |= itBuff->second->hasPendingRequest();
532    return pending;
533  }
534 
535  bool CContextClient::isNotifiedFinalized(void)
536  {
537    if (isAttachedModeEnabled()) return true ;
538
539    bool finalized = true;
540    map<int,CClientBuffer*>::iterator itBuff;
541    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
542      finalized &= itBuff->second->isNotifiedFinalized();
543    return finalized;
544  }
545
546}
Note: See TracBrowser for help on using the repository browser.