summaryrefslogtreecommitdiff
path: root/storage/ndb/src/common/transporter/SCI_Transporter.hpp
blob: 3140b4d822bc7c3e16027b57a627fda3fd643162 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
/* Copyright (C) 2003 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA */

#ifndef SCI_Transporter_H 
#define SCI_Transporter_H 
#include "Transporter.hpp" 
#include "SHM_Buffer.hpp" 
 
 
#include <sisci_api.h> 
#include <sisci_error.h> 
#include <sisci_types.h> 
 
#include <ndb_types.h> 
 
/** 
 *  The SCI Transporter 
 * 
 *  The design goal of the SCI transporter is to deliver high performance  
 *  data transfers (low latency, high bandwidth) combined with very high  
 *  availability (failover support).  
 *  High performance is an inherent feature of SCI, whereas failover
 *  support is implemented at the application level.
 *  In SCI the programming model is similar to the shared memory paradigm.  
 *  A process on one node (A) allocates a memory segment and imports the
 *  segment into its virtual address space. Another node (B) can connect to
 *  the segment and map this segment into its virtual address space.
 *  If A writes data to the segment, then B can read it and vice versa, through 
 *  ordinary loads and stores. This is also called PIO (programmed I/O), and
 *  is one thing that distinguishes SCI from other interconnects such as
 *  Ethernet, Gig-E, Myrinet, and InfiniBand. By using PIO, lower network
 *  latency is achieved, compared to the interconnects mentioned above.
 *  In order for NDB to utilize SCI, the SCI transporter relies on the
 *  SISCI API. The SISCI API provides a high-level abstraction on top of the
 *  low-level SCI driver, called the PCISCI driver.
 *  The SISCI API provides functions to set up, export, and import
 *  memory segments in a process's virtual address space, and also functions to
 *  guarantee the correctness of data transfers between nodes.
 *  
 *  In NDB Cluster, each SCI transporter creates a local segment
 *  that is mapped into the virtual address space. After the creation of the
 *  local segment, the SCI transporter connects to a segment created by another
 *  transporter at a remote node, and then maps the remote segment into its
 *  virtual address space. However, since NDB Cluster relies on redundancy
 *  at the network level, by using dual SCI adapters, communication can be
 *  maintained even if one of the adapter cards fails (or anything else fails
 *  in the network this adapter card belongs to, e.g. an SCI switch).
 * 
 */ 

/**
 * class SCI_Transporter
 * @brief Main class for the SCI transporter.
 *
 * Derives from Transporter. TransporterRegistry is declared a friend, which
 * gives it access to the private constructor and the private members below.
 */
class SCI_Transporter : public Transporter { 
  friend class TransporterRegistry; 
public:   
 
  /** 
   * Init the transporter. Allocate send buffers and open an SCI virtual device
   * for each adapter. 
   * @return true if successful, otherwise false
   */ 
  bool initTransporter();                 
   
   
  /** 
   * Creates a sequence for error checking. 
   * @param adapterid the adapter on which to create a new sequence. 
   * @return SCI_ERR_OK if ok, otherwise something else.
   */ 
  sci_error_t createSequence(Uint32 adapterid);      
   
   
  /** Initiate Local Segment: create a memory segment, 
   * prepare a memory segment, map the local segment  
   * into memory space and make the segment available.
   * @return SCI_ERR_OK if ok, otherwise something else.
   */ 
  sci_error_t initLocalSegment();        
 
  /** 
   * Calculate the segment id for the remote segment 
   * (defined inline after the class).
   * @param localNodeId - local id (e.g. 1 = mgm , 2 = ndb.2 etc.) 
   * @param remoteNodeId - remote id (e.g. 1 = mgm , 2 = ndb.2 etc.) 
   * @return a segment id 
   */ 
  Uint32  remoteSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);     
 
  // Get local segment id (defined inline after the class)
  Uint32  hostSegmentId(Uint16 localNodeId, Uint16 remoteNodeId); 
   
  /** 
   * closeSCI closes the SCI virtual device 
   */ 
  void closeSCI();                       
 
 
  /** 
   * Check the status of the remote node, 
   * if it is connected or has disconnected 
   * @return true if connected, otherwise false.
   */ 
  bool checkConnected(); 
 
  /** 
   * Check if the segments are properly connected to each other (remotely
   * and locally).  
   * @return True if both the local segment and the remote segment are
   * mapped. Otherwise false.
   */ 
  bool getConnectionStatus(); 

  /**
   * NOTE(review): presumably reports the free space left for sending;
   * the unit and exact meaning are not visible in this header -- confirm
   * against the implementation.
   */
  virtual Uint32 get_free_buffer() const;   
private: 
  /**
   * Private constructor: instances can only be created by friends of this
   * class (TransporterRegistry).
   * reportFreq defaults to 4096 when not supplied.
   * NOTE(review): the meaning of the remaining arguments is inferred from
   * their names only (hosts/port, packet and buffer sizes, number of
   * adapters, remote SCI node ids for adapter 0 and 1, NDB node ids,
   * checksum/signal-id flags) -- confirm against the implementation.
   */
  SCI_Transporter(TransporterRegistry &t_reg,
                  const char *local_host,
                  const char *remote_host,
                  int port,
		  bool isMgmConnection,
                  Uint32 packetSize,  
		  Uint32 bufferSize, 
		  Uint32 nAdapters, 
		  Uint16 remoteSciNodeId0,  
		  Uint16 remoteSciNodeId1,  
		  NodeId localNodeID,  
		  NodeId remoteNodeID,
		  NodeId serverNodeId,
		  bool checksum,  
		  bool signalId, 
		  Uint32 reportFreq = 4096); 
 
  /**
   * Destructor. Disconnects the transporter. 
   */ 
	~SCI_Transporter();    
  // State flags and failure counter.
  // NOTE(review): exact semantics are not visible in this header; the names
  // suggest "segments mapped", "local segment initialised", "SCI library
  // initialised" and "number of failed attempts" -- confirm.
  bool m_mapped; 
  bool m_initLocal; 
  bool m_sciinit; 
  Uint32 m_failCounter; 
  /** 
   * For statistics on transferred packets.
   * NOTE(review): the counter names suggest packet-size buckets (around the
   * 1024, 2048 and 4096 boundaries) -- confirm against the implementation.
   * The counters were originally guarded by DEBUG_TRANSPORTER; the guard is
   * currently forced on with "#if 1".
   */   
//#ifdef DEBUG_TRANSPORTER 
#if 1
  Uint32 i1024; 
  Uint32 i2048; 
  Uint32 i2049; 
  Uint32 i10242048; 
  Uint32 i20484096; 
  Uint32 i4096; 
  Uint32 i4097; 
#endif
 
  // Pointers to status words; declared volatile so every access goes to
  // memory. NOTE(review): presumably these live in the local segment and in
  // the remote segment(s) reached via each adapter -- confirm.
  volatile Uint32 * m_localStatusFlag; 
  volatile Uint32 * m_remoteStatusFlag; 
  volatile Uint32 * m_remoteStatusFlag2; 

  // Send buffer bookkeeping (see hasDataToSend()).
  struct {
    Uint32 * m_buffer;       // The buffer
    Uint32 m_dataSize;       // No of words in buffer
    Uint32 m_sendBufferSize; // Buffer size
    Uint32 m_forceSendLimit; // Send when buffer is this full
  } m_sendBuffer;

  // Shared-memory reader used by the receive helpers below, and writers.
  // NOTE(review): writer2 is presumably the writer for the second (standby)
  // adapter, see failoverShmWriter() -- confirm.
  SHM_Reader * reader; 
  SHM_Writer * writer; 
  SHM_Writer * writer2; 
 
  /** 
   * Statistics 
   */ 
  Uint32 m_reportFreq; 
 
  Uint32 m_adapters;   
  Uint32 m_numberOfRemoteNodes; 
 
  Uint16 m_remoteNodes[2]; 
 
  // Per-adapter state: SCI descriptor, local SCI node id and link status.
  typedef struct SciAdapter { 
    sci_desc_t scidesc; 
    Uint32 localSciNodeId; 
    bool linkStatus; 
  } SciAdapter; 
 
  SciAdapter* sciAdapters; 
  Uint32 m_ActiveAdapterId; 
  Uint32 m_StandbyAdapterId; 
 
  // Description of the local (source) segment.
  typedef struct sourceSegm { 
    sci_local_segment_t localHandle; // Handle to local segment to be mapped
    struct localHandleMap { 
      sci_map_t map;                   // Handle to the new mapped segment.  
                                       // 2 = max adapters in one node 
    } lhm[2];  
     
    volatile void *mappedMemory; // Used when reading 
  } sourceSegm; 
 
  // Description of the remote (target) segment.
  typedef struct targetSegm { 
    struct remoteHandleMap { 
      sci_remote_segment_t remoteHandle; //Handle to remote segment to be mapped
      sci_map_t          map;            //Handle to the new mapped segment 
    } rhm[2]; 
 
    sci_sequence_status_t m_SequenceStatus;    // Used for error checking 
    sci_sequence_t sequence;  
    volatile void * mappedMemory;              // Used when writing 
    SHM_Writer * writer; 
  } targetSegm; 
   
  sci_sequence_status_t m_SequenceStatus;    // Used for error checking 
 
 
  // Shared between all SCI users  active=(either prim or second) 
  sci_desc_t     activeSCIDescriptor;    
  
  sourceSegm*     m_SourceSegm;               // Local segment reference 
  targetSegm*     m_TargetSegm;               // Remote segment reference 
  
  Uint32 m_LocalAdapterId;    // Adapter Id  
  Uint16 m_LocalSciNodeId;    // The SCI-node Id of this machine (adapter 0) 
  Uint16 m_LocalSciNodeId1;   // The SCI-node Id of this machine (adapter 1) 
  Uint16 m_RemoteSciNodeId;   // The SCI-node Id of remote machine (adapter 0) 
  Uint16 m_RemoteSciNodeId1;  // The SCI-node Id of remote machine (adapter 1) 
 
  Uint32 m_PacketSize;        // The size of each data packet 
  Uint32 m_BufferSize;        // Mapped SCI buffer size  
 
  // Write access to outgoing data.
  // NOTE(review): presumably getWritePtr() hands out room for lenBytes bytes
  // and updateWritePtr() records that lenBytes bytes were written; the role
  // of prio is not visible in this header -- confirm.
  Uint32 * getWritePtr(Uint32 lenBytes, Uint32 prio);
  void updateWritePtr(Uint32 lenBytes, Uint32 prio);

  /** 
   * doSend. Copies the data from the source (the send buffer) to the  
   * shared mem. segment. 
   * Sequences are used for error checking. 
   * If an error occurs, the transfer is retried. 
   * If the link that we need to swap to is broken, we will disconnect.
   * @return Returns true if the data transfer is ok. If not retriable
   * then false is returned. 
   */ 
  bool doSend();   
 
  /** 
   * @param adapterNo  the adapter for which to retrieve the node id. 
   * @return Returns the node id for an adapter. 
   */ 
  Uint32 getLocalNodeId(Uint32 adapterNo); 
             
  /**
   * @return true when the SHM reader reports that it is not empty.
   */
  bool hasDataToRead() const { 
    return reader->empty() == false;
  } 
 
  /**
   * @return true when the send buffer holds data (m_dataSize > 0).
   */
  bool hasDataToSend() const {
    return m_sendBuffer.m_dataSize > 0;
  }

  /**  
   * Make the local segment unavailable, no new connections will be accepted. 
   * @return Returns true if the segment was successfully disconnected. 
   */ 
  bool disconnectLocal();                   
 
  /**  
   * Disconnect from the remote segment.
   * NOTE(review): the original comment here was a verbatim copy of the one
   * on disconnectLocal(); the description above is inferred from the name --
   * confirm against the implementation.
   * @return Returns true if the segment was successfully disconnected. 
   */ 
  bool disconnectRemote();       
   
  void resetToInitialState(); 
             
  /** 
   *  It is always possible to send data with SCI! 
   *  @param timeout NOTE(review): presumably unused given the statement
   *  above -- confirm.
   *  @return True (always) 
   */ 
  bool sendIsPossible(struct timeval * timeout); 
   
  /**
   * Fetch the current read position from the SHM reader.
   * @param ptr  receives the read pointer
   * @param eod  receives the second pointer reported by the reader
   *             (presumably end-of-data -- confirm)
   */
  void getReceivePtr(Uint32 ** ptr, Uint32 ** eod){
    reader->getReadPtr(* ptr, * eod);
  }

  /**
   * Hand the new read position back to the SHM reader.
   * @param ptr  the updated read pointer
   */
  void updateReceivePtr(Uint32 *ptr){
    reader->updateReadPtr(ptr);
  }
 
  /** 
   *   Corresponds to SHM_Transporter::setupBuffers() 
   *   Initiates the start pointer of the buffer and read pointers. 
   *   Initiate the localSegment for the SHM reader. 
   */ 
  void setupLocalSegment();   
 
  /** 
   *  Initiate the remoteSegment for the SHM writer 
   */ 
  void setupRemoteSegment();   
 
  /** 
   * Set the connect flag in the remote memory segment (write through) 
   */ 
  void setConnected();   
   
  /** 
   * Set the disconnect flag in the remote memory segment (write through) 
   */ 
  void setDisconnect();   
   
  /** 
   * Check if there is a link between the adapter and the switch 
   * @param adapterNo  the adapter for which to retrieve the link status. 
   * @return Returns true if there is a link between adapter and switch. 
   * Otherwise false is returned and the cables must be checked.
   */ 
  bool getLinkStatus(Uint32 adapterNo); 
 
  /** 
   * failoverShmWriter takes the state of the active writer and inserts into 
   * the standby writer. 
   */ 
  void failoverShmWriter(); 
 
  // NOTE(review): presumably set up the local and the remote side of the
  // connection respectively; return-value semantics are not visible here.
  bool init_local();
  bool init_remote();

protected: 
   
  /** Perform a connection between segments.
   * NOTE(review): the original comment described a client node connecting
   * to a remote segment and a "timeout" parameter that neither function
   * takes; presumably the server/client variants implement the two sides
   * of the connection setup -- confirm against the implementation.
   * @param sockfd socket supplied by the caller; how it is used is not
   * visible in this header.
   * @return Returns true on success, otherwise false
   */ 
  bool connect_server_impl(NDB_SOCKET_TYPE sockfd);
  bool connect_client_impl(NDB_SOCKET_TYPE sockfd);
 
  /** 
   *  We will disconnect if: 
   *  -# the other node has disconnected from us 
   *  -# unrecoverable error in transmission, on both adapters 
   *  -# if we are shutdown properly 
   */ 
  void disconnectImpl(); 
 
  // NOTE(review): static, so presumably one-time initialisation of the SCI
  // library shared by all transporters (cf. m_sciinit) -- confirm.
  static bool initSCI(); 
}; 
 
 
/**
 * Build the identifier of the local (host) segment from a pair of 16-bit ids.
 *
 * The local id is placed in the upper 16 bits of the result and the remote id
 * in the lower 16 bits, so the pair maps to a unique 32-bit segment id.
 *
 * @param SciLocalNodeId   id of the local side (upper half of the result)
 * @param SciRemoteNodeId  id of the remote side (lower half of the result)
 * @return (SciLocalNodeId << 16) | SciRemoteNodeId, computed in Uint32
 */
inline
Uint32
SCI_Transporter::hostSegmentId(Uint16 SciLocalNodeId,
                               Uint16 SciRemoteNodeId) {
  // Widen before shifting: a Uint16 operand is promoted to (signed) int, and
  // shifting an id >= 0x8000 left by 16 would shift into the sign bit.
  const Uint32 upperHalf = static_cast<Uint32>(SciLocalNodeId) << 16;
  return upperHalf | SciRemoteNodeId;
}
 
/**
 * Build the identifier of the remote segment from a pair of 16-bit ids.
 *
 * The remote id is placed in the upper 16 bits of the result and the local id
 * in the lower 16 bits, i.e. the two halves are swapped compared with the
 * local (host) segment id built from the same pair.
 *
 * @param SciLocalNodeId   id of the local side (lower half of the result)
 * @param SciRemoteNodeId  id of the remote side (upper half of the result)
 * @return (SciRemoteNodeId << 16) | SciLocalNodeId, computed in Uint32
 */
inline
Uint32
SCI_Transporter::remoteSegmentId(Uint16 SciLocalNodeId,
                                 Uint16 SciRemoteNodeId) {
  // Widen before shifting: a Uint16 operand is promoted to (signed) int, and
  // shifting an id >= 0x8000 left by 16 would shift into the sign bit.
  const Uint32 upperHalf = static_cast<Uint32>(SciRemoteNodeId) << 16;
  return upperHalf | SciLocalNodeId;
}
 
 
#endif