1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
|
---------- bulk ----------
bulk rdma收发文件
mercury/05_bulk/client.c
./client protocol server_addr filename
main
HG_Set_log_level("debug")
state.hg_class = HG_Init(protocol, HG_FALSE)
state.hg_context = HG_Context_create(state.hg_class)
state.save_rpc_id = MERCURY_REGISTER(state.hg_class, "save", save_in_t, save_out_t, NULL)
access(save_op.filename, F_OK) 检查文件是否存在
HG_Addr_lookup1(state.hg_context, lookup_callback, &save_op, server_address, HG_OP_ID_IGNORE) -> 查地址, 然后回调
lookup_callback
save_operation* save_op = (save_operation*)(callback_info->arg); 回调通过指针传递参数
FILE* file = fopen(save_op->filename,"r");
fseek(file, 0L, SEEK_END)
save_op->size = ftell(file) 获取文件大小(fseek 到文件末尾后, ftell 返回的偏移量即文件字节数)
fseek(file, 0L, SEEK_SET);
save_op->buffer = calloc(1, save_op->size);
fread(save_op->buffer,1,save_op->size,file) // fread和fwrite的用法详解(以数据块的形式读写文件)
HG_Create(state->hg_context, addr, state->save_rpc_id, &handle)
HG_Bulk_create(state->hg_class, 1, (void**) &(save_op->buffer), &(save_op->size), HG_BULK_READ_ONLY, &(save_op->bulk_handle));
HG_Bulk_create (void**) &(save_op->buffer) 传入文件内容buf指针地址(二级指针) -> 客户端创建BULK
hg_bulk malloc/memset 0
hg_atomic_init32 ref_count 1 初始化引用计数
转分段 注册分段 hg_bulk_register_segments 最大静态段=8, HG_BULK_STATIC_MAX
hg_bulk_create_na_mem_descs 单独注册内存描述 individually 分为静态static和动态数组dynamic
hg_bulk_register 注册段 传递索引i
NA_Mem_handle_create
mem_handle_create -> na_ofi_mem_handle_create
NA_UNUSED 编译参数 忽略编译告警
calloc
na_ofi_mem_desc s[0] 单段, d 多段
flags & 0xff 全1
NA_Mem_register
mem_register na_ofi_mem_register
requested_key 自增
NA_MEM_READ_ONLY
access = FI_REMOTE_READ | FI_WRITE 远程读和本地写
fi_mr_regv 注册内存向量
rxm_mr_regv
rxm_mr_get_msg_access
fi_mr_regv
vrb_mr_regv
vrb_mr_cache_reg
ofi_mr_rbt_find 如果有缓存就找红黑树
util_mr_cache_create
add_region
vrb_mr_reg_common
ibv_reg_mr 注册内存
md->mr_fid.key = md->mr->rkey 远程键
md->lkey = md->mr->lkey 本地键
vrb_eq_write_event 写完成事件
rxm_mr_init
fi_mr_key 取回内存key
...desc.info.fi_mr_key = 远程键
NA_Mem_handle_get_serialize_size na_ofi_mem_handle_get_serialize_size
sizeof iovec info + iovcnt
HG_Forward -> 客户端调用RDMA的发送接口, 服务端调用 HG_Bulk_transfer HG_BULK_PULL 拉取数据
save_completed
do ... while -> 循环
ret = HG_Trigger(state.hg_context, 0, 1, &count)
HG_Progress(state.hg_context, 100)
ret = HG_Context_destroy(state.hg_context)
hg_return_t err = HG_Finalize(state.hg_class)
----------
server:
mercury/05_bulk/server.c
./server addr
main
HG_Init
HG_Addr_self
HG_Addr_to_string
HG_Context_create
HG_Register_data
do
HG_Trigger
HG_Progress
save
HG_Bulk_transfer(stt->hg_context, save_bulk_completed, my_rpc_state, HG_BULK_PULL, info->addr, in.bulk_handle, 0, my_rpc_state->bulk_handle, 0, my_rpc_state->size, HG_OP_ID_IGNORE) origin and local 对齐 -> 服务端通过RDMA的单边读请求, 从客户端DMA拉取数据
HG_Context_destroy
HG_Finalize
---------- bulk ----------
HG_Forward
...
hg_core_handle->ops.forward(hg_core_handle) -> hg_core_forward_na
na_ret = NA_Msg_send_unexpected(hg_core_handle->na_class,
hg_core_handle->na_context, hg_core_send_input_cb, hg_core_handle,
hg_core_handle->core_handle.in_buf, hg_core_handle->in_buf_used,
hg_core_handle->in_buf_plugin_data, hg_core_handle->na_addr,
hg_core_handle->core_handle.info.context_id, hg_core_handle->tag,
hg_core_handle->na_send_op_id);
...
na_ofi_msg_send_unexpected(na_class_t *na_class -> na_ofi_msg_send
rc = fi_senddata(ep, msg_info->buf.const_ptr, msg_info->buf_size,
msg_info->desc, msg_info->tag & NA_OFI_TAG_MASK, msg_info->fi_addr,
context);
ofi+verbs;ofi_rxm://192.169.29.63:55555, class_name:ofi, protocol_name:verbs
HG_Core_init_opt2
hg_core_init
NA_Initialize_opt -> Initialize NA if not provided externally
na_plugin_check_protocol
na_ofi_check_protocol(const char *protocol_name)
uint32_t runtime_version = fi_version()
FI_MAJOR(runtime_version), FI_MINOR(runtime_version)
type = na_ofi_prov_name_to_type(protocol_name) -> static const char *const na_ofi_prov_name[] = {NA_OFI_PROV_TYPES} -> 网络配置,插件, 协议等
na_ofi_getinfo(type, NULL, &providers)
// 设置网络参数(提示信息,过滤提供者)
hints = fi_allocinfo()
hints->mode = FI_ASYNC_IOV -> FI_ASYNC_IOV 模式指示应用程序必须提供 IO 向量所需的缓冲。 设置后,应用程序不得修改长度 > 1 的 IO 向量,包括任何相关的内存描述符数组,直到关联操作完成
hints->ep_attr->type = FI_EP_RDM -> 设置端点类型为: 可靠的无连接(connectionless)数据报消息类型
hints->caps = FI_MSG | FI_TAGGED | FI_RMA | FI_DIRECTED_RECV
hints->tx_attr->msg_order = FI_ORDER_SAS -> msg_order:保证具有相同标签的消息是有序的。 (FI_ORDER_SAS - 发送后发送(send after send)。如果设置,消息发送操作(包括标记的发送)将按照相对于其他消息发送的提交顺序进行传输。如果未设置,则消息发送可能会不按其提交的顺序进行传输)
hints->tx_attr->op_flags = FI_INJECT_COMPLETE -> 当可以安全地重用缓冲区时生成完成事件
hints->domain_attr->av_type = FI_AV_MAP
hints->domain_attr->resource_mgmt = FI_RM_ENABLED
hints->fabric_attr->prov_name = strdup(na_ofi_prov_name[prov_type]) -> 提供者: verbs;ofi_rxm
hints->mode |= FI_CONTEXT
hints->ep_attr->protocol = (uint32_t) na_ofi_prov_ep_proto[prov_type]
hints->caps |= na_ofi_prov_extra_caps[prov_type]
hints->domain_attr->control_progress = na_ofi_prov_progress[prov_type]
hints->domain_attr->data_progress = na_ofi_prov_progress[prov_type]
rc = fi_getinfo(NA_OFI_VERSION, node, service, flags, hints, fi_info_p)
fi_freeinfo(providers)
ret = na_private_class->na_class.ops->initialize( &na_private_class->na_class, na_info, listen) -> na_ofi_initialize(
na_ofi_prov_addr_format(prov_type, na_init_info.addr_format)
info.thread_mode = ((na_init_info.thread_mode & NA_THREAD_MODE_SINGLE)
na_ofi_class = na_ofi_class_alloc();
ret = na_ofi_verify_info
na_ofi_class->msg_recv_unexpected == na_ofi_msg_recv
na_ofi_class->opt_features |= NA_OPT_MULTI_RECV
na_ofi_class->cq_poll = na_ofi_cq_poll_no_source
na_ofi_fabric_open(prov_type, na_ofi_class->fi_info->fabric_attr, &na_ofi_class->fabric)
na_ofi_domain_open(na_ofi_class->fabric, na_init_info.auth_key,
no_wait, na_ofi_prov_flags[prov_type] & NA_OFI_SEP,
na_ofi_prov_flags[prov_type] & NA_OFI_DOM_SHARED, na_ofi_class->fi_info,
&na_ofi_class->domain)
na_ofi_class->context_max = na_init_info.max_contexts
na_ofi_class->use_sep
na_ofi_endpoint_open(na_ofi_class->fabric, na_ofi_class->domain,
na_ofi_class->no_wait, na_ofi_class->use_sep, na_ofi_class->context_max,
na_init_info.max_unexpected_size, na_init_info.max_expected_size,
na_ofi_class->fi_info, &na_ofi_class->endpoint)
pool_chunk_size
na_ofi_class->send_pool = hg_mem_pool_create(pool_chunk_size
na_ofi_class->recv_pool = hg_mem_pool_create(pool_chunk_size
for (i = 0; i < NA_OFI_ADDR_POOL_COUNT; i++) -> 地址池
struct na_ofi_addr *na_ofi_addr = na_ofi_addr_alloc(na_ofi_class)
HG_QUEUE_PUSH_TAIL(&na_ofi_class->addr_pool.queue, na_ofi_addr, entry)
ret = na_ofi_endpoint_get_src_addr(na_ofi_class)
fi_getinfo
ret = prov->provider->getinfo(version, node, service, flags, hints, &cur); -> static int rxm_getinfo
ofi_is_wildcard_listen_addr
rxm_validate_atomic_hints(hints)
ofix_getinfo(version, node, service, flags, &rxm_util_prov, hints, rxm_info_to_core, rxm_info_to_rxm, info)
...
prov_mode = ofi_mr_get_prov_mode(api_version, user_info, prov_info)
prov_mode = ofi_cap_mr_mode(user_info->caps, prov_mode)
rxm_alter_info(hints, *info)
ofi_set_prov_attr(tail->fabric_attr, prov->provider)
发送数据
fi_senddata -> rxm_senddata
rxm_get_conn(rxm_ep, dest_addr, &rxm_conn)
...
rxm_conn_progress(ep)
ret = fi_eq_read(ep->msg_eq, &event, &cm_entry -> do loop -> vrb_eq_read(struct fid_eq *eq_fid, uint32_t *event, void *buf, size_t len, uint64_t flags)
eq = container_of(eq_fid, struct vrb_eq, eq_fid.fid) -> 事件队列, 参考结构: struct vrb_eq {
ret = vrb_eq_read_event(eq, event, buf, len, flags)
if (dlistfd_empty(&eq->list_head)) -> 读eq队列
rdma_get_cm_event(eq->channel, &cma_event);
vrb_eq_cm_process_event(eq, cma_event, event, buf, len) -> VERBS模块, 从事件队列中处理CM事件
RDMA_CM_EVENT_ROUTE_RESOLVED -> 客户端路由解析事件
ep->state = VRB_CONNECTING
rdma_connect(ep->id, &ep->conn_param) -> 主动发起连接请求(建连) -> int rdma_connect (struct rdma_cm_id *id, struct rdma_conn_param *conn_param) ->
rdma_ack_cm_event(cma_event)
rxm_handle_event(ep, event, &cm_entry, ret)
rxm_process_connect(cm_entry)
domain->flow_ctrl_ops->enable(conn->msg_ep, conn->ep->msg_info->rx_attr->size / 2) -> 流控 -> static int vrb_enable_ep_flow_ctrl
ep->peer_rq_credits = 1 + ep->saved_peer_rq_credits
vrb_ep2_domain(ep)->send_credits
conn->state = RXM_CM_CONNECTED -> 已建连
rxm_send_common -> rxm_send_common(struct rxm_ep *rxm_ep,
data_len = ofi_total_iov_len(iov, count)
assert(count <= rxm_ep->rxm_info->tx_attr->iov_limit) -> 限制4个iov
iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device)
if (data_len <= rxm_ep->eager_limit) -> eager 消息(小数据, 走 eager 协议直接发送), eager_limit=16KB=16384B, data_len=184
rxm_send_eager(rxm_ep, rxm_conn, iov, desc, count,
fi_send(rxm_conn->msg_ep, &eager_buf->pkt, total_len -> vrb_msg_ep_send
struct ibv_send_wr wr = {
.wr_id = VERBS_COMP(ep, (uintptr_t)context),
.opcode = IBV_WR_SEND,
.send_flags = VERBS_INJECT(ep, len, desc),
};
vrb_send_buf
struct ibv_sge sge = vrb_init_sge(buf, len, desc);
wr->sg_list = &sge;
wr->num_sge = 1;
vrb_post_send(ep, wr, 0)
if (!ep->sq_credits || !ep->peer_rq_credits) -> 流控
vrb_flush_cq(cq)
vrb_poll_cq(cq, &wc)
ret = ibv_poll_cq(cq->cq, 1, wc)
vrb_report_wc(cq, &wc)
(void) ofi_cq_write(&cq->util_cq, (void *) (uintptr_t) wc->wr_id,flags, len, NULL, data, 0)
ofi_cirque_commit(cq->cirq) -> 环形队列(circular queue)计数器+1
ep->sq_credits--
ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr)
buf转sge:
#define vrb_init_sge(buf, len, desc) (struct ibv_sge) \
{ .addr = (uintptr_t) buf, \
.length = (uint32_t) len, \
.lkey = (desc) ? ((struct vrb_mem_desc *) (desc))->lkey : 0 }
static int rxm_send_connect(struct rxm_conn *conn)
static int rxm_open_conn
ret = fi_endpoint(domain->msg_domain, msg_info, &msg_ep, conn)
int vrb_open_ep -> vrb_get_port_space -> return RDMA_PS_TCP
int vrb_create_ep
rdma_create_id
rdma_resolve_addr -> TODO 也将此调用转换为非阻塞(使用事件通道):在运行大型 MPI 作业时,这可能是为了更好地扩展所需要的。 使其成为非阻塞意味着我们无法在 EP 启用时创建 QP。 在使用 rdma_create_qp 创建 QP 之前,我们需要等待 RDMA_CM_EVENT_ADDR_RESOLVED 事件。 它还需要一个SW接收队列来存储启用EP后应用程序发布的recvs
ep->util_ep.ep_fid.fid.ops = &vrb_ep_ops
ep->util_ep.ep_fid.ops = &vrb_ep_base_ops
fi_ep_bind(msg_ep, &ep->msg_eq->fid, 0) -> static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
ofi_ep_bind_valid(&vrb_prov, bfid, flags)
rdma_migrate_id(ep->id, ep->eq->channel)
rxm_bind_comp(ep, msg_ep)
fi_ep_bind(msg_ep, &ep->msg_cq->fid, FI_TRANSMIT | FI_RECV)
ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags)
return fid_list_insert(&cq->ep_list
fi_enable(msg_ep); -> static int vrb_ep_enable(struct fid_ep *ep_fid)
vrb_msg_ep_get_qp_attr(ep, &attr)
ret = rdma_create_qp(ep->id, domain->pd, &attr)
conn->flow_ctrl = domain->flow_ctrl_ops->available(msg_ep) -> static bool vrb_flow_ctrl_available
ret = rxm_prepost_recv(ep, msg_ep) -> ep->msg_info->rx_attr->size = 128
rx_buf = ofi_buf_alloc(rxm_ep->rx_pool)
if (ofi_bufpool_grow(pool))
ret = ofi_bufpool_region_alloc(buf_region);
rxm_post_recv(rx_buf)
ret = (int) fi_recv(rx_buf->rx_ep, &rx_buf->pkt -> vrb_msg_ep_recv(struct fid_ep *ep_fid
struct ibv_recv_wr wr = {
.wr_id = (uintptr_t)context,
.num_sge = 1,
.sg_list = &sge,
.next = NULL,
};
ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr)
ret = ibv_post_recv(ep->ibv_qp, wr, &bad_wr)
conn->msg_ep = msg_ep
ret = rxm_init_connect_data(conn, &cm_data)
ret = fi_getopt(&conn->ep->msg_pep->fid, FI_OPT_ENDPOINT
cm_data->connect.port = ofi_addr_get_port(&conn->ep->addr.sa)
cm_data->connect.client_conn_id = rxm_conn_id(conn->peer->index) -> rx_size=128
fi_connect(conn->msg_ep, info->dest_addr, &cm_data, sizeof(cm_data)) -> vrb_msg_ep_connect(struct fid_ep *ep_fid,
vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr)
vrb_ep_prepare_rdma_cm_param(&ep->conn_param,
conn_param->responder_resources = RDMA_MAX_RESP_RES
conn_param->initiator_depth = RDMA_MAX_INIT_DEPTH
conn_param->flow_control = 1;
conn_param->rnr_retry_count = 7; -> 无限重试
ep->conn_param.retry_count = 15
rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT) -> 触发CM事件: RDMA_CM_EVENT_ROUTE_RESOLVED
rxm_ep_settings_init
rdma write, 单边写,
NA_Put
na_ofi_put
na_ofi_rma fi_writemsg = fi_rma_op .writemsg = -> include/rdma/fi_rma.h -> fi_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags)
ep->rma->writemsg(ep, msg, flags)
.writemsg = vrb_msg_ep_rma_writemsg
vrb_msg_ep_rma_writemsg -> prov/verbs/src/verbs_rma.c
struct ibv_send_wr wr
wr.opcode = IBV_WR_RDMA_WRITE | wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM 写|立即数写
vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc -> ssize_t vrb_send_iov
wr->sg_list = alloca(sizeof(*wr->sg_list) * count)
wr->sg_list[i].addr = (uintptr_t) iov[i].iov_base
wr->sg_list[i].length = iov[i].iov_len;
wr->send_flags = IBV_SEND_INLINE ?IBV_SEND_FENCE
wr->sg_list[0]
wr->num_sge =
vrb_post_send(ep, wr, flags) -> prov/verbs/src/verbs_ep.c -> ssize_t vrb_post_send
ibv_post_send(ep->ibv_qp, wr, &bad_wr)
所有标志(Flags), include/rdma/fabric.h -> #define FI_MSG (1ULL << 1) ...
服务端执行bulk拉取
HG_Bulk_transfer(hg_context_t *context
hg_bulk_transfer_na(hg_bulk_op_t op
case HG_BULK_PULL:
na_bulk_op = hg_bulk_na_get
na_ret = na_bulk_op(hg_bulk_op_id->na_class -> hg_bulk_na_get -> return NA_Get -> na_class->ops->get -> na_ofi_get(na_class_t *na_class
na_ofi_rma_common(NA_OFI_CLASS(na_class), context, NA_CB_GET,
callback, arg, fi_readmsg, "fi_readmsg", FI_COMPLETION,
(struct na_ofi_mem_handle *) local_mem_handle, local_offset,
(struct na_ofi_mem_handle *) remote_mem_handle, remote_offset, length,
(struct na_ofi_addr *) remote_addr, remote_id,
(struct na_ofi_op_id *) op_id) -> 服务端拉取数据(READ)
..
na_ofi_rma_common(struct na_ofi_class *na_ofi_class,
struct iovec *local_iov = NA_OFI_IOV
void *local_desc = fi_mr_desc(na_ofi_mem_handle_local->fi_mr)
na_ofi_iov_translate
na_ofi_rma_iov_translate
na_ofi_rma_post(na_ofi_context->fi_tx, rma_info, &na_ofi_op_id->fi_ctx)
rc = rma_info->fi_rma_op -> rxm_ep_readmsg(struct fid_ep *ep_fid,
return rxm_ep_rma_common(rxm_ep, msg, flags | rxm_ep->util_ep.tx_msg_flags, fi_readmsg, FI_READ);
rxm_ep_rma_reg_iov(rxm_ep, msg_rma.msg_iov, msg_rma.desc, mr_desc, msg_rma.iov_count, comp_flags & (FI_WRITE | FI_READ), rma_buf)
fi_mr_desc(((struct rxm_mr *) desc[i])->msg_mr)
rma_msg(rxm_conn->msg_ep, &msg_rma, flags) -> vrb_msg_ep_rma_readmsg(struct fid_ep *ep_fid
struct ibv_send_wr wr = {
.wr_id = VERBS_COMP_READ_FLAGS(ep, flags, (uintptr_t)msg->context),
.opcode = IBV_WR_RDMA_READ,
.wr.rdma.remote_addr = msg->rma_iov->addr,
.wr.rdma.rkey = (uint32_t)msg->rma_iov->key,
.num_sge = msg->iov_count,
};
vrb_post_send(ep, &wr, 0)
vrb_flush_cq(cq)
ep->sq_credits--
ctx->op_queue = VRB_OP_SQ
ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr) -> 提交WR,将客户端数据读过来(DMA)
wr->wr_id = (uintptr_t) ctx->user_ctx
slist_insert_tail(&ctx->entry, &ep->sq_list) -> 触发 vrb_flush_sq 遍历该发送队列 -> while (!slist_empty(&ep->sq_list))
下刷发送队列调用栈
(gdb) bt
#0 vrb_flush_sq (ep=0xb7a4d0) at prov/verbs/src/verbs_ep.c:456
#1 0x00007ffff7339f90 in vrb_ep_close (fid=0xb7a4d0) at prov/verbs/src/verbs_ep.c:582
#2 0x00007ffff7349804 in fi_close (fid=0xb7a4d0) at ./include/rdma/fabric.h:631
#3 0x00007ffff734a605 in rxm_close_conn (conn=0xb756b8) at prov/rxm/src/rxm_conn.c:88
#4 0x00007ffff734cd88 in rxm_process_shutdown (conn=0xb756b8) at prov/rxm/src/rxm_conn.c:768
#5 0x00007ffff734d170 in rxm_handle_event (ep=0x78b570, event=3, cm_entry=0x7fffffffd3d0, len=16) at prov/rxm/src/rxm_conn.c:830
#6 0x00007ffff734d279 in rxm_conn_progress (ep=0x78b570) at prov/rxm/src/rxm_conn.c:850
#7 0x00007ffff736256e in rxm_ep_do_progress (util_ep=0x78b570) at prov/rxm/src/rxm_cq.c:1843
#8 0x00007ffff736268a in rxm_ep_progress (util_ep=0x78b570) at prov/rxm/src/rxm_cq.c:1863
#9 0x00007ffff72d5b7d in ofi_cq_progress (cq=0x787770) at prov/util/src/util_cq.c:498
#10 0x00007ffff72d51c0 in ofi_cq_readfrom (cq_fid=0x787770, buf=0x0, count=0, src_addr=0x0) at prov/util/src/util_cq.c:257
#11 0x00007ffff72d3a56 in fi_cq_readfrom (cq=0x787770, buf=0x0, count=0, src_addr=0x0) at ./include/rdma/fi_eq.h:400
#12 0x00007ffff72d5207 in ofi_cq_read (cq_fid=0x787770, buf=0x0, count=0) at prov/util/src/util_cq.c:264
#13 0x00007ffff72dbf4f in fi_cq_read (cq=0x787770, buf=0x0, count=0) at ./include/rdma/fi_eq.h:394
#14 0x00007ffff72dc4cc in util_poll_run (poll_fid=0x783390, context=0x7fffffffdb78, count=1) at prov/util/src/util_poll.c:95
#15 0x00007ffff72dc9a0 in fi_poll (pollset=0x783390, context=0x7fffffffdb78, count=1) at ./include/rdma/fi_eq.h:335
#16 0x00007ffff72de0bf in util_wait_fd_try (wait=0x783290) at prov/util/src/util_wait.c:366
#17 0x00007ffff72dd456 in ofi_trywait (fabric=0x781b40, fids=0x7fffffffdc40, count=1) at prov/util/src/util_wait.c:72
#18 0x00000000004266de in fi_trywait (count=1, fids=0x7fffffffdc40, fabric=<optimized out>) at /usr/local/include/rdma/fi_eq.h:323
#19 na_ofi_poll_try_wait (na_class=<optimized out>, context=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:8229
#20 0x0000000000417b4a in hg_core_poll_try_wait (context=0x786c80) at /home/xb/project/mercury/src/mercury_core.c:5058
#21 hg_core_progress (context=context@entry=0x786c80, timeout_ms=100) at /home/xb/project/mercury/src/mercury_core.c:5001
#22 0x000000000042043e in HG_Core_progress (context=0x786c80, timeout_ms=timeout_ms@entry=100) at /home/xb/project/mercury/src/mercury_core.c:6530
#23 0x000000000040bf02 in HG_Progress (context=<optimized out>, timeout=timeout@entry=100) at /home/xb/project/mercury/src/mercury.c:2178
#24 0x0000000000407741 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:69
创建RDMA事件通道调用栈
#0 0x00007ffff6c966d0 in rdma_create_event_channel () from /lib64/librdmacm.so.1
#1 0x00007ffff6c967c5 in ucma_alloc_id () from /lib64/librdmacm.so.1
#2 0x00007ffff6c96821 in rdma_create_id2.part.20 () from /lib64/librdmacm.so.1
#3 0x00007ffff6c96476 in ucma_init () from /lib64/librdmacm.so.1
#4 0x00007ffff6c9697a in rdma_create_id () from /lib64/librdmacm.so.1
#5 0x00007ffff7334ebe in vrb_ifa_rdma_info (ifa=0x6784b0, dev_name=0x7fffffffd428, rai=0x7fffffffd420) at prov/verbs/src/verbs_info.c:955
#6 0x00007ffff73359e6 in vrb_getifaddrs (verbs_devs=0x7ffff776aaa0 <verbs_devs>) at prov/verbs/src/verbs_info.c:1177
#7 0x00007ffff7336048 in vrb_init_info (all_infos=0x7ffff776a5e8 <vrb_util_prov+8>) at prov/verbs/src/verbs_info.c:1378
#8 0x00007ffff73379c9 in vrb_getinfo (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x66a4c0, info=0x7fffffffd600) at prov/verbs/src/verbs_info.c:1872
#9 0x00007ffff729730c in fi_getinfo_ (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x66a4c0, info=0x7fffffffd750) at src/fabric.c:1282
#10 0x00007ffff72c70c0 in ofi_get_core_info (version=65549, node=0x0, service=0x0, flags=0, util_prov=0x7ffff776b960 <rxm_util_prov>, util_hints=0x6681e0, base_attr=0x7ffff776b8e0 <rxm_verbs_info>, info_to_core=0x7ffff73448f8 <rxm_info_to_core>, core_info=0x7fffffffd750)
at prov/util/src/util_attr.c:305
#11 0x00007ffff72c71da in ofix_getinfo (version=65549, node=0x0, service=0x0, flags=0, util_prov=0x7ffff776b960 <rxm_util_prov>, hints=0x6681e0, info_to_core=0x7ffff73448f8 <rxm_info_to_core>, info_to_util=0x7ffff7344ff7 <rxm_info_to_rxm>, info=0x7fffffffd8c0)
at prov/util/src/util_attr.c:328
#12 0x00007ffff7345d68 in rxm_getinfo (version=65549, node=0x0, service=0x0, flags=0, hints=0x6681e0, info=0x7fffffffd8c0) at prov/rxm/src/rxm_init.c:557
#13 0x00007ffff729730c in fi_getinfo_ (version=65549, node=0x0, service=0x0, flags=0, hints=0x6681e0, info=0x7fffffffd9c0) at src/fabric.c:1282
--------------------------
#14 0x000000000042a22f in na_ofi_getinfo (prov_type=prov_type@entry=NA_OFI_PROV_VERBS_RXM, info=info@entry=0x0, fi_info_p=fi_info_p@entry=0x7fffffffd9c0) at /home/xb/project/mercury/src/na/na_ofi.c:3247
#15 0x00000000004313ee in na_ofi_check_protocol (protocol_name=protocol_name@entry=0x6681a0 "verbs;ofi_rxm") at /home/xb/project/mercury/src/na/na_ofi.c:6679
#16 0x00000000004225ce in na_plugin_check_protocol (class_ops=0x44a2e0 <na_plugin_static_g>, ops_p=<synthetic pointer>, protocol_name=<optimized out>, class_name=0x668180 "ofi") at /home/xb/project/mercury/src/na/na.c:451
#17 NA_Initialize_opt2 (info_string=info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://175.16.53.61:55555", listen=<optimized out>, version=version@entry=262144, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:776
#18 0x00000000004229d1 in NA_Initialize_opt (info_string=info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://175.16.53.61:55555", listen=<optimized out>, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:734
#19 0x0000000000413378 in hg_core_init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0, class_p=class_p@entry=0x7fffffffdcb8)
at /home/xb/project/mercury/src/mercury_core.c:1225
#20 0x000000000041bce1 in HG_Core_init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info=hg_init_info@entry=0x0)
at /home/xb/project/mercury/src/mercury_core.c:5620
#21 0x000000000040967f in HG_Init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0) at /home/xb/project/mercury/src/mercury.c:1100
#22 0x000000000040987d in HG_Init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '\001') at /home/xb/project/mercury/src/mercury.c:1041
#23 0x0000000000407681 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:42
vrb_getifaddrs
vrb_ifa_rdma_info
rdma_getaddrinfo
rdma_bind_addr
*dev_name = strdup(ibv_get_device_name(id->verbs->device))
#0 0x00007ffff6c966d0 in rdma_create_event_channel () from /lib64/librdmacm.so.1
#1 0x00007ffff6c967c5 in ucma_alloc_id () from /lib64/librdmacm.so.1
#2 0x00007ffff6c96821 in rdma_create_id2.part.20 () from /lib64/librdmacm.so.1
#3 0x00007ffff7320cc0 in vrb_get_rai_id (node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, rai=0x7fffffffd4c8, id=0x7fffffffd4c0) at prov/verbs/src/verbs_init.c:278
#4 0x00007ffff733745a in vrb_handle_sock_addr (node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, info=0x7fffffffd650) at prov/verbs/src/verbs_info.c:1770
#5 0x00007ffff733764c in vrb_get_match_infos (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, raw_info=0x7ffff776a5e8 <vrb_util_prov+8>, info=0x7fffffffd650) at prov/verbs/src/verbs_info.c:1805
#6 0x00007ffff7337a01 in vrb_getinfo (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, info=0x7fffffffd650) at prov/verbs/src/verbs_info.c:1876
#7 0x00007ffff729730c in fi_getinfo_ (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, info=0x78b700) at src/fabric.c:1282
#8 0x00007ffff72c70c0 in ofi_get_core_info (version=65549, node=0x0, service=0x0, flags=0, util_prov=0x7ffff776b960 <rxm_util_prov>, util_hints=0x784540, base_attr=0x0, info_to_core=0x7ffff73448f8 <rxm_info_to_core>, core_info=0x78b700) at prov/util/src/util_attr.c:305
#9 0x00007ffff7353f15 in rxm_open_core_res (ep=0x78b560) at prov/rxm/src/rxm_ep.c:1748
#10 0x00007ffff73545b3 in rxm_endpoint (domain=0x788450, info=0x6683b0, ep_fid=0x670ad0, context=0x0) at prov/rxm/src/rxm_ep.c:1925
#11 0x0000000000434563 in fi_endpoint (context=0x0, ep=0x670ad0, info=0x6683b0, domain=<optimized out>) at /usr/local/include/rdma/fi_endpoint.h:178
#12 na_ofi_basic_ep_open (na_ofi_endpoint=0x670ad0, no_wait=false, fi_info=0x6683b0, na_ofi_domain=0x780ca0, na_ofi_fabric=0x7836b0) at /home/xb/project/mercury/src/na/na_ofi.c:4385
#13 na_ofi_endpoint_open (na_ofi_endpoint_p=0x66a4f0, fi_info=0x6683b0, expected_msg_size_max=0, unexpected_msg_size_max=0, max_contexts=<optimized out>, sep=false, no_wait=false, na_ofi_domain=0x780ca0, na_ofi_fabric=0x7836b0) at /home/xb/project/mercury/src/na/na_ofi.c:4357
#14 na_ofi_initialize (na_class=<optimized out>, na_info=<optimized out>, listen=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:6867
#15 0x00000000004227cb in NA_Initialize_opt2 (info_string=info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", listen=<optimized out>, version=version@entry=262144, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:819
#16 0x00000000004229d1 in NA_Initialize_opt (info_string=info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", listen=<optimized out>, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:734
#17 0x0000000000413378 in hg_core_init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0, class_p=class_p@entry=0x7fffffffdcb8)
at /home/xb/project/mercury/src/mercury_core.c:1225
#18 0x000000000041bce1 in HG_Core_init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info=hg_init_info@entry=0x0)
at /home/xb/project/mercury/src/mercury_core.c:5620
#19 0x000000000040967f in HG_Init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0) at /home/xb/project/mercury/src/mercury.c:1100
#20 0x000000000040987d in HG_Init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '\001') at /home/xb/project/mercury/src/mercury.c:1041
#21 0x0000000000407681 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:42
rdma_listen: 服务端RDMA监听, 调用栈
#0 0x00007ffff6c95dd0 in rdma_listen () from /lib64/librdmacm.so.1
#1 0x00007ffff7324566 in vrb_pep_listen (pep_fid=0x783dd0) at prov/verbs/src/verbs_cm.c:535
#2 0x00007ffff7349ea9 in fi_listen (pep=0x783dd0) at ./include/rdma/fi_cm.h:91
#3 0x00007ffff734dcee in rxm_start_listen (ep=0x78b560) at prov/rxm/src/rxm_conn.c:1009
#4 0x00007ffff7353959 in rxm_ep_ctrl (fid=0x78b560, command=6, arg=0x0) at prov/rxm/src/rxm_ep.c:1607
#5 0x00000000004346cf in fi_enable (ep=<optimized out>) at /usr/local/include/rdma/fi_endpoint.h:217
#6 na_ofi_basic_ep_open (na_ofi_endpoint=0x670ad0, no_wait=false, fi_info=0x6683b0, na_ofi_domain=<optimized out>, na_ofi_fabric=0x7836b0) at /home/xb/project/mercury/src/na/na_ofi.c:4427
#7 na_ofi_endpoint_open (na_ofi_endpoint_p=0x66a4f0, fi_info=0x6683b0, expected_msg_size_max=0, unexpected_msg_size_max=0, max_contexts=<optimized out>, sep=false, no_wait=false, na_ofi_domain=<optimized out>, na_ofi_fabric=0x7836b0) at /home/xb/project/mercury/src/na/na_ofi.c:4357
#8 na_ofi_initialize (na_class=<optimized out>, na_info=<optimized out>, listen=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:6867
#9 0x00000000004227cb in NA_Initialize_opt2 (info_string=info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", listen=<optimized out>, version=version@entry=262144, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:819
#10 0x00000000004229d1 in NA_Initialize_opt (info_string=info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", listen=<optimized out>, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:734
#11 0x0000000000413378 in hg_core_init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0, class_p=class_p@entry=0x7fffffffdcb8)
at /home/xb/project/mercury/src/mercury_core.c:1225
#12 0x000000000041bce1 in HG_Core_init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info=hg_init_info@entry=0x0)
at /home/xb/project/mercury/src/mercury_core.c:5620
#13 0x000000000040967f in HG_Init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '\001', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0) at /home/xb/project/mercury/src/mercury.c:1100
#14 0x000000000040987d in HG_Init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '\001') at /home/xb/project/mercury/src/mercury.c:1041
#15 0x0000000000407681 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:42
(gdb) 发送端解析地址调用栈
#0 0x00007ffff6c97780 in rdma_resolve_addr () from /lib64/librdmacm.so.1
#1 0x00007ffff73211be in vrb_create_ep (ep=0xa42010, ps=RDMA_PS_TCP, id=0xa42200) at prov/verbs/src/verbs_init.c:346
#2 0x00007ffff733c492 in vrb_open_ep (domain=0x783730, info=0x793e00, ep_fid=0x7fffffffd790, context=0xa3ccb8) at prov/verbs/src/verbs_ep.c:1210
#3 0x00007ffff7349da7 in fi_endpoint (domain=0x783730, info=0x793e00, ep=0x7fffffffd790, context=0xa3ccb8) at ./include/rdma/fi_endpoint.h:178
#4 0x00007ffff734ad73 in rxm_open_conn (conn=0xa3ccb8, msg_info=0x793e00) at prov/rxm/src/rxm_conn.c:189
#5 0x00007ffff734b455 in rxm_send_connect (conn=0xa3ccb8) at prov/rxm/src/rxm_conn.c:289
#6 0x00007ffff734b5fb in rxm_connect (conn=0xa3ccb8) at prov/rxm/src/rxm_conn.c:321
#7 0x00007ffff734bcd3 in rxm_get_conn (ep=0x786300, addr=1, conn=0x7fffffffd928) at prov/rxm/src/rxm_conn.c:466
#8 0x00007ffff7357dba in rxm_senddata (ep_fid=0x786300, buf=0x7ffff7edb030, len=156, desc=0x7934b0, data=1, dest_addr=1, context=0xa3c310) at prov/rxm/src/rxm_msg.c:848
#9 0x000000000042762c in fi_senddata (context=0xa3c310, dest_addr=<optimized out>, data=<optimized out>, desc=<optimized out>, len=<optimized out>, buf=<optimized out>, ep=0x786300) at /usr/local/include/rdma/fi_endpoint.h:343
#10 na_ofi_msg_send (ep=0x786300, msg_info=0xa3c138, context=0xa3c310) at /home/xb/project/mercury/src/na/na_ofi.c:5065
#11 0x000000000042d088 in na_ofi_msg_send_unexpected (na_class=<optimized out>, context=<optimized out>, callback=<optimized out>, arg=0xa3bc50, buf=<optimized out>, buf_size=<optimized out>, plugin_data=<optimized out>, dest_addr=<optimized out>,
dest_id=<optimized out>, tag=<optimized out>, op_id=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:7554
#12 0x0000000000414b8e in NA_Msg_send_unexpected (op_id=<optimized out>, tag=<optimized out>, dest_id=<optimized out>, dest_addr=<optimized out>, plugin_data=<optimized out>, buf_size=<optimized out>, buf=<optimized out>, arg=0xa3bc50,
callback=0x41a620 <hg_core_send_input_cb>, context=<optimized out>, na_class=<optimized out>) at /home/xb/project/mercury/src/na/na.h:1140
#13 hg_core_forward_na (hg_core_handle=0xa3bc50) at /home/xb/project/mercury/src/mercury_core.c:3949
#14 0x00000000004200d5 in hg_core_forward (payload_size=128, flags=0 '\000', arg=0xa3c6c0, callback=0x408120 <hg_core_forward_cb>, hg_core_handle=0xa3bc50) at /home/xb/project/mercury/src/mercury_core.c:3887
#15 HG_Core_forward (handle=0xa3bc50, callback=callback@entry=0x408120 <hg_core_forward_cb>, arg=arg@entry=0xa3c6c0, flags=<optimized out>, payload_size=128) at /home/xb/project/mercury/src/mercury_core.c:6432
#16 0x000000000040bdad in HG_Forward (handle=0xa3c6c0, callback=callback@entry=0x4080a0 <save_completed>, arg=arg@entry=0x7fffffffdda0, in_struct=in_struct@entry=0x7fffffffdbb0) at /home/xb/project/mercury/src/mercury.c:2111
#17 0x000000000040807b in lookup_callback (callback_info=0x7fffffffdc00) at /home/xb/project/mercury/Examples/src/client.c:132
#18 0x0000000000408245 in hg_core_addr_lookup_cb (callback_info=<optimized out>) at /home/xb/project/mercury/src/mercury.c:437
#19 0x00000000004186d5 in hg_core_trigger_lookup_entry (hg_core_op_id=0x786260) at /home/xb/project/mercury/src/mercury_core.c:5387
#20 hg_core_trigger (context=0x78d750, timeout_ms=timeout_ms@entry=0, max_count=max_count@entry=1, actual_count_p=actual_count_p@entry=0x7fffffffdd7c) at /home/xb/project/mercury/src/mercury_core.c:5339
#21 0x000000000042072f in HG_Core_trigger (context=<optimized out>, timeout=timeout@entry=0, max_count=max_count@entry=1, actual_count_p=actual_count_p@entry=0x7fffffffdd7c) at /home/xb/project/mercury/src/mercury_core.c:6577
#22 0x000000000040c162 in HG_Trigger (context=<optimized out>, timeout=timeout@entry=0, max_count=max_count@entry=1, actual_count_p=actual_count_p@entry=0x7fffffffdd7c) at /home/xb/project/mercury/src/mercury.c:2197
#23 0x0000000000407896 in main (argc=<optimized out>, argv=0x7fffffffdec8) at /home/xb/project/mercury/Examples/src/client.c:78
|