Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug]: EFA: ibv_open_device() eventually fails when running in loop #306

Closed
3 tasks done
tvegas1 opened this issue Jun 19, 2024 · 2 comments
Closed
3 tasks done

[Bug]: EFA: ibv_open_device() eventually fails when running in loop #306

tvegas1 opened this issue Jun 19, 2024 · 2 comments
Labels
bug Report errors or unexpected behavior Linux EFA driver triage Determine the priority and severity

Comments

@tvegas1
Copy link

tvegas1 commented Jun 19, 2024

Preliminary Actions

Driver Type

Linux kernel driver for Elastic Fabric Adapter (EFA)

Driver Tag/Commit

2.8.0g

Custom Code

No

OS Platform and Distribution

5.15.0-1055-aws #60~20.04.1-Ubuntu SMP

$ cat /sys/class/infiniband/rdmap79s0/device/driver/module/version
2.8.0g
$ cat /sys/class/infiniband/rdmap79s0/device/device
0xefa1

Bug description

The program below eventually fails with ENOMEM after a few loop iterations. It can be reproduced at will by restarting the program. When the call to ibv_create_comp_channel() is removed, the failure no longer seems to reproduce.

Is the cq creation with comp_channel supported on EFA?

Reproduction steps

Source for ibv.c is at the end of the description:

$ gcc ./ibv.c -libverbs && ./a.out
Using rdmap79s0:
................................................................
................................................................
................................................................
................................................................
ibv_open_device(rdmap79s0) failed: Cannot allocate memory (12)

Expected Behavior

If creating a CQ with a completion channel is not supported: the ibv call should probably fail with an error
If cq with completion channel is supported: no failure, even when running in loop

Actual Behavior

The call ibv_open_device(rdmap79s0) eventually fails with ENOMEM.

Additional Data

No response

Relevant log output


$ strace ./a.out
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffd8f0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf50) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde30) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdc30) = 0
mmap(NULL, 4096, PROT_READ, MAP_SHARED, 3, 0) = 0x155555552000
mmap(NULL, 4096, PROT_WRITE, MAP_SHARED, 3, 0x1000) = 0x155555008000
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf00) = 0
munmap(0x155555008004, 4096)            = -1 EINVAL (Invalid argument)
munmap(0x155555552000, 4096)            = 0
close(5)                                = 0
close(3)                                = 0
close(4)                                = 0
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = 0 <============================ DID NOT FAIL
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffd8f0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf50) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde30) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdc30) = 0
mmap(NULL, 4096, PROT_READ, MAP_SHARED, 3, 0) = 0x155555552000
mmap(NULL, 4096, PROT_WRITE, MAP_SHARED, 3, 0x1000) = 0x155555007000
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf00) = 0
munmap(0x155555007004, 4096)            = -1 EINVAL (Invalid argument)
munmap(0x155555552000, 4096)            = 0
close(5)                                = 0
close(3)                                = 0
close(4)                                = 0
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = -1 ENOMEM (Cannot allocate memory) <====================== FAILS
close(3)                                = 0
write(1, "Using rdmap79s0:\n..............."..., 340Using rdmap79s0:
................................................................
................................................................
................................................................
................................................................
ibv_open_device(rdmap79s0) failed: Cannot allocate memory (12)
) = 340
exit_group(1)                           = ?
+++ exited with 1 +++
$ cat ibv.c
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <infiniband/verbs.h>

/*
 * Report a failed verbs call and terminate the process with exit status 1.
 *
 * name: name of the call that failed, printed as a string.
 * dev:  struct ibv_device * whose name is included in the message.
 *
 * errno is copied into a local before the message is built, so the calls
 * made while formatting cannot overwrite the value being reported. The body
 * is wrapped in do { } while (0) so the macro expands to a single statement;
 * callers supply the trailing semicolon.
 */
#define fatal(name, dev) do { \
        int fatal_errno = errno; \
        printf("\n%s(%s) failed: %s (%d)\n", \
               (name), ibv_get_device_name(dev), \
               strerror(fatal_errno), fatal_errno); \
        exit(1); \
} while (0)

/*
 * Run one open/teardown pass on the given device: open a context, create a
 * completion channel and a CQ attached to it, then destroy the CQ, destroy
 * the channel and close the context. Every failing step is reported through
 * fatal().
 */
static void cycle(struct ibv_device *device)
{
        struct ibv_context *ctx;
        struct ibv_comp_channel *channel = NULL;
        struct ibv_cq *queue;

        ctx = ibv_open_device(device);
        if (ctx == NULL) {
                fatal("ibv_open_device", device);
        }

#if 1   /* switch to 0 to create the CQ without a completion channel */
        channel = ibv_create_comp_channel(ctx);
        if (channel == NULL) {
                fatal("ibv_create_comp_channel", device);
        }
#endif

        queue = ibv_create_cq(ctx, 100, NULL, channel, 0);
        if (queue == NULL) {
                fatal("ibv_create_cq", device);
        }

        /* Tear down in reverse order of creation. */
        if (ibv_destroy_cq(queue) != 0) {
                fatal("ibv_destroy_cq", device);
        }

        /* channel is still NULL when the creation block above is compiled out. */
        if (channel != NULL && ibv_destroy_comp_channel(channel) != 0) {
                fatal("ibv_destroy_comp_channel", device);
        }

        if (ibv_close_device(ctx) != 0) {
                fatal("ibv_close_device", device);
        }
}

/*
 * Reproducer entry point: pick the first device in the verbs device list and
 * call cycle() on it in an endless loop, printing one '.' per iteration with
 * a line break every 64 iterations.
 *
 * Returns -1 if no device is available. Otherwise the loop never exits on
 * its own.
 */
int main(void)
{
        int num_devices = 0;
        struct ibv_device **device_list = ibv_get_device_list(&num_devices);

        if (!device_list || num_devices <= 0) {
                printf("Cannot get device list\n");
                /* A non-NULL list with zero entries still has to be released. */
                if (device_list) {
                        ibv_free_device_list(device_list);
                }
                return -1;
        }

        printf("Using %s:\n", ibv_get_device_name(device_list[0]));

        for (unsigned i = 0;; i++) {
                printf("%s.", (i && (i % 64) == 0) ? "\n" : "");
                cycle(device_list[0]);
        }

        /* Not reached: the loop above has no exit condition. */
        ibv_free_device_list(device_list);
        return 0;
}

Contact Details

No response

@tvegas1 tvegas1 added bug Report errors or unexpected behavior triage Determine the priority and severity labels Jun 19, 2024
@mrgolin
Copy link
Contributor

mrgolin commented Jun 20, 2024

@tvegas1 Thanks for your detailed description, we will look into it and update accordingly.

@YonatanNachum
Copy link
Contributor

Hey @tvegas1, thanks for the info.
We have found the bug in our rdma-core code: linux-rdma/rdma-core#1536

Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Report errors or unexpected behavior Linux EFA driver triage Determine the priority and severity
Projects
None yet
Development

No branches or pull requests

3 participants