GPU direct access to DMA memory over PCIe
Hi everyone.
We are using the Xavier NX module (L4T 32.5.2) on custom hardware with an FPGA-based frame grabber connected directly to the board's PCIe bus. Our custom FPGA driver allocates coherent memory (dma_alloc_coherent) for 32-bit DMA transfers of camera data. This memory is mapped into user space (dma_mmap_coherent), where the data is processed on the CPU.
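For reference, this is roughly the allocation path in our driver (a simplified sketch, not the actual code; the cxgx_* names and the buffer size are placeholders):

    #include <linux/dma-mapping.h>
    #include <linux/fs.h>
    #include <linux/mm.h>

    static struct device *cxgx_dma_dev;   /* PCIe device of the frame grabber */
    static void *dma_data;                /* kernel virtual address */
    static dma_addr_t dma_handle;         /* bus address programmed into the FPGA */
    static size_t dma_size = 4 << 20;     /* example size */

    static int cxgx_alloc_dma(struct device *dev)
    {
        cxgx_dma_dev = dev;
        /* The FPGA DMA engine only drives 32 address bits. */
        if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)))
            return -EIO;
        dma_data = dma_alloc_coherent(dev, dma_size, &dma_handle, GFP_KERNEL);
        return dma_data ? 0 : -ENOMEM;
    }

    static int cxgx_mmap(struct file *filp, struct vm_area_struct *vma)
    {
        /* file_operations.mmap handler: expose the buffer to user space. */
        return dma_mmap_coherent(cxgx_dma_dev, vma, dma_data, dma_handle,
                                 vma->vm_end - vma->vm_start);
    }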
Now we are trying to use this memory for CUDA processing directly on the GPU, without an intermediate memcpy. So far we cannot gain access to the data memory using ‘cudaHostRegister’: the call returns an ‘out of memory’ error, even though the device reports ‘cudaDevAttrHostRegisterSupported’.
unsigned int flag = 0;
int attr = 0;
cudaError_t ce;

cudaHostGetFlags(&flag, dma_data);
/* attr comes back as 1, i.e. host registration is reported as supported. */
cudaDeviceGetAttribute(&attr, cudaDevAttrHostRegisterSupported, 0);
// cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, cudaCpuDeviceId);
cudaSetDeviceFlags(cudaDeviceMapHost);

/* Fails with "out of memory" on the mmap'ed DMA buffer. */
ce = cudaHostRegister(dma_data, dma_size, cudaHostRegisterDefault);
if (ce != cudaSuccess) {
    printf("Error: Failed cudaHostRegister: %s, ptr %p\n",
           cudaGetErrorString(ce), dma_data);
    return 1;
}
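For completeness: if the registration succeeded, we would fetch a device pointer for the pinned buffer and hand it to our processing kernel roughly like this (process_frame and the launch configuration are placeholders for our actual code):

    void *dev_ptr = NULL;

    /* Map the pinned host buffer into the GPU address space. */
    ce = cudaHostGetDevicePointer(&dev_ptr, dma_data, 0);
    if (ce == cudaSuccess) {
        /* process_frame stands in for our CUDA processing kernel. */
        process_frame<<<blocks, threads>>>((unsigned char *)dev_ptr, dma_size);
        cudaDeviceSynchronize();
    }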
Our next approach was to allocate the memory in user space with mmap and huge pages.
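The allocation looks roughly like this (simplified; the size is an example, and huge pages must be reserved beforehand, e.g. via /proc/sys/vm/nr_hugepages):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>

    #define BUF_SIZE (2UL << 20)   /* one 2 MiB huge page, for example */

    int main(void)
    {
        /* Anonymous huge-page backed allocation. */
        void *buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* buf is then passed to the driver's pin ioctl (see below). */
        return 0;
    }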
Following GitHub - NVIDIA/jetson-rdma-picoevb (a minimal HW-based demo of GPUDirect RDMA on NVIDIA Jetson AGX Xavier running L4T) and the forum thread "Is GPU Direct RDMA supported on Xavier?", we added the ioctl functions to our FPGA driver and can access this memory in kernel space:
int cxgx_ioctl_pin_cuda(struct cxgx_file *pcxgx_file, unsigned long arg)
{
    void __user *argp = (void __user *)arg;
    struct cxgx_rdma_pin_cuda pin_params;
    struct nv_mem_context *nv_mem_context;
    int ret;

    if (copy_from_user(&pin_params, argp, sizeof(pin_params)))
        return -EFAULT;

    nv_mem_context = kzalloc(sizeof(*nv_mem_context), GFP_KERNEL);
    if (!nv_mem_context)
        return -ENOMEM;

    nv_mem_context->pcxgx_file = pcxgx_file;
    /* Align the user VA range to GPU page boundaries, as required by
     * nvidia_p2p_get_pages(). */
    nv_mem_context->page_virt_start = pin_params.va & GPU_PAGE_MASK;
    nv_mem_context->page_virt_end = (pin_params.va + pin_params.size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
    nv_mem_context->mapped_size = nv_mem_context->page_virt_end - nv_mem_context->page_virt_start;
    nv_mem_context->handle = -1;

    printk(KERN_INFO "page_virt_start 0x%llx page_virt_end 0x%llx mapped_size 0x%llx handle %i\n",
           (unsigned long long)nv_mem_context->page_virt_start,
           (unsigned long long)nv_mem_context->page_virt_end,
           (unsigned long long)nv_mem_context->mapped_size,
           nv_mem_context->handle);

    /* Pin the user pages; nv_mem_free_callback fires if they are revoked
     * asynchronously. */
    ret = nvidia_p2p_get_pages(nv_mem_context->page_virt_start, nv_mem_context->mapped_size,
                               &nv_mem_context->page_table, nv_mem_free_callback, nv_mem_context);
    if (ret < 0) {
        kfree(nv_mem_context);
        return ret;
    }

    printk(KERN_INFO "page_table %p version 0x%x\n",
           nv_mem_context->page_table, nv_mem_context->page_table->version);

    if (!NVIDIA_P2P_PAGE_TABLE_VERSION_COMPATIBLE(nv_mem_context->page_table)) {
        printk(KERN_ERR "incompatible page table version 0x%08x\n",
               nv_mem_context->page_table->version);
        ret = -EINVAL;
        goto put_pages;
    }

    switch (nv_mem_context->page_table->page_size) {
    case NVIDIA_P2P_PAGE_SIZE_4KB:
        printk(KERN_INFO "page_size = 4KB\n");
        break;
    case NVIDIA_P2P_PAGE_SIZE_64KB:
        printk(KERN_INFO "page_size = 64KB\n");
        break;
    case NVIDIA_P2P_PAGE_SIZE_128KB:
        printk(KERN_INFO "page_size = 128KB\n");
        break;
    default:
        printk(KERN_ERR "unexpected page_size\n");
        ret = -EINVAL;
        goto put_pages;
    }

    printk(KERN_INFO "page_table entries 0x%x pages %p *pages %p\n",
           nv_mem_context->page_table->entries,
           nv_mem_context->page_table->pages,
           *nv_mem_context->page_table->pages);

    /* Hand out an idr handle so user space can refer to this pin later. */
    mutex_lock(&pcxgx_file->lock);
    nv_mem_context->handle = idr_alloc(&pcxgx_file->nv_mem_idr, nv_mem_context, 0, 0, GFP_KERNEL);
    mutex_unlock(&pcxgx_file->lock);
    if (nv_mem_context->handle < 0) {
        ret = nv_mem_context->handle;
        goto put_pages;
    }

    pin_params.handle = nv_mem_context->handle;
    if (copy_to_user(argp, &pin_params, sizeof(pin_params))) {
        /* copy_to_user() returns the number of uncopied bytes, not an
         * errno, so translate it. */
        ret = -EFAULT;
        goto put_pages;
    }

    /* Map the pinned pages for DMA by the FPGA's PCIe function. */
    ret = nvidia_p2p_dma_map_pages(&cxgx_dev.pci_dev_ptr->dev, nv_mem_context->page_table,
                                   &nv_mem_context->dma_mapping, DMA_TO_DEVICE);
    if (ret)
        goto put_pages;

    nv_mem_context->npages = nv_mem_context->dma_mapping->entries;
    printk(KERN_INFO "dma_mapping entries %i hw_address 0x%llx hw_len 0x%llx\n",
           nv_mem_context->dma_mapping->entries,
           (unsigned long long)nv_mem_context->dma_mapping->hw_address[0],
           (unsigned long long)nv_mem_context->dma_mapping->hw_len[0]);
    return 0;

put_pages:
    if (nv_mem_context->handle >= 0) {
        mutex_lock(&pcxgx_file->lock);
        idr_remove(&pcxgx_file->nv_mem_idr, nv_mem_context->handle);
        mutex_unlock(&pcxgx_file->lock);
    }
    nvidia_p2p_put_pages(nv_mem_context->page_table);
    kfree(nv_mem_context);
    return ret;
}
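From user space we invoke this ioctl on the huge-page buffer roughly as follows (the ioctl request code CXGX_IOC_PIN_CUDA is illustrative; the struct layout must match the driver's cxgx_rdma_pin_cuda definition, and fd is the open device file of our driver):

    struct cxgx_rdma_pin_cuda pin = {
        .va   = (__u64)(uintptr_t)buf,   /* huge-page buffer from mmap() above */
        .size = BUF_SIZE,
    };

    if (ioctl(fd, CXGX_IOC_PIN_CUDA, &pin) < 0) {
        perror("CXGX_IOC_PIN_CUDA");
        return 1;
    }
    printf("pinned, driver handle %d\n", pin.handle);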
Everything works as expected, but our FPGA DMA controller cannot address DMA addresses beyond 32 bits, so we cannot deliver data to the CPU/GPU.
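To illustrate: right after nvidia_p2p_dma_map_pages() we can check in the driver whether the returned IOVAs fit the FPGA's 32-bit limit (a sketch, using only the fields from the code above), and the mapped addresses fail this check:

    /* Verify each IOVA fits below the 4 GiB boundary before programming
     * the FPGA's DMA engine. */
    int i;
    for (i = 0; i < nv_mem_context->dma_mapping->entries; i++) {
        unsigned long long start = nv_mem_context->dma_mapping->hw_address[i];
        unsigned long long end = start + nv_mem_context->dma_mapping->hw_len[i];
        if (end > 0x100000000ULL)
            printk(KERN_ERR "entry %d not reachable with 32-bit DMA: 0x%llx\n",
                   i, start);
    }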
Thank you for any information you can provide.