Stratix 10 GX Dev Kit OpenCL: aocl diagnose error
Hello,
In my previous post I got the Intel OpenCL BSP for Stratix 10 version 20.2 running on the Stratix 10 GX Dev Kit (part 1SG280HU2F50E2VG). After also adjusting the driver for Linux kernel 6.6, I could run `aocl diagnose`.
Now, I get the following errors about memory containing the wrong data:
-------------------------------------------------------------------- BSP Diagnostics -------------------------------------------------------------------- Using Device with name: s10gx : Stratix 10 Reference Platform (acls10_ref0) Using Device from vendor: Intel(R) Corporation clGetDeviceInfo CL_DEVICE_GLOBAL_MEM_SIZE = 2147482624 clGetDeviceInfo CL_DEVICE_MAX_MEM_ALLOC_SIZE = 2147482624 Allocated 2147482624 bytes Actual maximum buffer size = 2147482624 bytes Writing 2047 MB to global memory ... Allocated 1073741824 Bytes host buffer for large transfers Write speed: 5897.15 MB/s [5848.87 -> 5946.23] Reading and verifying 2047 MB from global memory ... Verification failure at element 384, expected 180 but read back 10200000138 First failure at address c00 Verification failure at element 385, expected 181 but read back 220100000139 Verification failure at element 386, expected 182 but read back 418c0000013a Verification failure at element 387, expected 183 but read back 22640000013b Verification failure at element 388, expected 184 but read back 64010000013c Verification failure at element 389, expected 185 but read back 6000000013d Verification failure at element 390, expected 186 but read back 1a020000013e Verification failure at element 391, expected 187 but read back 20000013f Verification failure at element 392, expected 188 but read back 60000000148 Verification failure at element 393, expected 189 but read back 1a0200000149 Verification failure at element 394, expected 18a but read back 20000014a Verification failure at element 395, expected 18b but read back 64010000014b Verification failure at element 396, expected 18c but read back 22010000014c Verification failure at element 397, expected 18d but read back 418c0000014d Verification failure at element 398, expected 18e but read back 22640000014e Verification failure at element 399, expected 18f but read back 1020000014f Verification failure at element 400, expected 190 but read back 150 Verification failure 
at element 401, expected 191 but read back 151 Verification failure at element 402, expected 192 but read back 152 Verification failure at element 403, expected 193 but read back 153 Verification failure at element 404, expected 194 but read back 154 Verification failure at element 405, expected 195 but read back 155 Verification failure at element 406, expected 196 but read back 156 Verification failure at element 407, expected 197 but read back 157 Verification failure at element 408, expected 198 but read back 60000000158 Verification failure at element 409, expected 199 but read back 1a0200000159 Verification failure at element 410, expected 19a but read back 20000015a Verification failure at element 411, expected 19b but read back 64010000015b Verification failure at element 412, expected 19c but read back 22010000015c Verification failure at element 413, expected 19d but read back 418c0000015d Verification failure at element 414, expected 19e but read back 22640000015e Verification failure at element 415, expected 19f but read back 1020000015f Suppressing error output, counting # of errors ... Read speed: 6394.52 MB/s [6392.18 -> 6396.86] Failed write/readback test with 243050504 errors Error: Global memory test failed Error code: 0
I made fairly small adjustments to the BSP (changed the device part number and slightly adjusted PLACE_REGION and ROUTE_REGION in base.qsf).
For the driver, I made the following adjustments:
+++ b/./aclpci.c @@ -55,7 +55,6 @@ MODULE_AUTHOR ("Dmitry Denisenko"); MODULE_DESCRIPTION ("Driver for Intel(R) OpenCL Acceleration Boards"); -MODULE_SUPPORTED_DEVICE ("Intel(R) OpenCL Boards"); MODULE_LICENSE("GPL"); @@ -409,8 +408,8 @@ int init_irq (struct pci_dev *dev, void *dev_id) { if(pci_enable_msi(dev) != 0){ ACL_DEBUG (KERN_WARNING "Could not enable MSI"); } - if (!pci_set_dma_mask(dev, DMA_BIT_MASK(64))) { - pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(64)); + if (!dma_set_mask(&dev->dev, DMA_BIT_MASK(64))) { + dma_set_coherent_mask(&dev->dev, DMA_BIT_MASK(64)); ACL_DEBUG (KERN_WARNING "using a 64-bit irq mask\n"); } else { ACL_DEBUG (KERN_WARNING "unable to use 64-bit irq mask\n"); @@ -813,7 +812,7 @@ static int __init aclpci_init(void) { } aclpci_major = MAJOR(dev); - aclpci_class = class_create(THIS_MODULE, DRIVER_NAME); + aclpci_class = class_create(DRIVER_NAME); if (IS_ERR(aclpci_class)) { printk(KERN_ERR "aclpci: can't create class\n"); goto err_unchr; +++ b/./aclpci_cmd.c @@ -294,12 +294,12 @@ static int __aclpci_get_user_pages(struct task_struct *target_task, unsigned lon for (got = 0; got < num_pages; got += ret) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) - ret = get_user_pages_remote(target_task, target_task->mm, + ret = get_user_pages_remote(target_task->mm, start_page + got * PAGE_SIZE, num_pages - got, FOLL_WRITE|FOLL_FORCE, p + got, - vma, NULL); + NULL); #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0) ret = get_user_pages_remote(target_task, target_task->mm, start_page + got * PAGE_SIZE, @@ -350,9 +350,9 @@ int aclpci_get_user_pages(struct task_struct *target_task, unsigned long start_p if( target_task->mm == NULL) { ret = -EIO; } else { - down_write(&target_task->mm->mmap_sem); + down_write(&target_task->mm->mmap_lock); ret = __aclpci_get_user_pages(target_task, start_page, num_pages, p, NULL); - up_write(&target_task->mm->mmap_sem); + up_write(&target_task->mm->mmap_lock); } return ret; @@ -361,13 +361,13 @@ int 
aclpci_get_user_pages(struct task_struct *target_task, unsigned long start_p void aclpci_release_user_pages(struct task_struct *target_task, struct page **p, size_t num_pages) { if( target_task->mm != NULL) { - down_write(&target_task->mm->mmap_sem); + down_write(&target_task->mm->mmap_lock); __aclpci_release_user_pages(p, num_pages, 1); target_task->mm->locked_vm -= num_pages; - up_write(&target_task->mm->mmap_sem); + up_write(&target_task->mm->mmap_lock); } } +++ b/./aclpci_dma.c @@ -355,7 +355,7 @@ int lock_dma_buffer (struct aclpci_dev *aclpci, void *addr, ssize_t len, struct dma->ptr = addr; dma->len = len; - dma->dir = d->m_read ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE; + dma->dir = d->m_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE; /* num_pages that [addr, addr+len] map to. */ start_page = (ssize_t)addr >> PAGE_SHIFT; end_page = ((ssize_t)addr + len - 1) >> PAGE_SHIFT; @@ -393,7 +393,7 @@ int lock_dma_buffer (struct aclpci_dev *aclpci, void *addr, ssize_t len, struct // ACL_DEBUG (KERN_DEBUG "p[%d] = 0x%p", i, cur); if (cur != NULL) { // ACL_DEBUG (KERN_DEBUG " phys_addr = 0x%llx", page_to_phys(cur)); - phys = pci_map_page (d->m_pci_dev, cur, 0, PAGE_SIZE, dma->dir); + phys = dma_map_page (&d->m_pci_dev->dev, cur, 0, PAGE_SIZE, dma->dir); if (phys == 0) { ACL_DEBUG (KERN_DEBUG " Couldn't pci_map_page!"); return -EFAULT; @@ -445,7 +445,7 @@ void unlock_dma_buffer (struct aclpci_dev *aclpci, struct dma_t *dma) { // ACL_DEBUG (KERN_DEBUG "p[%d] = %p", i, cur); if (cur != NULL) { dma_addr_t phys = dma->dma_addrs[i]; - pci_unmap_page (d->m_pci_dev, phys, PAGE_SIZE, dma->dir); + dma_unmap_page (&d->m_pci_dev->dev, phys, PAGE_SIZE, dma->dir); } } #endif @@ -1047,7 +1047,7 @@ int hostch_buffer_lock(struct aclpci_dev *aclpci, void *addr, ssize_t len, struc dma->ptr = addr; dma->len = len; - dma->dir = direction ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE; + dma->dir = direction ? DMA_FROM_DEVICE : DMA_TO_DEVICE; /* num_pages that [addr, addr+len] map to. 
*/ start_page = (ssize_t)addr >> PAGE_SHIFT; end_page = ((ssize_t)addr + len - 1) >> PAGE_SHIFT; +++ b/./aclpci_fileio.c @@ -150,7 +150,7 @@ int aclpci_open(struct inode *inode, struct file *file) { /* pointer to containing data structure of the character device inode */ aclpci = container_of(inode->i_cdev, struct aclpci_dev, cdev); - spin_lock(&aclpci->lock); + // spin_lock(&aclpci->lock); if (aclpci->num_handles_open) { printk("Device already in use\n"); result = -EBUSY; @@ -215,7 +215,7 @@ int aclpci_open(struct inode *inode, struct file *file) { result = 0; done: - spin_unlock(&aclpci->lock); + // spin_unlock(&aclpci->lock); up (&aclpci->sem); return result; }
Most modifications were just function names or parameters that had changed. Removing the locking in the last file was my workaround for the driver crashing with the message `BUG: scheduling while atomic` in dmesg. As far as I understand, removing the locking there should not be a problem as long as multiple processes do not call `aclpci_open()` at the same time.
I'd appreciate some feedback on where you think the problem with `aocl diagnose` could come from.
Thanks and best regards!
Felix