Lines matching refs: hdev
51 static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region, in hl_set_dram_bar() argument
54 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_set_dram_bar()
64 old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr); in hl_set_dram_bar()
73 int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, in hl_access_sram_dram_region() argument
76 struct pci_mem_region *region = &hdev->pci_mem_region[region_type]; in hl_access_sram_dram_region()
81 old_base = hl_set_dram_bar(hdev, addr, region, &bar_region_base); in hl_access_sram_dram_region()
86 acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + in hl_access_sram_dram_region()
111 rc = hl_set_dram_bar(hdev, old_base, region, NULL); in hl_access_sram_dram_region()
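The hl_access_sram_dram_region() lines above show the sliding-BAR pattern: move the DRAM BAR so it covers the target address, access through pcie_bar[bar_id] + offset_in_bar, then restore the old BAR base. A minimal userspace sketch of the window arithmetic only (BAR_SIZE and all names are assumptions, not the driver's API):

        /* Sliding-window sketch: align the window base down so 'addr'
         * falls inside a fixed-size BAR, then compute the in-window offset. */
        #include <stdint.h>
        #include <stdio.h>

        #define BAR_SIZE (256ULL * 1024 * 1024) /* assumed window size */

        static uint64_t window_base_for(uint64_t addr)
        {
                return addr & ~(BAR_SIZE - 1);
        }

        int main(void)
        {
                uint64_t addr = 0x12345678ULL;
                uint64_t base = window_base_for(addr);

                printf("bar base %#llx, offset %#llx\n",
                       (unsigned long long)base,
                       (unsigned long long)(addr - base));
                return 0;
        }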
119 static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, in hl_dma_alloc_common() argument
127 ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag); in hl_dma_alloc_common()
130 ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle); in hl_dma_alloc_common()
135 trace_habanalabs_dma_alloc(&(hdev)->pdev->dev, (u64) (uintptr_t) ptr, *dma_handle, in hl_dma_alloc_common()
141 static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *cpu_addr, in hl_asic_dma_free_common() argument
150 hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle); in hl_asic_dma_free_common()
153 hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle); in hl_asic_dma_free_common()
157 trace_habanalabs_dma_free(&(hdev)->pdev->dev, store_cpu_addr, dma_handle, size, caller); in hl_asic_dma_free_common()
160 void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, in hl_asic_dma_alloc_coherent_caller() argument
163 return hl_dma_alloc_common(hdev, size, dma_handle, flag, DMA_ALLOC_COHERENT, caller); in hl_asic_dma_alloc_coherent_caller()
166 void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void *cpu_addr, in hl_asic_dma_free_coherent_caller() argument
169 hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller); in hl_asic_dma_free_coherent_caller()
172 void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags, in hl_asic_dma_pool_zalloc_caller() argument
175 return hl_dma_alloc_common(hdev, size, dma_handle, mem_flags, DMA_ALLOC_POOL, caller); in hl_asic_dma_pool_zalloc_caller()
178 void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr, in hl_asic_dma_pool_free_caller() argument
181 hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller); in hl_asic_dma_pool_free_caller()
184 void *hl_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle) in hl_cpu_accessible_dma_pool_alloc() argument
186 return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle); in hl_cpu_accessible_dma_pool_alloc()
189 void hl_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, void *vaddr) in hl_cpu_accessible_dma_pool_free() argument
191 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr); in hl_cpu_accessible_dma_pool_free()
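hl_dma_alloc_common() and hl_asic_dma_free_common() funnel both the coherent and the pool allocation paths through one entry point that dispatches on an alloc-type enum and emits a trace event on success. A hedged userspace sketch of that dispatch shape (malloc/calloc stand in for the asic_funcs callbacks, printf for the trace hook):

        #include <stdio.h>
        #include <stdlib.h>

        enum alloc_kind { ALLOC_COHERENT, ALLOC_POOL };

        static void *alloc_common(size_t size, enum alloc_kind kind)
        {
                void *ptr = NULL;

                switch (kind) {
                case ALLOC_COHERENT:
                        ptr = malloc(size);    /* stands in for dma_alloc_coherent() */
                        break;
                case ALLOC_POOL:
                        ptr = calloc(1, size); /* stands in for dma_pool_zalloc() */
                        break;
                }

                if (ptr)
                        printf("trace: alloc %zu bytes -> %p\n", size, ptr);
                return ptr;
        }

        int main(void)
        {
                void *p = alloc_common(64, ALLOC_POOL);

                free(p);
                return 0;
        }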
194 int hl_dma_map_sgtable_caller(struct hl_device *hdev, struct sg_table *sgt, in hl_dma_map_sgtable_caller() argument
197 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_dma_map_sgtable_caller()
201 rc = hdev->asic_funcs->dma_map_sgtable(hdev, sgt, dir); in hl_dma_map_sgtable_caller()
209 trace_habanalabs_dma_map_page(&(hdev)->pdev->dev, in hl_dma_map_sgtable_caller()
222 int hl_asic_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, in hl_asic_dma_map_sgtable() argument
225 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_asic_dma_map_sgtable()
229 rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0); in hl_asic_dma_map_sgtable()
241 void hl_dma_unmap_sgtable_caller(struct hl_device *hdev, struct sg_table *sgt, in hl_dma_unmap_sgtable_caller() argument
244 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_dma_unmap_sgtable_caller()
248 hdev->asic_funcs->dma_unmap_sgtable(hdev, sgt, dir); in hl_dma_unmap_sgtable_caller()
252 trace_habanalabs_dma_unmap_page(&(hdev)->pdev->dev, in hl_dma_unmap_sgtable_caller()
264 void hl_asic_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, in hl_asic_dma_unmap_sgtable() argument
267 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_asic_dma_unmap_sgtable()
276 dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0); in hl_asic_dma_unmap_sgtable()
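Both map paths call dma_map_sgtable() and then, per the surrounding driver code, shift each entry by a device DMA offset (prop->device_dma_offset_for_host_access) so host pages appear in the device's address space. A sketch of that per-entry fixup, with the offset value and struct names assumed:

        #include <stdint.h>
        #include <stdio.h>

        struct sg_entry { uint64_t dma_addr; uint32_t len; };

        static void apply_dma_offset(struct sg_entry *ents, int n, uint64_t off)
        {
                for (int i = 0; i < n; i++)
                        ents[i].dma_addr += off; /* shift into device address space */
        }

        int main(void)
        {
                struct sg_entry ents[2] = { { 0x1000, 4096 }, { 0x3000, 4096 } };

                apply_dma_offset(ents, 2, 0x8000000000ULL); /* assumed offset */
                for (int i = 0; i < 2; i++)
                        printf("entry %d: %#llx\n", i,
                               (unsigned long long)ents[i].dma_addr);
                return 0;
        }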
287 int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val, in hl_access_cfg_region() argument
290 struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG]; in hl_access_cfg_region()
294 dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32)); in hl_access_cfg_region()
316 dev_err(hdev->dev, "access type %d is not supported\n", acc_type); in hl_access_cfg_region()
332 int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type, in hl_access_dev_mem() argument
337 return hl_access_cfg_region(hdev, addr, val, acc_type); in hl_access_dev_mem()
340 return hl_access_sram_dram_region(hdev, addr, val, acc_type, in hl_access_dev_mem()
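hl_access_cfg_region() rejects unaligned addresses up front (line 294) before dispatching on the access type. The check is a power-of-two mask test; a standalone sketch:

        #include <stdint.h>
        #include <stdio.h>

        static int check_aligned(uint64_t addr)
        {
                if (addr & (sizeof(uint32_t) - 1)) {
                        fprintf(stderr, "address %#llx not a multiple of %zu\n",
                                (unsigned long long)addr, sizeof(uint32_t));
                        return -1;
                }
                return 0;
        }

        int main(void)
        {
                printf("%d %d\n", check_aligned(0x1000), check_aligned(0x1002));
                return 0;
        }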
373 enum hl_device_status hl_device_status(struct hl_device *hdev) in hl_device_status() argument
377 if (hdev->device_fini_pending) { in hl_device_status()
379 } else if (hdev->reset_info.in_reset) { in hl_device_status()
380 if (hdev->reset_info.in_compute_reset) in hl_device_status()
384 } else if (hdev->reset_info.needs_reset) { in hl_device_status()
386 } else if (hdev->disabled) { in hl_device_status()
388 } else if (!hdev->init_done) { in hl_device_status()
397 bool hl_device_operational(struct hl_device *hdev, in hl_device_operational() argument
402 current_status = hl_device_status(hdev); in hl_device_operational()
419 bool hl_ctrl_device_operational(struct hl_device *hdev, in hl_ctrl_device_operational() argument
424 current_status = hl_device_status(hdev); in hl_ctrl_device_operational()
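hl_device_status() derives one status from a priority-ordered chain of flags (the first matching condition wins), and both *_operational() helpers then compare the result against an allowed set. A simplified sketch of the chain; the real function also distinguishes compute reset and reset-after-device-release:

        #include <stdio.h>

        enum dev_status { OPERATIONAL, IN_RESET, NEEDS_RESET, MALFUNCTION,
                          IN_DEVICE_CREATION };

        struct dev_flags { int fini_pending, in_reset, needs_reset, disabled,
                           init_done; };

        static enum dev_status status_of(const struct dev_flags *f)
        {
                if (f->fini_pending || f->in_reset)
                        return IN_RESET;
                if (f->needs_reset)
                        return NEEDS_RESET;
                if (f->disabled)
                        return MALFUNCTION;
                if (!f->init_done)
                        return IN_DEVICE_CREATION;
                return OPERATIONAL;
        }

        int main(void)
        {
                struct dev_flags f = { 0, 0, 0, 0, 1 };

                printf("status=%d\n", status_of(&f));
                return 0;
        }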
441 static void print_idle_status_mask(struct hl_device *hdev, const char *message, in print_idle_status_mask() argument
445 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx_%016llx)\n", in print_idle_status_mask()
446 dev_name(&hdev->pdev->dev), message, in print_idle_status_mask()
449 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx)\n", in print_idle_status_mask()
450 dev_name(&hdev->pdev->dev), message, in print_idle_status_mask()
453 dev_err(hdev->dev, "%s %s (mask %#llx_%016llx)\n", in print_idle_status_mask()
454 dev_name(&hdev->pdev->dev), message, idle_mask[1], idle_mask[0]); in print_idle_status_mask()
456 dev_err(hdev->dev, "%s %s (mask %#llx)\n", dev_name(&hdev->pdev->dev), message, in print_idle_status_mask()
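print_idle_status_mask() prints only as many 64-bit words of the idle mask as are populated, widest first. The same cascade in standalone C:

        #include <stdint.h>
        #include <stdio.h>

        static void print_mask(const uint64_t m[4])
        {
                if (m[3])
                        printf("mask %#llx_%016llx_%016llx_%016llx\n",
                               (unsigned long long)m[3], (unsigned long long)m[2],
                               (unsigned long long)m[1], (unsigned long long)m[0]);
                else if (m[2])
                        printf("mask %#llx_%016llx_%016llx\n",
                               (unsigned long long)m[2], (unsigned long long)m[1],
                               (unsigned long long)m[0]);
                else if (m[1])
                        printf("mask %#llx_%016llx\n",
                               (unsigned long long)m[1], (unsigned long long)m[0]);
                else
                        printf("mask %#llx\n", (unsigned long long)m[0]);
        }

        int main(void)
        {
                uint64_t m[4] = { 0xdeadbeef, 0x1, 0, 0 };

                print_mask(m);
                return 0;
        }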
465 struct hl_device *hdev; in hpriv_release() local
469 hdev = hpriv->hdev; in hpriv_release()
471 hdev->asic_funcs->send_device_activity(hdev, false); in hpriv_release()
484 reset_device = hdev->reset_upon_device_release || hdev->reset_info.watchdog_active; in hpriv_release()
489 if (!hdev->reset_info.in_reset && !reset_device && !hdev->pldm) in hpriv_release()
490 device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask, in hpriv_release()
493 print_idle_status_mask(hdev, "device is not idle after user context is closed", in hpriv_release()
507 mutex_lock(&hdev->fpriv_list_lock); in hpriv_release()
509 mutex_unlock(&hdev->fpriv_list_lock); in hpriv_release()
514 hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE); in hpriv_release()
517 int rc = hdev->asic_funcs->scrub_device_mem(hdev); in hpriv_release()
520 dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc); in hpriv_release()
521 hl_device_reset(hdev, HL_DRV_RESET_HARD); in hpriv_release()
529 mutex_lock(&hdev->fpriv_list_lock); in hpriv_release()
530 hdev->is_compute_ctx_active = false; in hpriv_release()
531 mutex_unlock(&hdev->fpriv_list_lock); in hpriv_release()
533 hdev->compute_ctx_in_release = 0; in hpriv_release()
554 static void print_device_in_use_info(struct hl_device *hdev, in print_device_in_use_info() argument
566 active_cs_num = hl_get_active_cs_num(hdev); in print_device_in_use_info()
572 dmabuf_export_cnt = atomic_read(&hdev->dmabuf_export_cnt); in print_device_in_use_info()
588 dev_notice(hdev->dev, "%s%s\n", message, buf); in print_device_in_use_info()
601 struct hl_device *hdev = to_hl_device(ddev); in hl_device_release() local
604 if (!hdev) { in hl_device_release()
609 hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr); in hl_device_release()
616 hdev->compute_ctx_in_release = 1; in hl_device_release()
619 print_device_in_use_info(hdev, &mm_fini_stats, in hl_device_release()
621 hl_device_reset(hdev, HL_DRV_RESET_HARD); in hl_device_release()
624 hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif; in hl_device_release()
630 struct hl_device *hdev = hpriv->hdev; in hl_device_release_ctrl() local
634 if (!hdev) { in hl_device_release_ctrl()
639 mutex_lock(&hdev->fpriv_ctrl_list_lock); in hl_device_release_ctrl()
641 mutex_unlock(&hdev->fpriv_ctrl_list_lock); in hl_device_release_ctrl()
652 struct hl_device *hdev = hpriv->hdev; in __hl_mmap() local
655 if (!hdev) { in __hl_mmap()
717 static int device_init_cdev(struct hl_device *hdev, const struct class *class, in device_init_cdev() argument
730 (*dev)->devt = MKDEV(hdev->major, minor); in device_init_cdev()
733 dev_set_drvdata(*dev, hdev); in device_init_cdev()
739 static int cdev_sysfs_debugfs_add(struct hl_device *hdev) in cdev_sysfs_debugfs_add() argument
741 const struct class *accel_class = hdev->drm.accel->kdev->class; in cdev_sysfs_debugfs_add()
745 hdev->cdev_idx = hdev->drm.accel->index; in cdev_sysfs_debugfs_add()
748 snprintf(name, sizeof(name), "accel_controlD%d", hdev->cdev_idx); in cdev_sysfs_debugfs_add()
749 rc = device_init_cdev(hdev, accel_class, hdev->cdev_idx, &hl_ctrl_ops, name, in cdev_sysfs_debugfs_add()
750 &hdev->cdev_ctrl, &hdev->dev_ctrl); in cdev_sysfs_debugfs_add()
754 rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl); in cdev_sysfs_debugfs_add()
756 dev_err(hdev->dev_ctrl, in cdev_sysfs_debugfs_add()
761 rc = hl_sysfs_init(hdev); in cdev_sysfs_debugfs_add()
763 dev_err(hdev->dev, "failed to initialize sysfs\n"); in cdev_sysfs_debugfs_add()
767 hl_debugfs_add_device(hdev); in cdev_sysfs_debugfs_add()
769 hdev->cdev_sysfs_debugfs_created = true; in cdev_sysfs_debugfs_add()
774 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); in cdev_sysfs_debugfs_add()
776 put_device(hdev->dev_ctrl); in cdev_sysfs_debugfs_add()
780 static void cdev_sysfs_debugfs_remove(struct hl_device *hdev) in cdev_sysfs_debugfs_remove() argument
782 if (!hdev->cdev_sysfs_debugfs_created) in cdev_sysfs_debugfs_remove()
785 hl_sysfs_fini(hdev); in cdev_sysfs_debugfs_remove()
787 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); in cdev_sysfs_debugfs_remove()
788 put_device(hdev->dev_ctrl); in cdev_sysfs_debugfs_remove()
795 struct hl_device *hdev = device_reset_work->hdev; in device_hard_reset_pending() local
801 rc = hl_device_reset(hdev, flags); in device_hard_reset_pending()
803 if ((rc == -EBUSY) && !hdev->device_fini_pending) { in device_hard_reset_pending()
804 struct hl_ctx *ctx = hl_get_compute_ctx(hdev); in device_hard_reset_pending()
810 dev_info(hdev->dev, in device_hard_reset_pending()
815 dev_info(hdev->dev, "Could not reset device. will try again in %u seconds", in device_hard_reset_pending()
819 queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work, in device_hard_reset_pending()
828 struct hl_device *hdev = watchdog_work->hdev; in device_release_watchdog_func() local
831 dev_dbg(hdev->dev, "Device wasn't released in time. Initiate hard-reset.\n"); in device_release_watchdog_func()
835 hl_device_reset(hdev, flags); in device_release_watchdog_func()
846 static int device_early_init(struct hl_device *hdev) in device_early_init() argument
851 switch (hdev->asic_type) { in device_early_init()
853 goya_set_asic_funcs(hdev); in device_early_init()
854 strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name)); in device_early_init()
857 gaudi_set_asic_funcs(hdev); in device_early_init()
858 strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name)); in device_early_init()
861 gaudi_set_asic_funcs(hdev); in device_early_init()
862 strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name)); in device_early_init()
865 gaudi2_set_asic_funcs(hdev); in device_early_init()
866 strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name)); in device_early_init()
869 gaudi2_set_asic_funcs(hdev); in device_early_init()
870 strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name)); in device_early_init()
873 gaudi2_set_asic_funcs(hdev); in device_early_init()
874 strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name)); in device_early_init()
877 gaudi2_set_asic_funcs(hdev); in device_early_init()
878 strscpy(hdev->asic_name, "GAUDI2D", sizeof(hdev->asic_name)); in device_early_init()
881 dev_err(hdev->dev, "Unrecognized ASIC type %d\n", in device_early_init()
882 hdev->asic_type); in device_early_init()
886 rc = hdev->asic_funcs->early_init(hdev); in device_early_init()
890 rc = hl_asid_init(hdev); in device_early_init()
894 if (hdev->asic_prop.completion_queues_count) { in device_early_init()
895 hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count, in device_early_init()
898 if (!hdev->cq_wq) { in device_early_init()
904 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) { in device_early_init()
905 snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i); in device_early_init()
906 hdev->cq_wq[i] = create_singlethread_workqueue(workq_name); in device_early_init()
907 if (hdev->cq_wq[i] == NULL) { in device_early_init()
908 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); in device_early_init()
914 snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx); in device_early_init()
915 hdev->eq_wq = create_singlethread_workqueue(workq_name); in device_early_init()
916 if (hdev->eq_wq == NULL) { in device_early_init()
917 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n"); in device_early_init()
922 snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx); in device_early_init()
923 hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0); in device_early_init()
924 if (!hdev->cs_cmplt_wq) { in device_early_init()
925 dev_err(hdev->dev, in device_early_init()
931 snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx); in device_early_init()
932 hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0); in device_early_init()
933 if (!hdev->ts_free_obj_wq) { in device_early_init()
934 dev_err(hdev->dev, in device_early_init()
940 snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx); in device_early_init()
941 hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0); in device_early_init()
942 if (!hdev->prefetch_wq) { in device_early_init()
943 dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n"); in device_early_init()
948 hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL); in device_early_init()
949 if (!hdev->hl_chip_info) { in device_early_init()
954 rc = hl_mmu_if_set_funcs(hdev); in device_early_init()
958 hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr); in device_early_init()
960 snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx); in device_early_init()
961 hdev->reset_wq = create_singlethread_workqueue(workq_name); in device_early_init()
962 if (!hdev->reset_wq) { in device_early_init()
964 dev_err(hdev->dev, "Failed to create device reset WQ\n"); in device_early_init()
968 INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); in device_early_init()
970 INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending); in device_early_init()
971 hdev->device_reset_work.hdev = hdev; in device_early_init()
972 hdev->device_fini_pending = 0; in device_early_init()
974 INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work, in device_early_init()
976 hdev->device_release_watchdog_work.hdev = hdev; in device_early_init()
978 mutex_init(&hdev->send_cpu_message_lock); in device_early_init()
979 mutex_init(&hdev->debug_lock); in device_early_init()
980 INIT_LIST_HEAD(&hdev->cs_mirror_list); in device_early_init()
981 spin_lock_init(&hdev->cs_mirror_lock); in device_early_init()
982 spin_lock_init(&hdev->reset_info.lock); in device_early_init()
983 INIT_LIST_HEAD(&hdev->fpriv_list); in device_early_init()
984 INIT_LIST_HEAD(&hdev->fpriv_ctrl_list); in device_early_init()
985 mutex_init(&hdev->fpriv_list_lock); in device_early_init()
986 mutex_init(&hdev->fpriv_ctrl_list_lock); in device_early_init()
987 mutex_init(&hdev->clk_throttling.lock); in device_early_init()
992 hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL); in device_early_init()
993 hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr); in device_early_init()
995 kfree(hdev->hl_chip_info); in device_early_init()
997 destroy_workqueue(hdev->prefetch_wq); in device_early_init()
999 destroy_workqueue(hdev->ts_free_obj_wq); in device_early_init()
1001 destroy_workqueue(hdev->cs_cmplt_wq); in device_early_init()
1003 destroy_workqueue(hdev->eq_wq); in device_early_init()
1005 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in device_early_init()
1006 if (hdev->cq_wq[i]) in device_early_init()
1007 destroy_workqueue(hdev->cq_wq[i]); in device_early_init()
1008 kfree(hdev->cq_wq); in device_early_init()
1010 hl_asid_fini(hdev); in device_early_init()
1012 if (hdev->asic_funcs->early_fini) in device_early_init()
1013 hdev->asic_funcs->early_fini(hdev); in device_early_init()
1024 static void device_early_fini(struct hl_device *hdev) in device_early_fini() argument
1028 mutex_destroy(&hdev->debug_lock); in device_early_fini()
1029 mutex_destroy(&hdev->send_cpu_message_lock); in device_early_fini()
1031 mutex_destroy(&hdev->fpriv_list_lock); in device_early_fini()
1032 mutex_destroy(&hdev->fpriv_ctrl_list_lock); in device_early_fini()
1034 mutex_destroy(&hdev->clk_throttling.lock); in device_early_fini()
1036 hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL); in device_early_fini()
1037 hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr); in device_early_fini()
1039 kfree(hdev->hl_chip_info); in device_early_fini()
1041 destroy_workqueue(hdev->prefetch_wq); in device_early_fini()
1042 destroy_workqueue(hdev->ts_free_obj_wq); in device_early_fini()
1043 destroy_workqueue(hdev->cs_cmplt_wq); in device_early_fini()
1044 destroy_workqueue(hdev->eq_wq); in device_early_fini()
1045 destroy_workqueue(hdev->reset_wq); in device_early_fini()
1047 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in device_early_fini()
1048 destroy_workqueue(hdev->cq_wq[i]); in device_early_fini()
1049 kfree(hdev->cq_wq); in device_early_fini()
1051 hl_asid_fini(hdev); in device_early_fini()
1053 if (hdev->asic_funcs->early_fini) in device_early_fini()
1054 hdev->asic_funcs->early_fini(hdev); in device_early_fini()
1057 static bool is_pci_link_healthy(struct hl_device *hdev) in is_pci_link_healthy() argument
1061 if (!hdev->pdev) in is_pci_link_healthy()
1064 pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id); in is_pci_link_healthy()
1066 return (device_id == hdev->pdev->device); in is_pci_link_healthy()
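is_pci_link_healthy() re-reads the device ID from config space and compares it with the cached pdev->device; a dead PCIe link reads back all-ones, so the comparison fails. Sketch of the heuristic:

        #include <stdint.h>
        #include <stdio.h>

        /* link_up simulates whether the config read actually reaches the device. */
        static uint16_t read_cfg_device_id(int link_up, uint16_t real_id)
        {
                return link_up ? real_id : 0xffff; /* dead links read as all-ones */
        }

        static int link_healthy(uint16_t cached_id, int link_up)
        {
                return read_cfg_device_id(link_up, cached_id) == cached_id;
        }

        int main(void)
        {
                printf("up: %d, down: %d\n",
                       link_healthy(0x1020, 1), link_healthy(0x1020, 0));
                return 0;
        }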
1069 static bool hl_device_eq_heartbeat_received(struct hl_device *hdev) in hl_device_eq_heartbeat_received() argument
1071 struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info; in hl_device_eq_heartbeat_received()
1073 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_device_eq_heartbeat_received()
1078 if (!hdev->eq_heartbeat_received) { in hl_device_eq_heartbeat_received()
1079 dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); in hl_device_eq_heartbeat_received()
1081 dev_err(hdev->dev, in hl_device_eq_heartbeat_received()
1083 hdev->event_queue.ci, in hl_device_eq_heartbeat_received()
1085 &hdev->heartbeat_debug_info.last_eq_heartbeat_ts, in hl_device_eq_heartbeat_received()
1086 hdev->kernel_queues[cpu_q_id].pi, in hl_device_eq_heartbeat_received()
1087 atomic_read(&hdev->kernel_queues[cpu_q_id].ci), in hl_device_eq_heartbeat_received()
1088 atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask, in hl_device_eq_heartbeat_received()
1089 &hdev->heartbeat_debug_info.last_pq_heartbeat_ts); in hl_device_eq_heartbeat_received()
1091 hl_eq_dump(hdev, &hdev->event_queue); in hl_device_eq_heartbeat_received()
1096 hdev->eq_heartbeat_received = false; in hl_device_eq_heartbeat_received()
1103 struct hl_device *hdev = container_of(work, struct hl_device, in hl_device_heartbeat() local
1109 if (!hl_device_operational(hdev, NULL) || !hdev->init_done) in hl_device_heartbeat()
1117 if (hl_device_eq_heartbeat_received(hdev) && (!hdev->asic_funcs->send_heartbeat(hdev))) in hl_device_heartbeat()
1120 if (hl_device_operational(hdev, NULL)) in hl_device_heartbeat()
1121 dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n", in hl_device_heartbeat()
1122 is_pci_link_healthy(hdev) ? "healthy" : "broken"); in hl_device_heartbeat()
1126 hl_handle_fw_err(hdev, &info); in hl_device_heartbeat()
1127 hl_device_cond_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT, event_mask); in hl_device_heartbeat()
1142 if (!hdev->reset_info.in_reset) in hl_device_heartbeat()
1143 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; in hl_device_heartbeat()
1145 schedule_delayed_work(&hdev->work_heartbeat, in hl_device_heartbeat()
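The heartbeat is a flag handshake: hl_eq_heartbeat_event_handle() (near the end of this listing) sets eq_heartbeat_received, and the periodic worker consumes it, treating an unset flag as a missed heartbeat before re-arming itself. A minimal single-threaded sketch; the driver does this under its own synchronization:

        #include <stdbool.h>
        #include <stdio.h>

        static bool heartbeat_received;

        static void eq_heartbeat_event(void)
        {
                heartbeat_received = true;   /* firmware event arrived */
        }

        static bool heartbeat_check(void)
        {
                bool ok = heartbeat_received;

                heartbeat_received = false;  /* re-armed by the next event */
                return ok;
        }

        int main(void)
        {
                eq_heartbeat_event();
                printf("check 1: %s\n", heartbeat_check() ? "alive" : "missed");
                printf("check 2: %s\n", heartbeat_check() ? "alive" : "missed");
                return 0;
        }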
1157 static int device_late_init(struct hl_device *hdev) in device_late_init() argument
1161 if (hdev->asic_funcs->late_init) { in device_late_init()
1162 rc = hdev->asic_funcs->late_init(hdev); in device_late_init()
1164 dev_err(hdev->dev, in device_late_init()
1170 hdev->high_pll = hdev->asic_prop.high_pll; in device_late_init()
1171 hdev->late_init_done = true; in device_late_init()
1182 static void device_late_fini(struct hl_device *hdev) in device_late_fini() argument
1184 if (!hdev->late_init_done) in device_late_fini()
1187 if (hdev->asic_funcs->late_fini) in device_late_fini()
1188 hdev->asic_funcs->late_fini(hdev); in device_late_fini()
1190 hdev->late_init_done = false; in device_late_fini()
1193 int hl_device_utilization(struct hl_device *hdev, u32 *utilization) in hl_device_utilization() argument
1198 max_power = hdev->max_power; in hl_device_utilization()
1199 dc_power = hdev->asic_prop.dc_power_default; in hl_device_utilization()
1202 dev_warn(hdev->dev, "device utilization is not supported\n"); in hl_device_utilization()
1205 rc = hl_fw_cpucp_power_get(hdev, &curr_power); in hl_device_utilization()
1218 int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable) in hl_device_set_debug_mode() argument
1222 mutex_lock(&hdev->debug_lock); in hl_device_set_debug_mode()
1225 if (!hdev->in_debug) { in hl_device_set_debug_mode()
1226 dev_err(hdev->dev, in hl_device_set_debug_mode()
1232 if (!hdev->reset_info.hard_reset_pending) in hl_device_set_debug_mode()
1233 hdev->asic_funcs->halt_coresight(hdev, ctx); in hl_device_set_debug_mode()
1235 hdev->in_debug = 0; in hl_device_set_debug_mode()
1240 if (hdev->in_debug) { in hl_device_set_debug_mode()
1241 dev_err(hdev->dev, in hl_device_set_debug_mode()
1247 hdev->in_debug = 1; in hl_device_set_debug_mode()
1250 mutex_unlock(&hdev->debug_lock); in hl_device_set_debug_mode()
1255 static void take_release_locks(struct hl_device *hdev) in take_release_locks() argument
1260 hdev->asic_funcs->hw_queues_lock(hdev); in take_release_locks()
1261 hdev->asic_funcs->hw_queues_unlock(hdev); in take_release_locks()
1264 mutex_lock(&hdev->send_cpu_message_lock); in take_release_locks()
1265 mutex_unlock(&hdev->send_cpu_message_lock); in take_release_locks()
1268 mutex_lock(&hdev->fpriv_list_lock); in take_release_locks()
1269 mutex_unlock(&hdev->fpriv_list_lock); in take_release_locks()
1270 mutex_lock(&hdev->fpriv_ctrl_list_lock); in take_release_locks()
1271 mutex_unlock(&hdev->fpriv_ctrl_list_lock); in take_release_locks()
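take_release_locks() takes and immediately drops each lock purely as a barrier: once the lock has been acquired, any critical section that was in flight when the reset began must have completed. The same pattern with a pthread mutex standing in for the kernel locks:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

        static void flush_lock(pthread_mutex_t *m)
        {
                pthread_mutex_lock(m);   /* waits for any current holder */
                pthread_mutex_unlock(m); /* we never needed to keep it */
        }

        int main(void)
        {
                flush_lock(&queue_lock);
                puts("all prior critical sections drained");
                return 0;
        }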
1274 static void hl_abort_waiting_for_completions(struct hl_device *hdev) in hl_abort_waiting_for_completions() argument
1276 hl_abort_waiting_for_cs_completions(hdev); in hl_abort_waiting_for_completions()
1281 hl_release_pending_user_interrupts(hdev); in hl_abort_waiting_for_completions()
1284 static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset, in cleanup_resources() argument
1288 if (hdev->heartbeat) in cleanup_resources()
1289 cancel_delayed_work_sync(&hdev->work_heartbeat); in cleanup_resources()
1291 device_late_fini(hdev); in cleanup_resources()
1299 hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset); in cleanup_resources()
1302 hl_cs_rollback_all(hdev, skip_wq_flush); in cleanup_resources()
1305 flush_workqueue(hdev->prefetch_wq); in cleanup_resources()
1307 hl_abort_waiting_for_completions(hdev); in cleanup_resources()
1319 int hl_device_suspend(struct hl_device *hdev) in hl_device_suspend() argument
1323 pci_save_state(hdev->pdev); in hl_device_suspend()
1326 spin_lock(&hdev->reset_info.lock); in hl_device_suspend()
1327 if (hdev->reset_info.in_reset) { in hl_device_suspend()
1328 spin_unlock(&hdev->reset_info.lock); in hl_device_suspend()
1329 dev_err(hdev->dev, "Can't suspend while in reset\n"); in hl_device_suspend()
1332 hdev->reset_info.in_reset = 1; in hl_device_suspend()
1333 spin_unlock(&hdev->reset_info.lock); in hl_device_suspend()
1336 hdev->disabled = true; in hl_device_suspend()
1338 take_release_locks(hdev); in hl_device_suspend()
1340 rc = hdev->asic_funcs->suspend(hdev); in hl_device_suspend()
1342 dev_err(hdev->dev, in hl_device_suspend()
1346 pci_disable_device(hdev->pdev); in hl_device_suspend()
1347 pci_set_power_state(hdev->pdev, PCI_D3hot); in hl_device_suspend()
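Suspend claims the reset path with a test-and-set of in_reset under reset_info.lock, so it cannot race an ongoing reset (lines 1326-1333 above). Sketch with a pthread mutex in place of the spinlock:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t reset_lock = PTHREAD_MUTEX_INITIALIZER;
        static int in_reset;

        static int try_enter_reset(void)
        {
                int ok;

                pthread_mutex_lock(&reset_lock);
                ok = !in_reset;
                if (ok)
                        in_reset = 1; /* claim the reset path */
                pthread_mutex_unlock(&reset_lock);
                return ok;
        }

        int main(void)
        {
                printf("first: %d, second: %d\n",
                       try_enter_reset(), try_enter_reset());
                return 0;
        }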
1361 int hl_device_resume(struct hl_device *hdev) in hl_device_resume() argument
1365 pci_set_power_state(hdev->pdev, PCI_D0); in hl_device_resume()
1366 pci_restore_state(hdev->pdev); in hl_device_resume()
1367 rc = pci_enable_device_mem(hdev->pdev); in hl_device_resume()
1369 dev_err(hdev->dev, in hl_device_resume()
1374 pci_set_master(hdev->pdev); in hl_device_resume()
1376 rc = hdev->asic_funcs->resume(hdev); in hl_device_resume()
1378 dev_err(hdev->dev, "Failed to resume device after suspend\n"); in hl_device_resume()
1386 spin_lock(&hdev->reset_info.lock); in hl_device_resume()
1387 hdev->reset_info.in_reset = 0; in hl_device_resume()
1388 spin_unlock(&hdev->reset_info.lock); in hl_device_resume()
1390 rc = hl_device_reset(hdev, HL_DRV_RESET_HARD); in hl_device_resume()
1392 dev_err(hdev->dev, "Failed to reset device during resume\n"); in hl_device_resume()
1399 pci_disable_device(hdev->pdev); in hl_device_resume()
1404 static int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool control_dev) in device_kill_open_processes() argument
1412 hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; in device_kill_open_processes()
1413 hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list; in device_kill_open_processes()
1424 if (hdev->process_kill_trial_cnt) { in device_kill_open_processes()
1442 dev_info(hdev->dev, "Killing user process pid=%d\n", in device_kill_open_processes()
1449 dev_dbg(hdev->dev, in device_kill_open_processes()
1468 dev_dbg(hdev->dev, in device_kill_open_processes()
1481 if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS) in device_kill_open_processes()
1484 hdev->process_kill_trial_cnt++; in device_kill_open_processes()
1489 static void device_disable_open_processes(struct hl_device *hdev, bool control_dev) in device_disable_open_processes() argument
1495 hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; in device_disable_open_processes()
1496 hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list; in device_disable_open_processes()
1500 hpriv->hdev = NULL; in device_disable_open_processes()
1504 static void send_disable_pci_access(struct hl_device *hdev, u32 flags) in send_disable_pci_access() argument
1520 if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) in send_disable_pci_access()
1526 if (hdev->cpu_queues_enable) in send_disable_pci_access()
1527 disable_irq(pci_irq_vector(hdev->pdev, hdev->asic_prop.eq_interrupt_id)); in send_disable_pci_access()
1531 static void handle_reset_trigger(struct hl_device *hdev, u32 flags) in handle_reset_trigger() argument
1536 if (hdev->is_compute_ctx_active) in handle_reset_trigger()
1546 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; in handle_reset_trigger()
1549 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR; in handle_reset_trigger()
1552 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; in handle_reset_trigger()
1555 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; in handle_reset_trigger()
1563 if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) { in handle_reset_trigger()
1564 hdev->reset_info.prev_reset_trigger = cur_reset_trigger; in handle_reset_trigger()
1565 hdev->reset_info.reset_trigger_repeated = 0; in handle_reset_trigger()
1567 hdev->reset_info.reset_trigger_repeated = 1; in handle_reset_trigger()
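handle_reset_trigger() compares the current trigger against the previous one; a repeat sets reset_trigger_repeated, which hl_device_reset() later uses to escalate (see lines 1822-1828). Sketch:

        #include <stdio.h>

        struct reset_info { unsigned prev_trigger; int trigger_repeated; };

        static void note_trigger(struct reset_info *ri, unsigned cur)
        {
                if (ri->prev_trigger != cur) {
                        ri->prev_trigger = cur;
                        ri->trigger_repeated = 0;
                } else {
                        ri->trigger_repeated = 1; /* same cause twice in a row */
                }
        }

        int main(void)
        {
                struct reset_info ri = { 0, 0 };

                note_trigger(&ri, 3);
                note_trigger(&ri, 3);
                printf("repeated=%d\n", ri.trigger_repeated);
                return 0;
        }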
1571 static void reset_heartbeat_debug_info(struct hl_device *hdev) in reset_heartbeat_debug_info() argument
1573 hdev->heartbeat_debug_info.last_pq_heartbeat_ts = 0; in reset_heartbeat_debug_info()
1574 hdev->heartbeat_debug_info.last_eq_heartbeat_ts = 0; in reset_heartbeat_debug_info()
1575 hdev->heartbeat_debug_info.heartbeat_event_counter = 0; in reset_heartbeat_debug_info()
1578 static inline void device_heartbeat_schedule(struct hl_device *hdev) in device_heartbeat_schedule() argument
1580 if (!hdev->heartbeat) in device_heartbeat_schedule()
1583 reset_heartbeat_debug_info(hdev); in device_heartbeat_schedule()
1590 hdev->eq_heartbeat_received = true; in device_heartbeat_schedule()
1592 schedule_delayed_work(&hdev->work_heartbeat, in device_heartbeat_schedule()
1612 int hl_device_reset(struct hl_device *hdev, u32 flags) in hl_device_reset() argument
1620 if (!hdev->init_done) { in hl_device_reset()
1621 dev_err(hdev->dev, "Can't reset before initialization is done\n"); in hl_device_reset()
1631 reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release; in hl_device_reset()
1633 if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) { in hl_device_reset()
1634 dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n"); in hl_device_reset()
1638 if (!hard_reset && !hdev->asic_prop.supports_compute_reset) { in hl_device_reset()
1639 dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n"); in hl_device_reset()
1645 dev_crit(hdev->dev, in hl_device_reset()
1653 if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) { in hl_device_reset()
1654 dev_dbg(hdev->dev, in hl_device_reset()
1661 if (from_hard_reset_thread && hdev->process_kill_trial_cnt) in hl_device_reset()
1671 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
1672 if (hdev->reset_info.in_reset) { in hl_device_reset()
1674 if (hard_reset && hdev->reset_info.in_compute_reset) in hl_device_reset()
1675 hdev->reset_info.hard_reset_schedule_flags = flags; in hl_device_reset()
1676 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1683 hdev->reset_info.in_compute_reset = !hard_reset; in hl_device_reset()
1685 hdev->reset_info.in_reset = 1; in hl_device_reset()
1687 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1693 if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) { in hl_device_reset()
1695 &hdev->device_release_watchdog_work; in hl_device_reset()
1697 hdev->reset_info.watchdog_active = 0; in hl_device_reset()
1702 hdev->reset_info.in_compute_reset = 0; in hl_device_reset()
1713 handle_reset_trigger(hdev, flags); in hl_device_reset()
1714 send_disable_pci_access(hdev, flags); in hl_device_reset()
1717 hdev->disabled = true; in hl_device_reset()
1719 take_release_locks(hdev); in hl_device_reset()
1722 dev_info(hdev->dev, "Going to reset device\n"); in hl_device_reset()
1724 dev_dbg(hdev->dev, "Going to reset device after release by user\n"); in hl_device_reset()
1726 dev_dbg(hdev->dev, "Going to reset engines of inference device\n"); in hl_device_reset()
1730 hdev->reset_info.hard_reset_pending = true; in hl_device_reset()
1732 hdev->process_kill_trial_cnt = 0; in hl_device_reset()
1734 hdev->device_reset_work.flags = flags; in hl_device_reset()
1740 queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0); in hl_device_reset()
1745 cleanup_resources(hdev, hard_reset, fw_reset, from_dev_release); in hl_device_reset()
1753 rc = device_kill_open_processes(hdev, 0, false); in hl_device_reset()
1756 if (hdev->device_fini_pending) { in hl_device_reset()
1757 dev_crit(hdev->dev, in hl_device_reset()
1759 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1768 dev_crit(hdev->dev, in hl_device_reset()
1770 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1777 flush_workqueue(hdev->eq_wq); in hl_device_reset()
1781 hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset); in hl_device_reset()
1784 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; in hl_device_reset()
1787 if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) in hl_device_reset()
1788 hdev->kernel_ctx = NULL; in hl_device_reset()
1790 hl_vm_fini(hdev); in hl_device_reset()
1791 hl_mmu_fini(hdev); in hl_device_reset()
1792 hl_eq_reset(hdev, &hdev->event_queue); in hl_device_reset()
1796 hl_hw_queue_reset(hdev, hard_reset); in hl_device_reset()
1797 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in hl_device_reset()
1798 hl_cq_reset(hdev, &hdev->completion_queue[i]); in hl_device_reset()
1801 ctx = hl_get_compute_ctx(hdev); in hl_device_reset()
1815 hdev->device_cpu_disabled = false; in hl_device_reset()
1816 hdev->reset_info.hard_reset_pending = false; in hl_device_reset()
1822 if (hdev->reset_info.reset_trigger_repeated && in hl_device_reset()
1823 (hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR || in hl_device_reset()
1824 hdev->reset_info.prev_reset_trigger == in hl_device_reset()
1826 dev_crit(hdev->dev, in hl_device_reset()
1828 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1833 if (hdev->kernel_ctx) { in hl_device_reset()
1834 dev_crit(hdev->dev, in hl_device_reset()
1836 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1841 rc = hl_mmu_init(hdev); in hl_device_reset()
1843 dev_err(hdev->dev, in hl_device_reset()
1849 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), in hl_device_reset()
1851 if (!hdev->kernel_ctx) { in hl_device_reset()
1853 hl_mmu_fini(hdev); in hl_device_reset()
1857 hdev->is_compute_ctx_active = false; in hl_device_reset()
1859 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); in hl_device_reset()
1861 dev_err(hdev->dev, in hl_device_reset()
1863 kfree(hdev->kernel_ctx); in hl_device_reset()
1864 hdev->kernel_ctx = NULL; in hl_device_reset()
1865 hl_mmu_fini(hdev); in hl_device_reset()
1874 hdev->disabled = false; in hl_device_reset()
1878 rc = hl_fw_read_preboot_status(hdev); in hl_device_reset()
1883 rc = hdev->asic_funcs->hw_init(hdev); in hl_device_reset()
1885 dev_err(hdev->dev, "failed to initialize the H/W after reset\n"); in hl_device_reset()
1890 if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, in hl_device_reset()
1892 print_idle_status_mask(hdev, "device is not idle after reset", idle_mask); in hl_device_reset()
1898 rc = hdev->asic_funcs->test_queues(hdev); in hl_device_reset()
1900 dev_err(hdev->dev, "Failed to detect if device is alive after reset\n"); in hl_device_reset()
1905 rc = device_late_init(hdev); in hl_device_reset()
1907 dev_err(hdev->dev, "Failed late init after hard reset\n"); in hl_device_reset()
1911 rc = hl_vm_init(hdev); in hl_device_reset()
1913 dev_err(hdev->dev, "Failed to init memory module after hard reset\n"); in hl_device_reset()
1917 if (!hdev->asic_prop.fw_security_enabled) in hl_device_reset()
1918 hl_fw_set_max_power(hdev); in hl_device_reset()
1920 rc = hdev->asic_funcs->compute_reset_late_init(hdev); in hl_device_reset()
1923 dev_err(hdev->dev, in hl_device_reset()
1926 dev_err(hdev->dev, "Failed late init after compute reset\n"); in hl_device_reset()
1931 rc = hdev->asic_funcs->scrub_device_mem(hdev); in hl_device_reset()
1933 dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc); in hl_device_reset()
1937 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
1938 hdev->reset_info.in_compute_reset = 0; in hl_device_reset()
1944 if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags) in hl_device_reset()
1947 hdev->reset_info.in_reset = 0; in hl_device_reset()
1949 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1951 hdev->reset_info.needs_reset = false; in hl_device_reset()
1954 dev_info(hdev->dev, in hl_device_reset()
1956 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1958 dev_dbg(hdev->dev, in hl_device_reset()
1960 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1963 hdev->reset_info.hard_reset_cnt++; in hl_device_reset()
1965 device_heartbeat_schedule(hdev); in hl_device_reset()
1972 hdev->asic_funcs->enable_events_from_fw(hdev); in hl_device_reset()
1975 hdev->reset_info.compute_reset_cnt++; in hl_device_reset()
1978 dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); in hl_device_reset()
1979 flags = hdev->reset_info.hard_reset_schedule_flags; in hl_device_reset()
1980 hdev->reset_info.hard_reset_schedule_flags = 0; in hl_device_reset()
1989 hdev->disabled = true; in hl_device_reset()
1991 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
1992 hdev->reset_info.in_compute_reset = 0; in hl_device_reset()
1995 dev_err(hdev->dev, in hl_device_reset()
1997 dev_name(&(hdev)->pdev->dev)); in hl_device_reset()
1998 hdev->reset_info.hard_reset_cnt++; in hl_device_reset()
2001 dev_err(hdev->dev, "Failed to reset device after user release\n"); in hl_device_reset()
2004 dev_err(hdev->dev, "Failed to do compute reset\n"); in hl_device_reset()
2005 hdev->reset_info.compute_reset_cnt++; in hl_device_reset()
2008 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
2014 hdev->reset_info.in_reset = 0; in hl_device_reset()
2016 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
2030 int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask) in hl_device_cond_reset() argument
2040 dev_err(hdev->dev, "Resetting device without a reset indication to user\n"); in hl_device_cond_reset()
2044 ctx = hl_get_compute_ctx(hdev); in hl_device_cond_reset()
2054 if (!ctx->hpriv->notifier_event.eventfd && !hdev->reset_info.watchdog_active) in hl_device_cond_reset()
2060 spin_lock(&hdev->reset_info.lock); in hl_device_cond_reset()
2061 if (hdev->reset_info.in_reset) { in hl_device_cond_reset()
2062 spin_unlock(&hdev->reset_info.lock); in hl_device_cond_reset()
2066 if (hdev->reset_info.watchdog_active) { in hl_device_cond_reset()
2067 hdev->device_release_watchdog_work.flags |= flags; in hl_device_cond_reset()
2071 hdev->device_release_watchdog_work.flags = flags; in hl_device_cond_reset()
2072 dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n", in hl_device_cond_reset()
2073 hdev->device_release_watchdog_timeout_sec); in hl_device_cond_reset()
2074 schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work, in hl_device_cond_reset()
2075 secs_to_jiffies(hdev->device_release_watchdog_timeout_sec)); in hl_device_cond_reset()
2076 hdev->reset_info.watchdog_active = 1; in hl_device_cond_reset()
2078 spin_unlock(&hdev->reset_info.lock); in hl_device_cond_reset()
2080 hl_notifier_event_send_all(hdev, event_mask); in hl_device_cond_reset()
2084 hl_abort_waiting_for_completions(hdev); in hl_device_cond_reset()
2090 hl_notifier_event_send_all(hdev, event_mask); in hl_device_cond_reset()
2094 return hl_device_reset(hdev, flags | HL_DRV_RESET_HARD); in hl_device_cond_reset()
2115 void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask) in hl_notifier_event_send_all() argument
2120 dev_warn(hdev->dev, "Skip sending zero event"); in hl_notifier_event_send_all()
2124 mutex_lock(&hdev->fpriv_list_lock); in hl_notifier_event_send_all()
2126 list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) in hl_notifier_event_send_all()
2129 mutex_unlock(&hdev->fpriv_list_lock); in hl_notifier_event_send_all()
2141 int hl_device_init(struct hl_device *hdev) in hl_device_init() argument
2149 rc = device_early_init(hdev); in hl_device_init()
2153 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count + in hl_device_init()
2154 hdev->asic_prop.user_interrupt_count; in hl_device_init()
2157 hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt), in hl_device_init()
2159 if (!hdev->user_interrupt) { in hl_device_init()
2165 if (hdev->asic_prop.first_available_cq[0] != USHRT_MAX) { in hl_device_init()
2173 free_jobs_data = &hdev->user_interrupt[i].ts_free_jobs_data; in hl_device_init()
2181 free_jobs_data = &hdev->common_user_cq_interrupt.ts_free_jobs_data; in hl_device_init()
2197 rc = hdev->asic_funcs->sw_init(hdev); in hl_device_init()
2203 hl_multi_cs_completion_init(hdev); in hl_device_init()
2210 rc = hl_hw_queues_create(hdev); in hl_device_init()
2212 dev_err(hdev->dev, "failed to initialize kernel queues\n"); in hl_device_init()
2216 cq_cnt = hdev->asic_prop.completion_queues_count; in hl_device_init()
2224 hdev->completion_queue = kcalloc(cq_cnt, in hl_device_init()
2225 sizeof(*hdev->completion_queue), in hl_device_init()
2228 if (!hdev->completion_queue) { in hl_device_init()
2229 dev_err(hdev->dev, in hl_device_init()
2237 rc = hl_cq_init(hdev, &hdev->completion_queue[i], in hl_device_init()
2238 hdev->asic_funcs->get_queue_id_for_cq(hdev, i)); in hl_device_init()
2240 dev_err(hdev->dev, in hl_device_init()
2244 hdev->completion_queue[i].cq_idx = i; in hl_device_init()
2247 hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs, in hl_device_init()
2249 if (!hdev->shadow_cs_queue) { in hl_device_init()
2259 rc = hl_eq_init(hdev, &hdev->event_queue); in hl_device_init()
2261 dev_err(hdev->dev, "failed to initialize event queue\n"); in hl_device_init()
2266 rc = hl_mmu_init(hdev); in hl_device_init()
2268 dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n"); in hl_device_init()
2273 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL); in hl_device_init()
2274 if (!hdev->kernel_ctx) { in hl_device_init()
2279 hdev->is_compute_ctx_active = false; in hl_device_init()
2281 hdev->asic_funcs->state_dump_init(hdev); in hl_device_init()
2283 hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC; in hl_device_init()
2285 hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL; in hl_device_init()
2287 rc = hl_debugfs_device_init(hdev); in hl_device_init()
2289 dev_err(hdev->dev, "failed to initialize debugfs entry structure\n"); in hl_device_init()
2290 kfree(hdev->kernel_ctx); in hl_device_init()
2297 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); in hl_device_init()
2299 dev_err(hdev->dev, "failed to initialize kernel context\n"); in hl_device_init()
2300 kfree(hdev->kernel_ctx); in hl_device_init()
2304 rc = hl_cb_pool_init(hdev); in hl_device_init()
2306 dev_err(hdev->dev, "failed to initialize CB pool\n"); in hl_device_init()
2310 rc = hl_dec_init(hdev); in hl_device_init()
2312 dev_err(hdev->dev, "Failed to initialize the decoder module\n"); in hl_device_init()
2326 hdev->disabled = false; in hl_device_init()
2328 rc = hdev->asic_funcs->hw_init(hdev); in hl_device_init()
2330 dev_err(hdev->dev, "failed to initialize the H/W\n"); in hl_device_init()
2336 rc = hdev->asic_funcs->test_queues(hdev); in hl_device_init()
2338 dev_err(hdev->dev, "Failed to detect if device is alive\n"); in hl_device_init()
2343 rc = device_late_init(hdev); in hl_device_init()
2345 dev_err(hdev->dev, "Failed late initialization\n"); in hl_device_init()
2350 dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n", in hl_device_init()
2351 hdev->asic_name, in hl_device_init()
2352 hdev->asic_prop.dram_size / SZ_1G); in hl_device_init()
2354 rc = hl_vm_init(hdev); in hl_device_init()
2356 dev_err(hdev->dev, "Failed to initialize memory module\n"); in hl_device_init()
2367 rc = drm_dev_register(&hdev->drm, 0); in hl_device_init()
2369 dev_err(hdev->dev, "Failed to register DRM device, rc %d\n", rc); in hl_device_init()
2374 rc = cdev_sysfs_debugfs_add(hdev); in hl_device_init()
2376 dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n"); in hl_device_init()
2384 if (hdev->asic_prop.set_max_power_on_device_init && in hl_device_init()
2385 !hdev->asic_prop.fw_security_enabled) in hl_device_init()
2386 hl_fw_set_max_power(hdev); in hl_device_init()
2394 rc = hl_hwmon_init(hdev); in hl_device_init()
2396 dev_err(hdev->dev, "Failed to initialize hwmon\n"); in hl_device_init()
2405 device_heartbeat_schedule(hdev); in hl_device_init()
2407 dev_notice(hdev->dev, in hl_device_init()
2409 dev_name(&(hdev)->pdev->dev)); in hl_device_init()
2416 hdev->asic_funcs->enable_events_from_fw(hdev); in hl_device_init()
2418 hdev->init_done = true; in hl_device_init()
2423 hl_cb_pool_fini(hdev); in hl_device_init()
2425 if (hl_ctx_put(hdev->kernel_ctx) != 1) in hl_device_init()
2426 dev_err(hdev->dev, in hl_device_init()
2429 hl_debugfs_device_fini(hdev); in hl_device_init()
2431 hl_mmu_fini(hdev); in hl_device_init()
2433 hl_eq_fini(hdev, &hdev->event_queue); in hl_device_init()
2435 kfree(hdev->shadow_cs_queue); in hl_device_init()
2438 hl_cq_fini(hdev, &hdev->completion_queue[i]); in hl_device_init()
2439 kfree(hdev->completion_queue); in hl_device_init()
2441 hl_hw_queues_destroy(hdev); in hl_device_init()
2443 hdev->asic_funcs->sw_fini(hdev); in hl_device_init()
2445 vfree(hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool); in hl_device_init()
2449 if (!hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool) in hl_device_init()
2451 vfree(hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool); in hl_device_init()
2453 kfree(hdev->user_interrupt); in hl_device_init()
2456 device_early_fini(hdev); in hl_device_init()
2458 hdev->disabled = true; in hl_device_init()
2460 drm_dev_register(&hdev->drm, 0); in hl_device_init()
2461 cdev_sysfs_debugfs_add(hdev); in hl_device_init()
2465 hdev->cdev_idx, dev_name(&hdev->pdev->dev)); in hl_device_init()
2477 void hl_device_fini(struct hl_device *hdev) in hl_device_fini() argument
2485 dev_info(hdev->dev, "Removing device %s\n", dev_name(&(hdev)->pdev->dev)); in hl_device_fini()
2487 hdev->device_fini_pending = 1; in hl_device_fini()
2488 flush_delayed_work(&hdev->device_reset_work.reset_work); in hl_device_fini()
2490 if (hdev->pldm) in hl_device_fini()
2505 spin_lock(&hdev->reset_info.lock); in hl_device_fini()
2506 device_in_reset = !!hdev->reset_info.in_reset; in hl_device_fini()
2508 hdev->reset_info.in_reset = 1; in hl_device_fini()
2509 spin_unlock(&hdev->reset_info.lock); in hl_device_fini()
2514 spin_lock(&hdev->reset_info.lock); in hl_device_fini()
2515 device_in_reset = !!hdev->reset_info.in_reset; in hl_device_fini()
2517 hdev->reset_info.in_reset = 1; in hl_device_fini()
2518 spin_unlock(&hdev->reset_info.lock); in hl_device_fini()
2521 dev_crit(hdev->dev, in hl_device_fini()
2523 dev_name(&(hdev)->pdev->dev)); in hl_device_fini()
2528 cancel_delayed_work_sync(&hdev->device_release_watchdog_work.reset_work); in hl_device_fini()
2537 hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); in hl_device_fini()
2540 hdev->disabled = true; in hl_device_fini()
2542 take_release_locks(hdev); in hl_device_fini()
2544 hdev->reset_info.hard_reset_pending = true; in hl_device_fini()
2546 hl_hwmon_fini(hdev); in hl_device_fini()
2548 cleanup_resources(hdev, true, false, false); in hl_device_fini()
2554 dev_info(hdev->dev, in hl_device_fini()
2558 hdev->process_kill_trial_cnt = 0; in hl_device_fini()
2559 rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, false); in hl_device_fini()
2561 dev_crit(hdev->dev, "Failed to kill all open processes (%d)\n", rc); in hl_device_fini()
2562 device_disable_open_processes(hdev, false); in hl_device_fini()
2565 hdev->process_kill_trial_cnt = 0; in hl_device_fini()
2566 rc = device_kill_open_processes(hdev, 0, true); in hl_device_fini()
2568 dev_crit(hdev->dev, "Failed to kill all control device open processes (%d)\n", rc); in hl_device_fini()
2569 device_disable_open_processes(hdev, true); in hl_device_fini()
2572 hl_cb_pool_fini(hdev); in hl_device_fini()
2575 rc = hdev->asic_funcs->hw_fini(hdev, true, false); in hl_device_fini()
2577 dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc); in hl_device_fini()
2579 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; in hl_device_fini()
2582 if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) in hl_device_fini()
2583 dev_err(hdev->dev, "kernel ctx is still alive\n"); in hl_device_fini()
2585 hl_dec_fini(hdev); in hl_device_fini()
2587 hl_vm_fini(hdev); in hl_device_fini()
2589 hl_mmu_fini(hdev); in hl_device_fini()
2591 vfree(hdev->captured_err_info.page_fault_info.user_mappings); in hl_device_fini()
2593 hl_eq_fini(hdev, &hdev->event_queue); in hl_device_fini()
2595 kfree(hdev->shadow_cs_queue); in hl_device_fini()
2597 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in hl_device_fini()
2598 hl_cq_fini(hdev, &hdev->completion_queue[i]); in hl_device_fini()
2599 kfree(hdev->completion_queue); in hl_device_fini()
2601 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count + in hl_device_fini()
2602 hdev->asic_prop.user_interrupt_count; in hl_device_fini()
2605 if (hdev->asic_prop.first_available_cq[0] != USHRT_MAX) { in hl_device_fini()
2607 vfree(hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool); in hl_device_fini()
2610 kfree(hdev->user_interrupt); in hl_device_fini()
2613 vfree(hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool); in hl_device_fini()
2615 hl_hw_queues_destroy(hdev); in hl_device_fini()
2618 hdev->asic_funcs->sw_fini(hdev); in hl_device_fini()
2620 device_early_fini(hdev); in hl_device_fini()
2623 cdev_sysfs_debugfs_remove(hdev); in hl_device_fini()
2624 drm_dev_unregister(&hdev->drm); in hl_device_fini()
2626 hl_debugfs_device_fini(hdev); in hl_device_fini()
2644 inline u32 hl_rreg(struct hl_device *hdev, u32 reg) in hl_rreg() argument
2646 u32 val = readl(hdev->rmmio + reg); in hl_rreg()
2649 trace_habanalabs_rreg32(&(hdev)->pdev->dev, reg, val); in hl_rreg()
2664 inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) in hl_wreg() argument
2667 trace_habanalabs_wreg32(&(hdev)->pdev->dev, reg, val); in hl_wreg()
2669 writel(val, hdev->rmmio + reg); in hl_wreg()
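hl_rreg()/hl_wreg() are thin accessors over the ioremapped rmmio base, with optional tracing. The base-plus-offset shape, with a plain array standing in for the mapped BAR:

        #include <stdint.h>
        #include <stdio.h>

        static uint32_t mmio[0x100]; /* pretend this is the ioremapped region */

        static uint32_t rreg(uint32_t reg)             { return mmio[reg / 4]; }
        static void     wreg(uint32_t reg, uint32_t v) { mmio[reg / 4] = v; }

        int main(void)
        {
                wreg(0x10, 0xcafe);
                printf("reg 0x10 = %#x\n", rreg(0x10));
                return 0;
        }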
2672 void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, in hl_capture_razwi() argument
2675 struct razwi_info *razwi_info = &hdev->captured_err_info.razwi_info; in hl_capture_razwi()
2678 dev_err(hdev->dev, in hl_capture_razwi()
2685 if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info.razwi_detected, 0, 1)) in hl_capture_razwi()
2698 void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, in hl_handle_razwi() argument
2701 hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags); in hl_handle_razwi()
2707 static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu) in hl_capture_user_mappings() argument
2709 struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info; in hl_capture_user_mappings()
2721 ctx = hl_get_compute_ctx(hdev); in hl_capture_user_mappings()
2723 dev_err(hdev->dev, "Can't get user context for user mappings\n"); in hl_capture_user_mappings()
2769 void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu) in hl_capture_page_fault() argument
2771 struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info; in hl_capture_page_fault()
2780 hl_capture_user_mappings(hdev, is_pmmu); in hl_capture_page_fault()
2785 void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu, in hl_handle_page_fault() argument
2788 hl_capture_page_fault(hdev, addr, eng_id, is_pmmu); in hl_handle_page_fault()
2794 static void hl_capture_hw_err(struct hl_device *hdev, u16 event_id) in hl_capture_hw_err() argument
2796 struct hw_err_info *info = &hdev->captured_err_info.hw_err; in hl_capture_hw_err()
2808 void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask) in hl_handle_critical_hw_err() argument
2810 hl_capture_hw_err(hdev, event_id); in hl_handle_critical_hw_err()
2816 static void hl_capture_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *fw_info) in hl_capture_fw_err() argument
2818 struct fw_err_info *info = &hdev->captured_err_info.fw_err; in hl_capture_fw_err()
2832 void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info) in hl_handle_fw_err() argument
2834 hl_capture_fw_err(hdev, info); in hl_handle_fw_err()
2840 void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count) in hl_capture_engine_err() argument
2842 struct engine_err_info *info = &hdev->captured_err_info.engine_err; in hl_capture_engine_err()
2862 void hl_init_cpu_for_irq(struct hl_device *hdev) in hl_init_cpu_for_irq() argument
2865 struct cpumask *available_mask = &hdev->irq_affinity_mask; in hl_init_cpu_for_irq()
2866 int numa_node = hdev->pdev->dev.numa_node, i; in hl_init_cpu_for_irq()
2873 dev_err(hdev->dev, "No available affinities in current numa node\n"); in hl_init_cpu_for_irq()
2883 void hl_set_irq_affinity(struct hl_device *hdev, int irq) in hl_set_irq_affinity() argument
2885 if (cpumask_empty(&hdev->irq_affinity_mask)) { in hl_set_irq_affinity()
2886 dev_dbg(hdev->dev, "affinity mask is empty\n"); in hl_set_irq_affinity()
2890 if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask)) in hl_set_irq_affinity()
2891 dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq); in hl_set_irq_affinity()
2894 void hl_eq_heartbeat_event_handle(struct hl_device *hdev) in hl_eq_heartbeat_event_handle() argument
2896 hdev->heartbeat_debug_info.heartbeat_event_counter++; in hl_eq_heartbeat_event_handle()
2897 hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds(); in hl_eq_heartbeat_event_handle()
2898 hdev->eq_heartbeat_received = true; in hl_eq_heartbeat_event_handle()
2901 void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *event_mask) in hl_handle_clk_change_event() argument
2903 struct hl_clk_throttle *clk_throttle = &hdev->clk_throttling; in hl_handle_clk_change_event()
2914 dev_dbg_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); in hl_handle_clk_change_event()
2920 dev_dbg_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); in hl_handle_clk_change_event()
2929 dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n"); in hl_handle_clk_change_event()
2936 dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n"); in hl_handle_clk_change_event()
2940 dev_err(hdev->dev, "Received invalid clock change event %d\n", event_type); in hl_handle_clk_change_event()
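hl_handle_clk_change_event() switches on the event type and sets or clears the matching reason bit in the throttling state (timestamps omitted here). Sketch of the bit bookkeeping, with all names assumed:

        #include <stdio.h>

        #define THROTTLE_POWER   (1u << 0)
        #define THROTTLE_THERMAL (1u << 1)

        enum clk_event { POWER_START, POWER_END, THERMAL_START, THERMAL_END };

        static void handle_clk_event(unsigned *reason_mask, enum clk_event ev)
        {
                switch (ev) {
                case POWER_START:   *reason_mask |=  THROTTLE_POWER;   break;
                case POWER_END:     *reason_mask &= ~THROTTLE_POWER;   break;
                case THERMAL_START: *reason_mask |=  THROTTLE_THERMAL; break;
                case THERMAL_END:   *reason_mask &= ~THROTTLE_THERMAL; break;
                }
        }

        int main(void)
        {
                unsigned mask = 0;

                handle_clk_event(&mask, THERMAL_START);
                printf("mask=%#x\n", mask);
                return 0;
        }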