Merge tag 'drm-xe-next-2023-12-21-pr1-1' of https://gitlab.freedesktop.org/drm/xe...
author Dave Airlie <airlied@redhat.com>
Thu, 21 Dec 2023 21:55:59 +0000 (07:55 +1000)
committer Dave Airlie <airlied@redhat.com>
Fri, 22 Dec 2023 00:36:21 +0000 (10:36 +1000)
Introduce a new DRM driver for Intel GPUs

Xe is a new driver for Intel GPUs that supports both integrated and
discrete platforms. The experimental support starts with Tiger Lake.
i915 will continue to be the main production driver for the platforms
up to Meteor Lake and Alchemist. Then the goal is to make this Intel
Xe driver the primary driver for Lunar Lake and newer platforms.

It uses most, if not all, of the key DRM concepts, in particular TTM,
drm-scheduler, drm-exec, drm-gpuvm/gpuva and others.

Signed-off-by: Dave Airlie <airlied@redhat.com>
[airlied: add an extra X86 check, fix a typo, fix drm_exec_init interface
change].

From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/ZYSwLgXZUZ57qGPQ@intel.com
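
The "fix drm_exec_init interface change" fixup noted above reflects
drm_exec_init() in drm-next having gained a third parameter, an initial
number-of-objects hint; the conflict resolutions below simply pass 0 for it.
A minimal, self-contained sketch of a caller using the new form follows. The
function name and the GEM object are placeholders for illustration only, not
part of this series:

    #include <drm/drm_exec.h>
    #include <drm/drm_gem.h>

    /* Illustration only, not xe code: lock a single GEM object. */
    static int example_lock_one(struct drm_gem_object *obj)
    {
            struct drm_exec exec;
            int err = 0;

            /*
             * Previously: drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
             * the trailing 0 is the new initial object-count hint.
             */
            drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
            drm_exec_until_all_locked(&exec) {
                    err = drm_exec_lock_obj(&exec, obj);
                    /* Backs off and restarts the loop on ww-mutex contention. */
                    drm_exec_retry_on_contention(&exec);
                    if (err)
                            break;
            }
            drm_exec_fini(&exec);
            return err;
    }
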
MAINTAINERS
drivers/gpu/drm/xe/Kconfig
drivers/gpu/drm/xe/xe_exec.c
drivers/gpu/drm/xe/xe_gt_pagefault.c
drivers/gpu/drm/xe/xe_vm.c
include/drm/drm_gpuvm.h

diff --cc MAINTAINERS
Simple merge
diff --cc drivers/gpu/drm/xe/Kconfig
index 0000000000000000000000000000000000000000,5b3da06e7ba3043e8775401a313936ab1d7f7e83..90c676d14c509a6c30607f89d9b236ad1a5a2227
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,96 +1,96 @@@
+ # SPDX-License-Identifier: GPL-2.0-only
+ config DRM_XE
+       tristate "Intel Xe Graphics"
+       depends on DRM && PCI && MMU && (m || (y && KUNIT=y))
+       select INTERVAL_TREE
+       # we need shmfs for the swappable backing store, and in particular
+       # the shmem_readpage() which depends upon tmpfs
+       select SHMEM
+       select TMPFS
+       select DRM_BUDDY
+       select DRM_EXEC
+       select DRM_KMS_HELPER
+       select DRM_PANEL
+       select DRM_SUBALLOC_HELPER
+       select DRM_DISPLAY_DP_HELPER
+       select DRM_DISPLAY_HDCP_HELPER
+       select DRM_DISPLAY_HDMI_HELPER
+       select DRM_DISPLAY_HELPER
+       select DRM_MIPI_DSI
+       select RELAY
+       select IRQ_WORK
 -      # i915 depends on ACPI_VIDEO when ACPI is enabled
++      # xe depends on ACPI_VIDEO when ACPI is enabled
+       # but for select to work, need to select ACPI_VIDEO's dependencies, ick
+       select BACKLIGHT_CLASS_DEVICE if ACPI
+       select INPUT if ACPI
+       select ACPI_VIDEO if X86 && ACPI
+       select ACPI_BUTTON if ACPI
 -      select ACPI_WMI if ACPI
++      select ACPI_WMI if X86 && ACPI
+       select SYNC_FILE
+       select IOSF_MBI
+       select CRC32
+       select SND_HDA_I915 if SND_HDA_CORE
+       select CEC_CORE if CEC_NOTIFIER
+       select VMAP_PFN
+       select DRM_TTM
+       select DRM_TTM_HELPER
+       select DRM_EXEC
+       select DRM_GPUVM
+       select DRM_SCHED
+       select MMU_NOTIFIER
+       select WANT_DEV_COREDUMP
+       select AUXILIARY_BUS
+       help
+         Experimental driver for Intel Xe series GPUs
+         If "M" is selected, the module will be called xe.
+ config DRM_XE_DISPLAY
+       bool "Enable display support"
+       depends on DRM_XE && EXPERT && DRM_XE=m
+       select FB_IOMEM_HELPERS
+       select I2C
+       select I2C_ALGOBIT
+       default y
+       help
+         Disable this option only if you want to compile out display support.
+ config DRM_XE_FORCE_PROBE
+       string "Force probe xe for selected Intel hardware IDs"
+       depends on DRM_XE
+       help
+         This is the default value for the xe.force_probe module
+         parameter. Using the module parameter overrides this option.
+         Force probe the xe for Intel graphics devices that are
+         recognized but not properly supported by this kernel version. It is
+         recommended to upgrade to a kernel version with proper support as soon
+         as it is available.
+         It can also be used to block the probe of recognized and fully
+         supported devices.
+         Use "" to disable force probe. If in doubt, use this.
+         Use "<pci-id>[,<pci-id>,...]" to force probe the xe for listed
+         devices. For example, "4500" or "4500,4571".
+         Use "*" to force probe the driver for all known devices.
+         Use "!" right before the ID to block the probe of the device. For
+         example, "4500,!4571" forces the probe of 4500 and blocks the probe of
+         4571.
+         Use "!*" to block the probe of the driver for all known devices.
+ menu "drm/Xe Debugging"
+ depends on DRM_XE
+ depends on EXPERT
+ source "drivers/gpu/drm/xe/Kconfig.debug"
+ endmenu
+ menu "drm/xe Profile Guided Optimisation"
+       visible if EXPERT
+       depends on DRM_XE
+       source "drivers/gpu/drm/xe/Kconfig.profile"
+ endmenu
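
As a usage note for the DRM_XE_FORCE_PROBE help text above: the same string
can be supplied at runtime through the xe.force_probe module parameter, for
example on the kernel command line or when loading the module by hand. The
PCI IDs below are taken from the help text and are purely illustrative:

    # kernel command line: force probe 4500, block 4571
    xe.force_probe=4500,!4571

    # equivalent when loading the module manually
    modprobe xe force_probe='4500,!4571'
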
diff --cc drivers/gpu/drm/xe/xe_exec.c
index 0000000000000000000000000000000000000000,ba92e5619da39007ac829136475b9e9049b28387..d30c0d0689bcc7d4ae55cdd7fc93b116826160e6
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,350 +1,350 @@@
+ // SPDX-License-Identifier: MIT
+ /*
+  * Copyright © 2022 Intel Corporation
+  */
+ #include "xe_exec.h"
+ #include <drm/drm_device.h>
+ #include <drm/drm_exec.h>
+ #include <drm/drm_file.h>
+ #include <drm/xe_drm.h>
+ #include <linux/delay.h>
+ #include "xe_bo.h"
+ #include "xe_device.h"
+ #include "xe_exec_queue.h"
+ #include "xe_macros.h"
+ #include "xe_ring_ops_types.h"
+ #include "xe_sched_job.h"
+ #include "xe_sync.h"
+ #include "xe_vm.h"
+ /**
+  * DOC: Execbuf (User GPU command submission)
+  *
+  * Execs have historically been rather complicated in DRM drivers (at least in
+  * the i915) because of a few things:
+  *
+  * - Passing in a list of BOs which are read / written to, creating implicit syncs
+  * - Binding at exec time
+  * - Flow controlling the ring at exec time
+  *
+  * In XE we avoid all of this complication by not allowing a BO list to be
+  * passed into an exec, using the dma-buf implicit sync uAPI, having binds as
+  * separate operations, and using the DRM scheduler to flow control the ring.
+  * Let's take a deep dive into each of these.
+  *
+  * We can get away from a BO list by forcing the user to use in / out fences on
+  * every exec rather than the kernel tracking dependencies of BOs (e.g. if the
+  * user knows an exec writes to a BO and reads from the BO in the next exec, it
+  * is the user's responsibility to pass an in / out fence between the two execs).
+  *
+  * Implicit dependencies for external BOs are handled by using the dma-buf
+  * implicit dependency uAPI (TODO: add link). To make this work, each exec must
+  * install the job's fence into the DMA_RESV_USAGE_WRITE slot of every external
+  * BO mapped in the VM.
+  *
+  * We do not allow a user to trigger a bind at exec time; rather, we have a VM
+  * bind IOCTL which uses the same in / out fence interface as exec. In that
+  * sense, a VM bind is basically the same operation as an exec from the user
+  * perspective, e.g. if an exec depends on a VM bind, use the in / out fence
+  * interface (struct drm_xe_sync) to synchronize, just like syncing between two
+  * dependent execs.
+  *
+  * Although a user cannot trigger a bind, we still have to rebind userptrs in
+  * the VM that have been invalidated since the last exec; likewise, we also have
+  * to rebind BOs that have been evicted by the kernel. We schedule these rebinds
+  * behind any pending kernel operations on any external BOs in the VM or any BOs
+  * private to the VM. This is accomplished by the rebinds waiting on the BOs'
+  * DMA_RESV_USAGE_KERNEL slot (kernel ops) and kernel ops waiting on all BOs'
+  * slots (in-flight execs are in the DMA_RESV_USAGE_BOOKKEEP slot for private
+  * BOs and in DMA_RESV_USAGE_WRITE for external BOs).
+  *
+  * Rebinds / dma-resv usage applies to non-compute mode VMs only; for compute
+  * mode VMs we use preempt fences and a rebind worker (TODO: add link).
+  *
+  * There is no need to flow control the ring in the exec as we write the ring at
+  * submission time and set the DRM scheduler max job limit to SIZE_OF_RING /
+  * MAX_JOB_SIZE. The DRM scheduler will then hold all jobs until space in the
+  * ring is available.
+  *
+  * All of this results in a rather simple exec implementation.
+  *
+  * Flow
+  * ~~~~
+  *
+  * .. code-block::
+  *
+  *    Parse input arguments
+  *    Wait for any async VM bind passed as in-fences to start
+  *    <----------------------------------------------------------------------|
+  *    Lock global VM lock in read mode                                       |
+  *    Pin userptrs (also finds userptr invalidated since last exec)          |
+  *    Lock exec (VM dma-resv lock, external BOs dma-resv locks)              |
+  *    Validate BOs that have been evicted                                    |
+  *    Create job                                                             |
+  *    Rebind invalidated userptrs + evicted BOs (non-compute-mode)           |
+  *    Add rebind fence dependency to job                                     |
+  *    Add job VM dma-resv bookkeeping slot (non-compute mode)                |
+  *    Add job to external BOs dma-resv write slots (non-compute mode)        |
+  *    Check if any userptrs invalidated since pin ------ Drop locks ---------|
+  *    Install in / out fences for job
+  *    Submit job
+  *    Unlock all
+  */
+ static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec)
+ {
+       return drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec);
+ }
+ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
+ {
+       struct xe_device *xe = to_xe_device(dev);
+       struct xe_file *xef = to_xe_file(file);
+       struct drm_xe_exec *args = data;
+       struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
+       u64 __user *addresses_user = u64_to_user_ptr(args->address);
+       struct xe_exec_queue *q;
+       struct xe_sync_entry *syncs = NULL;
+       u64 addresses[XE_HW_ENGINE_MAX_INSTANCE];
+       struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
+       struct drm_exec *exec = &vm_exec.exec;
+       u32 i, num_syncs = 0;
+       struct xe_sched_job *job;
+       struct dma_fence *rebind_fence;
+       struct xe_vm *vm;
+       bool write_locked;
+       ktime_t end = 0;
+       int err = 0;
+       if (XE_IOCTL_DBG(xe, args->extensions) ||
+           XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
+           XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
+               return -EINVAL;
+       q = xe_exec_queue_lookup(xef, args->exec_queue_id);
+       if (XE_IOCTL_DBG(xe, !q))
+               return -ENOENT;
+       if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_VM))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, args->num_batch_buffer &&
+                        q->width != args->num_batch_buffer))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_BANNED)) {
+               err = -ECANCELED;
+               goto err_exec_queue;
+       }
+       if (args->num_syncs) {
+               syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
+               if (!syncs) {
+                       err = -ENOMEM;
+                       goto err_exec_queue;
+               }
+       }
+       vm = q->vm;
+       for (i = 0; i < args->num_syncs; i++) {
+               err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
+                                         &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
+                                         (xe_vm_in_lr_mode(vm) ?
+                                          SYNC_PARSE_FLAG_LR_MODE : 0));
+               if (err)
+                       goto err_syncs;
+       }
+       if (xe_exec_queue_is_parallel(q)) {
+               err = __copy_from_user(addresses, addresses_user, sizeof(u64) *
+                                      q->width);
+               if (err) {
+                       err = -EFAULT;
+                       goto err_syncs;
+               }
+       }
+ retry:
+       if (!xe_vm_in_lr_mode(vm) && xe_vm_userptr_check_repin(vm)) {
+               err = down_write_killable(&vm->lock);
+               write_locked = true;
+       } else {
+               /* We don't allow execs while the VM is in error state */
+               err = down_read_interruptible(&vm->lock);
+               write_locked = false;
+       }
+       if (err)
+               goto err_syncs;
+       if (write_locked) {
+               err = xe_vm_userptr_pin(vm);
+               downgrade_write(&vm->lock);
+               write_locked = false;
+               if (err)
+                       goto err_unlock_list;
+       }
+       vm_exec.vm = &vm->gpuvm;
+       vm_exec.num_fences = 1 + vm->xe->info.tile_count;
+       vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
+       if (xe_vm_in_lr_mode(vm)) {
 -              drm_exec_init(exec, vm_exec.flags);
++              drm_exec_init(exec, vm_exec.flags, 0);
+       } else {
+               err = drm_gpuvm_exec_lock(&vm_exec);
+               if (err) {
+                       if (xe_vm_validate_should_retry(exec, err, &end))
+                               err = -EAGAIN;
+                       goto err_unlock_list;
+               }
+       }
+       if (xe_vm_is_closed_or_banned(q->vm)) {
+               drm_warn(&xe->drm, "Trying to schedule after vm is closed or banned\n");
+               err = -ECANCELED;
+               goto err_exec;
+       }
+       if (!args->num_batch_buffer) {
+               if (!xe_vm_in_lr_mode(vm)) {
+                       struct dma_fence *fence;
+                       fence = xe_sync_in_fence_get(syncs, num_syncs, q, vm);
+                       if (IS_ERR(fence)) {
+                               err = PTR_ERR(fence);
+                               goto err_exec;
+                       }
+                       for (i = 0; i < num_syncs; i++)
+                               xe_sync_entry_signal(&syncs[i], NULL, fence);
+                       xe_exec_queue_last_fence_set(q, vm, fence);
+                       dma_fence_put(fence);
+               }
+               goto err_exec;
+       }
+       if (xe_exec_queue_is_lr(q) && xe_exec_queue_ring_full(q)) {
+               err = -EWOULDBLOCK;
+               goto err_exec;
+       }
+       job = xe_sched_job_create(q, xe_exec_queue_is_parallel(q) ?
+                                 addresses : &args->address);
+       if (IS_ERR(job)) {
+               err = PTR_ERR(job);
+               goto err_exec;
+       }
+       /*
+        * Rebind any invalidated userptr or evicted BOs in the VM, non-compute
+        * VM mode only.
+        */
+       rebind_fence = xe_vm_rebind(vm, false);
+       if (IS_ERR(rebind_fence)) {
+               err = PTR_ERR(rebind_fence);
+               goto err_put_job;
+       }
+       /*
+        * We store the rebind_fence in the VM so subsequent execs don't get
+        * scheduled before the rebinds of userptrs / evicted BOs is complete.
+        */
+       if (rebind_fence) {
+               dma_fence_put(vm->rebind_fence);
+               vm->rebind_fence = rebind_fence;
+       }
+       if (vm->rebind_fence) {
+               if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+                            &vm->rebind_fence->flags)) {
+                       dma_fence_put(vm->rebind_fence);
+                       vm->rebind_fence = NULL;
+               } else {
+                       dma_fence_get(vm->rebind_fence);
+                       err = drm_sched_job_add_dependency(&job->drm,
+                                                          vm->rebind_fence);
+                       if (err)
+                               goto err_put_job;
+               }
+       }
+       /* Wait behind munmap style rebinds */
+       if (!xe_vm_in_lr_mode(vm)) {
+               err = drm_sched_job_add_resv_dependencies(&job->drm,
+                                                         xe_vm_resv(vm),
+                                                         DMA_RESV_USAGE_KERNEL);
+               if (err)
+                       goto err_put_job;
+       }
+       for (i = 0; i < num_syncs && !err; i++)
+               err = xe_sync_entry_add_deps(&syncs[i], job);
+       if (err)
+               goto err_put_job;
+       if (!xe_vm_in_lr_mode(vm)) {
+               err = xe_sched_job_last_fence_add_dep(job, vm);
+               if (err)
+                       goto err_put_job;
+               err = down_read_interruptible(&vm->userptr.notifier_lock);
+               if (err)
+                       goto err_put_job;
+               err = __xe_vm_userptr_needs_repin(vm);
+               if (err)
+                       goto err_repin;
+       }
+       /*
+        * Point of no return, if we error after this point just set an error on
+        * the job and let the DRM scheduler / backend clean up the job.
+        */
+       xe_sched_job_arm(job);
+       if (!xe_vm_in_lr_mode(vm))
+               drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, &job->drm.s_fence->finished,
+                                        DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_WRITE);
+       for (i = 0; i < num_syncs; i++)
+               xe_sync_entry_signal(&syncs[i], job,
+                                    &job->drm.s_fence->finished);
+       if (xe_exec_queue_is_lr(q))
+               q->ring_ops->emit_job(job);
+       if (!xe_vm_in_lr_mode(vm))
+               xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
+       xe_sched_job_push(job);
+       xe_vm_reactivate_rebind(vm);
+       if (!err && !xe_vm_in_lr_mode(vm)) {
+               spin_lock(&xe->ttm.lru_lock);
+               ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
+               spin_unlock(&xe->ttm.lru_lock);
+       }
+ err_repin:
+       if (!xe_vm_in_lr_mode(vm))
+               up_read(&vm->userptr.notifier_lock);
+ err_put_job:
+       if (err)
+               xe_sched_job_put(job);
+ err_exec:
+       drm_exec_fini(exec);
+ err_unlock_list:
+       if (write_locked)
+               up_write(&vm->lock);
+       else
+               up_read(&vm->lock);
+       if (err == -EAGAIN)
+               goto retry;
+ err_syncs:
+       for (i = 0; i < num_syncs; i++)
+               xe_sync_entry_cleanup(&syncs[i]);
+       kfree(syncs);
+ err_exec_queue:
+       xe_exec_queue_put(q);
+       return err;
+ }
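
As an aside for readers unfamiliar with the drm_gpuvm_exec helper that
xe_exec_ioctl() above builds on: it wraps the drm_exec loop, locking the VM's
reservation object plus all external BOs and invoking an optional driver
callback (xe_exec_fn() above) with those locks held. A stripped-down,
illustrative sketch of that pattern follows; the function names are
placeholders, not xe code, and error handling is condensed:

    #include <drm/drm_exec.h>
    #include <drm/drm_gpuvm.h>

    /* Illustration only: lock VM resv + external BOs, run a callback, unlock. */
    static int example_lock_and_run(struct drm_gpuvm *gpuvm,
                                    int (*fn)(struct drm_gpuvm_exec *vm_exec))
    {
            struct drm_gpuvm_exec vm_exec = {
                    .vm = gpuvm,
                    .flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
                    .num_fences = 1,        /* dma-resv fence slots to reserve */
                    .extra.fn = fn,         /* called with all locks held */
            };
            int err;

            err = drm_gpuvm_exec_lock(&vm_exec);    /* drm_exec loop inside */
            if (err)
                    return err;

            /* ... create, arm and push the job here, as xe_exec_ioctl() does ... */

            drm_exec_fini(&vm_exec.exec);
            return 0;
    }
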
diff --cc drivers/gpu/drm/xe/xe_gt_pagefault.c
index 0000000000000000000000000000000000000000,4489aadc7a525f4222c0d8841e3e2bdf2cfc1b5a..59a70d2e0a7a33386fdcfca9cc158919aab1e32c
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,646 +1,646 @@@
+ // SPDX-License-Identifier: MIT
+ /*
+  * Copyright © 2022 Intel Corporation
+  */
+ #include "xe_gt_pagefault.h"
+ #include <linux/bitfield.h>
+ #include <linux/circ_buf.h>
+ #include <drm/drm_exec.h>
+ #include <drm/drm_managed.h>
+ #include <drm/ttm/ttm_execbuf_util.h>
+ #include "abi/guc_actions_abi.h"
+ #include "xe_bo.h"
+ #include "xe_gt.h"
+ #include "xe_gt_tlb_invalidation.h"
+ #include "xe_guc.h"
+ #include "xe_guc_ct.h"
+ #include "xe_migrate.h"
+ #include "xe_pt.h"
+ #include "xe_trace.h"
+ #include "xe_vm.h"
+ struct pagefault {
+       u64 page_addr;
+       u32 asid;
+       u16 pdata;
+       u8 vfid;
+       u8 access_type;
+       u8 fault_type;
+       u8 fault_level;
+       u8 engine_class;
+       u8 engine_instance;
+       u8 fault_unsuccessful;
+       bool trva_fault;
+ };
+ enum access_type {
+       ACCESS_TYPE_READ = 0,
+       ACCESS_TYPE_WRITE = 1,
+       ACCESS_TYPE_ATOMIC = 2,
+       ACCESS_TYPE_RESERVED = 3,
+ };
+ enum fault_type {
+       NOT_PRESENT = 0,
+       WRITE_ACCESS_VIOLATION = 1,
+       ATOMIC_ACCESS_VIOLATION = 2,
+ };
+ struct acc {
+       u64 va_range_base;
+       u32 asid;
+       u32 sub_granularity;
+       u8 granularity;
+       u8 vfid;
+       u8 access_type;
+       u8 engine_class;
+       u8 engine_instance;
+ };
+ static bool access_is_atomic(enum access_type access_type)
+ {
+       return access_type == ACCESS_TYPE_ATOMIC;
+ }
+ static bool vma_is_valid(struct xe_tile *tile, struct xe_vma *vma)
+ {
+       return BIT(tile->id) & vma->tile_present &&
+               !(BIT(tile->id) & vma->usm.tile_invalidated);
+ }
+ static bool vma_matches(struct xe_vma *vma, u64 page_addr)
+ {
+       if (page_addr > xe_vma_end(vma) - 1 ||
+           page_addr + SZ_4K - 1 < xe_vma_start(vma))
+               return false;
+       return true;
+ }
+ static struct xe_vma *lookup_vma(struct xe_vm *vm, u64 page_addr)
+ {
+       struct xe_vma *vma = NULL;
+       if (vm->usm.last_fault_vma) {   /* Fast lookup */
+               if (vma_matches(vm->usm.last_fault_vma, page_addr))
+                       vma = vm->usm.last_fault_vma;
+       }
+       if (!vma)
+               vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
+       return vma;
+ }
+ static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma,
+                      bool atomic, unsigned int id)
+ {
+       struct xe_bo *bo = xe_vma_bo(vma);
+       struct xe_vm *vm = xe_vma_vm(vma);
+       unsigned int num_shared = 2; /* slots for bind + move */
+       int err;
+       err = xe_vm_prepare_vma(exec, vma, num_shared);
+       if (err)
+               return err;
+       if (atomic && IS_DGFX(vm->xe)) {
+               if (xe_vma_is_userptr(vma)) {
+                       err = -EACCES;
+                       return err;
+               }
+               /* Migrate to VRAM, move should invalidate the VMA first */
+               err = xe_bo_migrate(bo, XE_PL_VRAM0 + id);
+               if (err)
+                       return err;
+       } else if (bo) {
+               /* Create backing store if needed */
+               err = xe_bo_validate(bo, vm, true);
+               if (err)
+                       return err;
+       }
+       return 0;
+ }
+ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
+ {
+       struct xe_device *xe = gt_to_xe(gt);
+       struct xe_tile *tile = gt_to_tile(gt);
+       struct drm_exec exec;
+       struct xe_vm *vm;
+       struct xe_vma *vma = NULL;
+       struct dma_fence *fence;
+       bool write_locked;
+       int ret = 0;
+       bool atomic;
+       /* SW isn't expected to handle TRTT faults */
+       if (pf->trva_fault)
+               return -EFAULT;
+       /* ASID to VM */
+       mutex_lock(&xe->usm.lock);
+       vm = xa_load(&xe->usm.asid_to_vm, pf->asid);
+       if (vm)
+               xe_vm_get(vm);
+       mutex_unlock(&xe->usm.lock);
+       if (!vm || !xe_vm_in_fault_mode(vm))
+               return -EINVAL;
+ retry_userptr:
+       /*
+        * TODO: Avoid exclusive lock if VM doesn't have userptrs, or
+        * start out read-locked?
+        */
+       down_write(&vm->lock);
+       write_locked = true;
+       vma = lookup_vma(vm, pf->page_addr);
+       if (!vma) {
+               ret = -EINVAL;
+               goto unlock_vm;
+       }
+       if (!xe_vma_is_userptr(vma) || !xe_vma_userptr_check_repin(vma)) {
+               downgrade_write(&vm->lock);
+               write_locked = false;
+       }
+       trace_xe_vma_pagefault(vma);
+       atomic = access_is_atomic(pf->access_type);
+       /* Check if VMA is valid */
+       if (vma_is_valid(tile, vma) && !atomic)
+               goto unlock_vm;
+       /* TODO: Validate fault */
+       if (xe_vma_is_userptr(vma) && write_locked) {
+               spin_lock(&vm->userptr.invalidated_lock);
+               list_del_init(&vma->userptr.invalidate_link);
+               spin_unlock(&vm->userptr.invalidated_lock);
+               ret = xe_vma_userptr_pin_pages(vma);
+               if (ret)
+                       goto unlock_vm;
+               downgrade_write(&vm->lock);
+               write_locked = false;
+       }
+       /* Lock VM and BOs dma-resv */
 -      drm_exec_init(&exec, 0);
++      drm_exec_init(&exec, 0, 0);
+       drm_exec_until_all_locked(&exec) {
+               ret = xe_pf_begin(&exec, vma, atomic, tile->id);
+               drm_exec_retry_on_contention(&exec);
+               if (ret)
+                       goto unlock_dma_resv;
+       }
+       /* Bind VMA only to the GT that has faulted */
+       trace_xe_vma_pf_bind(vma);
+       fence = __xe_pt_bind_vma(tile, vma, xe_tile_migrate_engine(tile), NULL, 0,
+                                vma->tile_present & BIT(tile->id));
+       if (IS_ERR(fence)) {
+               ret = PTR_ERR(fence);
+               goto unlock_dma_resv;
+       }
+       /*
+        * XXX: Should we drop the lock before waiting? This only helps if doing
+        * GPU binds which is currently only done if we have to wait for more
+        * than 10ms on a move.
+        */
+       dma_fence_wait(fence, false);
+       dma_fence_put(fence);
+       if (xe_vma_is_userptr(vma))
+               ret = xe_vma_userptr_check_repin(vma);
+       vma->usm.tile_invalidated &= ~BIT(tile->id);
+ unlock_dma_resv:
+       drm_exec_fini(&exec);
+ unlock_vm:
+       if (!ret)
+               vm->usm.last_fault_vma = vma;
+       if (write_locked)
+               up_write(&vm->lock);
+       else
+               up_read(&vm->lock);
+       if (ret == -EAGAIN)
+               goto retry_userptr;
+       if (!ret) {
+               ret = xe_gt_tlb_invalidation_vma(gt, NULL, vma);
+               if (ret >= 0)
+                       ret = 0;
+       }
+       xe_vm_put(vm);
+       return ret;
+ }
+ static int send_pagefault_reply(struct xe_guc *guc,
+                               struct xe_guc_pagefault_reply *reply)
+ {
+       u32 action[] = {
+               XE_GUC_ACTION_PAGE_FAULT_RES_DESC,
+               reply->dw0,
+               reply->dw1,
+       };
+       return xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), 0, 0);
+ }
+ static void print_pagefault(struct xe_device *xe, struct pagefault *pf)
+ {
+       drm_dbg(&xe->drm, "\n\tASID: %d\n"
+                "\tVFID: %d\n"
+                "\tPDATA: 0x%04x\n"
+                "\tFaulted Address: 0x%08x%08x\n"
+                "\tFaultType: %d\n"
+                "\tAccessType: %d\n"
+                "\tFaultLevel: %d\n"
+                "\tEngineClass: %d\n"
+                "\tEngineInstance: %d\n",
+                pf->asid, pf->vfid, pf->pdata, upper_32_bits(pf->page_addr),
+                lower_32_bits(pf->page_addr),
+                pf->fault_type, pf->access_type, pf->fault_level,
+                pf->engine_class, pf->engine_instance);
+ }
+ #define PF_MSG_LEN_DW 4
+ static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf)
+ {
+       const struct xe_guc_pagefault_desc *desc;
+       bool ret = false;
+       spin_lock_irq(&pf_queue->lock);
+       if (pf_queue->head != pf_queue->tail) {
+               desc = (const struct xe_guc_pagefault_desc *)
+                       (pf_queue->data + pf_queue->head);
+               pf->fault_level = FIELD_GET(PFD_FAULT_LEVEL, desc->dw0);
+               pf->trva_fault = FIELD_GET(XE2_PFD_TRVA_FAULT, desc->dw0);
+               pf->engine_class = FIELD_GET(PFD_ENG_CLASS, desc->dw0);
+               pf->engine_instance = FIELD_GET(PFD_ENG_INSTANCE, desc->dw0);
+               pf->pdata = FIELD_GET(PFD_PDATA_HI, desc->dw1) <<
+                       PFD_PDATA_HI_SHIFT;
+               pf->pdata |= FIELD_GET(PFD_PDATA_LO, desc->dw0);
+               pf->asid = FIELD_GET(PFD_ASID, desc->dw1);
+               pf->vfid = FIELD_GET(PFD_VFID, desc->dw2);
+               pf->access_type = FIELD_GET(PFD_ACCESS_TYPE, desc->dw2);
+               pf->fault_type = FIELD_GET(PFD_FAULT_TYPE, desc->dw2);
+               pf->page_addr = (u64)(FIELD_GET(PFD_VIRTUAL_ADDR_HI, desc->dw3)) <<
+                       PFD_VIRTUAL_ADDR_HI_SHIFT;
+               pf->page_addr |= FIELD_GET(PFD_VIRTUAL_ADDR_LO, desc->dw2) <<
+                       PFD_VIRTUAL_ADDR_LO_SHIFT;
+               pf_queue->head = (pf_queue->head + PF_MSG_LEN_DW) %
+                       PF_QUEUE_NUM_DW;
+               ret = true;
+       }
+       spin_unlock_irq(&pf_queue->lock);
+       return ret;
+ }
+ static bool pf_queue_full(struct pf_queue *pf_queue)
+ {
+       lockdep_assert_held(&pf_queue->lock);
+       return CIRC_SPACE(pf_queue->tail, pf_queue->head, PF_QUEUE_NUM_DW) <=
+               PF_MSG_LEN_DW;
+ }
+ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
+ {
+       struct xe_gt *gt = guc_to_gt(guc);
+       struct xe_device *xe = gt_to_xe(gt);
+       struct pf_queue *pf_queue;
+       unsigned long flags;
+       u32 asid;
+       bool full;
+       if (unlikely(len != PF_MSG_LEN_DW))
+               return -EPROTO;
+       asid = FIELD_GET(PFD_ASID, msg[1]);
+       pf_queue = &gt->usm.pf_queue[asid % NUM_PF_QUEUE];
+       spin_lock_irqsave(&pf_queue->lock, flags);
+       full = pf_queue_full(pf_queue);
+       if (!full) {
+               memcpy(pf_queue->data + pf_queue->tail, msg, len * sizeof(u32));
+               pf_queue->tail = (pf_queue->tail + len) % PF_QUEUE_NUM_DW;
+               queue_work(gt->usm.pf_wq, &pf_queue->worker);
+       } else {
+               drm_warn(&xe->drm, "PF Queue full, shouldn't be possible");
+       }
+       spin_unlock_irqrestore(&pf_queue->lock, flags);
+       return full ? -ENOSPC : 0;
+ }
+ #define USM_QUEUE_MAX_RUNTIME_MS      20
+ static void pf_queue_work_func(struct work_struct *w)
+ {
+       struct pf_queue *pf_queue = container_of(w, struct pf_queue, worker);
+       struct xe_gt *gt = pf_queue->gt;
+       struct xe_device *xe = gt_to_xe(gt);
+       struct xe_guc_pagefault_reply reply = {};
+       struct pagefault pf = {};
+       unsigned long threshold;
+       int ret;
+       threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);
+       while (get_pagefault(pf_queue, &pf)) {
+               ret = handle_pagefault(gt, &pf);
+               if (unlikely(ret)) {
+                       print_pagefault(xe, &pf);
+                       pf.fault_unsuccessful = 1;
+                       drm_dbg(&xe->drm, "Fault response: Unsuccessful %d\n", ret);
+               }
+               reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
+                       FIELD_PREP(PFR_SUCCESS, pf.fault_unsuccessful) |
+                       FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
+                       FIELD_PREP(PFR_DESC_TYPE, FAULT_RESPONSE_DESC) |
+                       FIELD_PREP(PFR_ASID, pf.asid);
+               reply.dw1 = FIELD_PREP(PFR_VFID, pf.vfid) |
+                       FIELD_PREP(PFR_ENG_INSTANCE, pf.engine_instance) |
+                       FIELD_PREP(PFR_ENG_CLASS, pf.engine_class) |
+                       FIELD_PREP(PFR_PDATA, pf.pdata);
+               send_pagefault_reply(&gt->uc.guc, &reply);
+               if (time_after(jiffies, threshold) &&
+                   pf_queue->head != pf_queue->tail) {
+                       queue_work(gt->usm.pf_wq, w);
+                       break;
+               }
+       }
+ }
+ static void acc_queue_work_func(struct work_struct *w);
+ int xe_gt_pagefault_init(struct xe_gt *gt)
+ {
+       struct xe_device *xe = gt_to_xe(gt);
+       int i;
+       if (!xe->info.has_usm)
+               return 0;
+       for (i = 0; i < NUM_PF_QUEUE; ++i) {
+               gt->usm.pf_queue[i].gt = gt;
+               spin_lock_init(&gt->usm.pf_queue[i].lock);
+               INIT_WORK(&gt->usm.pf_queue[i].worker, pf_queue_work_func);
+       }
+       for (i = 0; i < NUM_ACC_QUEUE; ++i) {
+               gt->usm.acc_queue[i].gt = gt;
+               spin_lock_init(&gt->usm.acc_queue[i].lock);
+               INIT_WORK(&gt->usm.acc_queue[i].worker, acc_queue_work_func);
+       }
+       gt->usm.pf_wq = alloc_workqueue("xe_gt_page_fault_work_queue",
+                                       WQ_UNBOUND | WQ_HIGHPRI, NUM_PF_QUEUE);
+       if (!gt->usm.pf_wq)
+               return -ENOMEM;
+       gt->usm.acc_wq = alloc_workqueue("xe_gt_access_counter_work_queue",
+                                        WQ_UNBOUND | WQ_HIGHPRI,
+                                        NUM_ACC_QUEUE);
+       if (!gt->usm.acc_wq)
+               return -ENOMEM;
+       return 0;
+ }
+ void xe_gt_pagefault_reset(struct xe_gt *gt)
+ {
+       struct xe_device *xe = gt_to_xe(gt);
+       int i;
+       if (!xe->info.has_usm)
+               return;
+       for (i = 0; i < NUM_PF_QUEUE; ++i) {
+               spin_lock_irq(&gt->usm.pf_queue[i].lock);
+               gt->usm.pf_queue[i].head = 0;
+               gt->usm.pf_queue[i].tail = 0;
+               spin_unlock_irq(&gt->usm.pf_queue[i].lock);
+       }
+       for (i = 0; i < NUM_ACC_QUEUE; ++i) {
+               spin_lock(&gt->usm.acc_queue[i].lock);
+               gt->usm.acc_queue[i].head = 0;
+               gt->usm.acc_queue[i].tail = 0;
+               spin_unlock(&gt->usm.acc_queue[i].lock);
+       }
+ }
+ static int granularity_in_byte(int val)
+ {
+       switch (val) {
+       case 0:
+               return SZ_128K;
+       case 1:
+               return SZ_2M;
+       case 2:
+               return SZ_16M;
+       case 3:
+               return SZ_64M;
+       default:
+               return 0;
+       }
+ }
+ static int sub_granularity_in_byte(int val)
+ {
+       return (granularity_in_byte(val) / 32);
+ }
+ static void print_acc(struct xe_device *xe, struct acc *acc)
+ {
+       drm_warn(&xe->drm, "Access counter request:\n"
+                "\tType: %s\n"
+                "\tASID: %d\n"
+                "\tVFID: %d\n"
+                "\tEngine: %d:%d\n"
+                "\tGranularity: 0x%x KB Region/ %d KB sub-granularity\n"
+                "\tSub_Granularity Vector: 0x%08x\n"
+                "\tVA Range base: 0x%016llx\n",
+                acc->access_type ? "AC_NTFY_VAL" : "AC_TRIG_VAL",
+                acc->asid, acc->vfid, acc->engine_class, acc->engine_instance,
+                granularity_in_byte(acc->granularity) / SZ_1K,
+                sub_granularity_in_byte(acc->granularity) / SZ_1K,
+                acc->sub_granularity, acc->va_range_base);
+ }
+ static struct xe_vma *get_acc_vma(struct xe_vm *vm, struct acc *acc)
+ {
+       u64 page_va = acc->va_range_base + (ffs(acc->sub_granularity) - 1) *
+               sub_granularity_in_byte(acc->granularity);
+       return xe_vm_find_overlapping_vma(vm, page_va, SZ_4K);
+ }
+ static int handle_acc(struct xe_gt *gt, struct acc *acc)
+ {
+       struct xe_device *xe = gt_to_xe(gt);
+       struct xe_tile *tile = gt_to_tile(gt);
+       struct drm_exec exec;
+       struct xe_vm *vm;
+       struct xe_vma *vma;
+       int ret = 0;
+       /* We only support ACC_TRIGGER at the moment */
+       if (acc->access_type != ACC_TRIGGER)
+               return -EINVAL;
+       /* ASID to VM */
+       mutex_lock(&xe->usm.lock);
+       vm = xa_load(&xe->usm.asid_to_vm, acc->asid);
+       if (vm)
+               xe_vm_get(vm);
+       mutex_unlock(&xe->usm.lock);
+       if (!vm || !xe_vm_in_fault_mode(vm))
+               return -EINVAL;
+       down_read(&vm->lock);
+       /* Lookup VMA */
+       vma = get_acc_vma(vm, acc);
+       if (!vma) {
+               ret = -EINVAL;
+               goto unlock_vm;
+       }
+       trace_xe_vma_acc(vma);
+       /* Userptr or null can't be migrated, nothing to do */
+       if (xe_vma_has_no_bo(vma))
+               goto unlock_vm;
+       /* Lock VM and BOs dma-resv */
 -      drm_exec_init(&exec, 0);
++      drm_exec_init(&exec, 0, 0);
+       drm_exec_until_all_locked(&exec) {
+               ret = xe_pf_begin(&exec, vma, true, tile->id);
+               drm_exec_retry_on_contention(&exec);
+               if (ret)
+                       break;
+       }
+       drm_exec_fini(&exec);
+ unlock_vm:
+       up_read(&vm->lock);
+       xe_vm_put(vm);
+       return ret;
+ }
+ #define make_u64(hi__, low__)  ((u64)(hi__) << 32 | (u64)(low__))
+ #define ACC_MSG_LEN_DW        4
+ static bool get_acc(struct acc_queue *acc_queue, struct acc *acc)
+ {
+       const struct xe_guc_acc_desc *desc;
+       bool ret = false;
+       spin_lock(&acc_queue->lock);
+       if (acc_queue->head != acc_queue->tail) {
+               desc = (const struct xe_guc_acc_desc *)
+                       (acc_queue->data + acc_queue->head);
+               acc->granularity = FIELD_GET(ACC_GRANULARITY, desc->dw2);
+               acc->sub_granularity = FIELD_GET(ACC_SUBG_HI, desc->dw1) << 31 |
+                       FIELD_GET(ACC_SUBG_LO, desc->dw0);
+               acc->engine_class = FIELD_GET(ACC_ENG_CLASS, desc->dw1);
+               acc->engine_instance = FIELD_GET(ACC_ENG_INSTANCE, desc->dw1);
+               acc->asid =  FIELD_GET(ACC_ASID, desc->dw1);
+               acc->vfid =  FIELD_GET(ACC_VFID, desc->dw2);
+               acc->access_type = FIELD_GET(ACC_TYPE, desc->dw0);
+               acc->va_range_base = make_u64(desc->dw3 & ACC_VIRTUAL_ADDR_RANGE_HI,
+                                             desc->dw2 & ACC_VIRTUAL_ADDR_RANGE_LO);
+               acc_queue->head = (acc_queue->head + ACC_MSG_LEN_DW) %
+                                 ACC_QUEUE_NUM_DW;
+               ret = true;
+       }
+       spin_unlock(&acc_queue->lock);
+       return ret;
+ }
+ static void acc_queue_work_func(struct work_struct *w)
+ {
+       struct acc_queue *acc_queue = container_of(w, struct acc_queue, worker);
+       struct xe_gt *gt = acc_queue->gt;
+       struct xe_device *xe = gt_to_xe(gt);
+       struct acc acc = {};
+       unsigned long threshold;
+       int ret;
+       threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);
+       while (get_acc(acc_queue, &acc)) {
+               ret = handle_acc(gt, &acc);
+               if (unlikely(ret)) {
+                       print_acc(xe, &acc);
+                       drm_warn(&xe->drm, "ACC: Unsuccessful %d\n", ret);
+               }
+               if (time_after(jiffies, threshold) &&
+                   acc_queue->head != acc_queue->tail) {
+                       queue_work(gt->usm.acc_wq, w);
+                       break;
+               }
+       }
+ }
+ static bool acc_queue_full(struct acc_queue *acc_queue)
+ {
+       lockdep_assert_held(&acc_queue->lock);
+       return CIRC_SPACE(acc_queue->tail, acc_queue->head, ACC_QUEUE_NUM_DW) <=
+               ACC_MSG_LEN_DW;
+ }
+ int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len)
+ {
+       struct xe_gt *gt = guc_to_gt(guc);
+       struct acc_queue *acc_queue;
+       u32 asid;
+       bool full;
+       if (unlikely(len != ACC_MSG_LEN_DW))
+               return -EPROTO;
+       asid = FIELD_GET(ACC_ASID, msg[1]);
+       acc_queue = &gt->usm.acc_queue[asid % NUM_ACC_QUEUE];
+       spin_lock(&acc_queue->lock);
+       full = acc_queue_full(acc_queue);
+       if (!full) {
+               memcpy(acc_queue->data + acc_queue->tail, msg,
+                      len * sizeof(u32));
+               acc_queue->tail = (acc_queue->tail + len) % ACC_QUEUE_NUM_DW;
+               queue_work(gt->usm.acc_wq, &acc_queue->worker);
+       } else {
+               drm_warn(&gt_to_xe(gt)->drm, "ACC Queue full, dropping ACC");
+       }
+       spin_unlock(&acc_queue->lock);
+       return full ? -ENOSPC : 0;
+ }
diff --cc drivers/gpu/drm/xe/xe_vm.c
index 0000000000000000000000000000000000000000,322c1ecceccac528b15ee33ceecaaec40b7baf9a..9180f2d2d71d27daa6d8fb4fb40256edbfc6dda8
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,3206 +1,3206 @@@
 -      drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
+ // SPDX-License-Identifier: MIT
+ /*
+  * Copyright © 2021 Intel Corporation
+  */
+ #include "xe_vm.h"
+ #include <linux/dma-fence-array.h>
+ #include <linux/nospec.h>
+ #include <drm/drm_exec.h>
+ #include <drm/drm_print.h>
+ #include <drm/ttm/ttm_execbuf_util.h>
+ #include <drm/ttm/ttm_tt.h>
+ #include <drm/xe_drm.h>
+ #include <linux/delay.h>
+ #include <linux/kthread.h>
+ #include <linux/mm.h>
+ #include <linux/swap.h>
+ #include "xe_assert.h"
+ #include "xe_bo.h"
+ #include "xe_device.h"
+ #include "xe_drm_client.h"
+ #include "xe_exec_queue.h"
+ #include "xe_gt.h"
+ #include "xe_gt_pagefault.h"
+ #include "xe_gt_tlb_invalidation.h"
+ #include "xe_migrate.h"
+ #include "xe_pat.h"
+ #include "xe_pm.h"
+ #include "xe_preempt_fence.h"
+ #include "xe_pt.h"
+ #include "xe_res_cursor.h"
+ #include "xe_sync.h"
+ #include "xe_trace.h"
+ #include "generated/xe_wa_oob.h"
+ #include "xe_wa.h"
+ #define TEST_VM_ASYNC_OPS_ERROR
+ static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
+ {
+       return vm->gpuvm.r_obj;
+ }
+ /**
+  * xe_vma_userptr_check_repin() - Advisory check for repin needed
+  * @vma: The userptr vma
+  *
+  * Check if the userptr vma has been invalidated since last successful
+  * repin. The check is advisory only and the function can be called
+  * without the vm->userptr.notifier_lock held. There is no guarantee that the
+  * vma userptr will remain valid after a lockless check, so typically
+  * the call needs to be followed by a proper check under the notifier_lock.
+  *
+  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
+  */
+ int xe_vma_userptr_check_repin(struct xe_vma *vma)
+ {
+       return mmu_interval_check_retry(&vma->userptr.notifier,
+                                       vma->userptr.notifier_seq) ?
+               -EAGAIN : 0;
+ }
+ int xe_vma_userptr_pin_pages(struct xe_vma *vma)
+ {
+       struct xe_vm *vm = xe_vma_vm(vma);
+       struct xe_device *xe = vm->xe;
+       const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
+       struct page **pages;
+       bool in_kthread = !current->mm;
+       unsigned long notifier_seq;
+       int pinned, ret, i;
+       bool read_only = xe_vma_read_only(vma);
+       lockdep_assert_held(&vm->lock);
+       xe_assert(xe, xe_vma_is_userptr(vma));
+ retry:
+       if (vma->gpuva.flags & XE_VMA_DESTROYED)
+               return 0;
+       notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
+       if (notifier_seq == vma->userptr.notifier_seq)
+               return 0;
+       pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
+       if (!pages)
+               return -ENOMEM;
+       if (vma->userptr.sg) {
+               dma_unmap_sgtable(xe->drm.dev,
+                                 vma->userptr.sg,
+                                 read_only ? DMA_TO_DEVICE :
+                                 DMA_BIDIRECTIONAL, 0);
+               sg_free_table(vma->userptr.sg);
+               vma->userptr.sg = NULL;
+       }
+       pinned = ret = 0;
+       if (in_kthread) {
+               if (!mmget_not_zero(vma->userptr.notifier.mm)) {
+                       ret = -EFAULT;
+                       goto mm_closed;
+               }
+               kthread_use_mm(vma->userptr.notifier.mm);
+       }
+       while (pinned < num_pages) {
+               ret = get_user_pages_fast(xe_vma_userptr(vma) +
+                                         pinned * PAGE_SIZE,
+                                         num_pages - pinned,
+                                         read_only ? 0 : FOLL_WRITE,
+                                         &pages[pinned]);
+               if (ret < 0) {
+                       if (in_kthread)
+                               ret = 0;
+                       break;
+               }
+               pinned += ret;
+               ret = 0;
+       }
+       if (in_kthread) {
+               kthread_unuse_mm(vma->userptr.notifier.mm);
+               mmput(vma->userptr.notifier.mm);
+       }
+ mm_closed:
+       if (ret)
+               goto out;
+       ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages,
+                                               pinned, 0,
+                                               (u64)pinned << PAGE_SHIFT,
+                                               xe_sg_segment_size(xe->drm.dev),
+                                               GFP_KERNEL);
+       if (ret) {
+               vma->userptr.sg = NULL;
+               goto out;
+       }
+       vma->userptr.sg = &vma->userptr.sgt;
+       ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg,
+                             read_only ? DMA_TO_DEVICE :
+                             DMA_BIDIRECTIONAL,
+                             DMA_ATTR_SKIP_CPU_SYNC |
+                             DMA_ATTR_NO_KERNEL_MAPPING);
+       if (ret) {
+               sg_free_table(vma->userptr.sg);
+               vma->userptr.sg = NULL;
+               goto out;
+       }
+       for (i = 0; i < pinned; ++i) {
+               if (!read_only) {
+                       lock_page(pages[i]);
+                       set_page_dirty(pages[i]);
+                       unlock_page(pages[i]);
+               }
+               mark_page_accessed(pages[i]);
+       }
+ out:
+       release_pages(pages, pinned);
+       kvfree(pages);
+       if (!(ret < 0)) {
+               vma->userptr.notifier_seq = notifier_seq;
+               if (xe_vma_userptr_check_repin(vma) == -EAGAIN)
+                       goto retry;
+       }
+       return ret < 0 ? ret : 0;
+ }
+ static bool preempt_fences_waiting(struct xe_vm *vm)
+ {
+       struct xe_exec_queue *q;
+       lockdep_assert_held(&vm->lock);
+       xe_vm_assert_held(vm);
+       list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
+               if (!q->compute.pfence ||
+                   (q->compute.pfence && test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+                                                  &q->compute.pfence->flags))) {
+                       return true;
+               }
+       }
+       return false;
+ }
+ static void free_preempt_fences(struct list_head *list)
+ {
+       struct list_head *link, *next;
+       list_for_each_safe(link, next, list)
+               xe_preempt_fence_free(to_preempt_fence_from_link(link));
+ }
+ static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
+                               unsigned int *count)
+ {
+       lockdep_assert_held(&vm->lock);
+       xe_vm_assert_held(vm);
+       if (*count >= vm->preempt.num_exec_queues)
+               return 0;
+       for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
+               struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
+               if (IS_ERR(pfence))
+                       return PTR_ERR(pfence);
+               list_move_tail(xe_preempt_fence_link(pfence), list);
+       }
+       return 0;
+ }
+ static int wait_for_existing_preempt_fences(struct xe_vm *vm)
+ {
+       struct xe_exec_queue *q;
+       xe_vm_assert_held(vm);
+       list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
+               if (q->compute.pfence) {
+                       long timeout = dma_fence_wait(q->compute.pfence, false);
+                       if (timeout < 0)
+                               return -ETIME;
+                       dma_fence_put(q->compute.pfence);
+                       q->compute.pfence = NULL;
+               }
+       }
+       return 0;
+ }
+ static bool xe_vm_is_idle(struct xe_vm *vm)
+ {
+       struct xe_exec_queue *q;
+       xe_vm_assert_held(vm);
+       list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
+               if (!xe_exec_queue_is_idle(q))
+                       return false;
+       }
+       return true;
+ }
+ static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
+ {
+       struct list_head *link;
+       struct xe_exec_queue *q;
+       list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
+               struct dma_fence *fence;
+               link = list->next;
+               xe_assert(vm->xe, link != list);
+               fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
+                                            q, q->compute.context,
+                                            ++q->compute.seqno);
+               dma_fence_put(q->compute.pfence);
+               q->compute.pfence = fence;
+       }
+ }
+ static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
+ {
+       struct xe_exec_queue *q;
+       int err;
+       err = xe_bo_lock(bo, true);
+       if (err)
+               return err;
+       err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
+       if (err)
+               goto out_unlock;
+       list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
+               if (q->compute.pfence) {
+                       dma_resv_add_fence(bo->ttm.base.resv,
+                                          q->compute.pfence,
+                                          DMA_RESV_USAGE_BOOKKEEP);
+               }
+ out_unlock:
+       xe_bo_unlock(bo);
+       return err;
+ }
+ static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
+                                               struct drm_exec *exec)
+ {
+       struct xe_exec_queue *q;
+       lockdep_assert_held(&vm->lock);
+       xe_vm_assert_held(vm);
+       list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
+               q->ops->resume(q);
+               drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->compute.pfence,
+                                        DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
+       }
+ }
+ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
+ {
+       struct drm_gpuvm_exec vm_exec = {
+               .vm = &vm->gpuvm,
+               .flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
+               .num_fences = 1,
+       };
+       struct drm_exec *exec = &vm_exec.exec;
+       struct dma_fence *pfence;
+       int err;
+       bool wait;
+       xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
+       down_write(&vm->lock);
+       err = drm_gpuvm_exec_lock(&vm_exec);
+       if (err)
+               return err;
+       pfence = xe_preempt_fence_create(q, q->compute.context,
+                                        ++q->compute.seqno);
+       if (!pfence) {
+               err = -ENOMEM;
+               goto out_unlock;
+       }
+       list_add(&q->compute.link, &vm->preempt.exec_queues);
+       ++vm->preempt.num_exec_queues;
+       q->compute.pfence = pfence;
+       down_read(&vm->userptr.notifier_lock);
+       drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
+                                DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
+       /*
+        * Check to see if a preemption on the VM or a userptr invalidation is
+        * in flight; if so, trigger this preempt fence to sync state with
+        * other preempt fences on the VM.
+        */
+       wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
+       if (wait)
+               dma_fence_enable_sw_signaling(pfence);
+       up_read(&vm->userptr.notifier_lock);
+ out_unlock:
+       drm_exec_fini(exec);
+       up_write(&vm->lock);
+       return err;
+ }
+ /**
+  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
+  * @vm: The VM.
+  * @q: The exec_queue
+  */
+ void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
+ {
+       if (!xe_vm_in_preempt_fence_mode(vm))
+               return;
+       down_write(&vm->lock);
+       list_del(&q->compute.link);
+       --vm->preempt.num_exec_queues;
+       if (q->compute.pfence) {
+               dma_fence_enable_sw_signaling(q->compute.pfence);
+               dma_fence_put(q->compute.pfence);
+               q->compute.pfence = NULL;
+       }
+       up_write(&vm->lock);
+ }
+ /**
+  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
+  * that need repinning.
+  * @vm: The VM.
+  *
+  * This function checks whether the VM has userptrs that need repinning,
+  * and provides a release-type barrier on the userptr.notifier_lock after
+  * checking.
+  *
+  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
+  */
+ int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
+ {
+       lockdep_assert_held_read(&vm->userptr.notifier_lock);
+       return (list_empty(&vm->userptr.repin_list) &&
+               list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
+ }
+ #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
+ static void xe_vm_kill(struct xe_vm *vm)
+ {
+       struct xe_exec_queue *q;
+       lockdep_assert_held(&vm->lock);
+       xe_vm_lock(vm, false);
+       vm->flags |= XE_VM_FLAG_BANNED;
+       trace_xe_vm_kill(vm);
+       list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
+               q->ops->kill(q);
+       xe_vm_unlock(vm);
+       /* TODO: Inform user the VM is banned */
+ }
+ /**
+  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
+  * @exec: The drm_exec object used for locking before validation.
+  * @err: The error returned from ttm_bo_validate().
+  * @end: A ktime_t cookie that should be set to 0 before first use and
+  * that should be reused on subsequent calls.
+  *
+  * With multiple active VMs, under memory pressure, it is possible that
+  * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
+  * Until ttm properly handles locking in such scenarios, the best thing the
+  * driver can do is retry with a timeout. Check if that is necessary, and
+  * if so unlock the drm_exec's objects while keeping the ticket to prepare
+  * for a rerun.
+  *
+  * Return: true if a retry after drm_exec_init() is recommended;
+  * false otherwise.
+  */
+ bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
+ {
+       ktime_t cur;
+       if (err != -ENOMEM)
+               return false;
+       cur = ktime_get();
+       *end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
+       if (!ktime_before(cur, *end))
+               return false;
+       msleep(20);
+       return true;
+ }
+ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
+ {
+       struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
+       struct drm_gpuva *gpuva;
+       int ret;
+       lockdep_assert_held(&vm->lock);
+       drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
+               list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
+                              &vm->rebind_list);
+       ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
+       if (ret)
+               return ret;
+       vm_bo->evicted = false;
+       return 0;
+ }
+ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
+                                bool *done)
+ {
+       int err;
+       /*
+        * 1 fence for each preempt fence plus a fence for each tile from a
+        * possible rebind
+        */
+       err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, vm->preempt.num_exec_queues +
+                                  vm->xe->info.tile_count);
+       if (err)
+               return err;
+       if (xe_vm_is_idle(vm)) {
+               vm->preempt.rebind_deactivated = true;
+               *done = true;
+               return 0;
+       }
+       if (!preempt_fences_waiting(vm)) {
+               *done = true;
+               return 0;
+       }
+       err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, vm->preempt.num_exec_queues);
+       if (err)
+               return err;
+       err = wait_for_existing_preempt_fences(vm);
+       if (err)
+               return err;
+       return drm_gpuvm_validate(&vm->gpuvm, exec);
+ }
+ static void preempt_rebind_work_func(struct work_struct *w)
+ {
+       struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
+       struct drm_exec exec;
+       struct dma_fence *rebind_fence;
+       unsigned int fence_count = 0;
+       LIST_HEAD(preempt_fences);
+       ktime_t end = 0;
+       int err = 0;
+       long wait;
+       int __maybe_unused tries = 0;
+       xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
+       trace_xe_vm_rebind_worker_enter(vm);
+       down_write(&vm->lock);
+       if (xe_vm_is_closed_or_banned(vm)) {
+               up_write(&vm->lock);
+               trace_xe_vm_rebind_worker_exit(vm);
+               return;
+       }
+ retry:
+       if (xe_vm_userptr_check_repin(vm)) {
+               err = xe_vm_userptr_pin(vm);
+               if (err)
+                       goto out_unlock_outer;
+       }
 -      drm_exec_init(&exec, 0);
++      drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+       drm_exec_until_all_locked(&exec) {
+               bool done = false;
+               err = xe_preempt_work_begin(&exec, vm, &done);
+               drm_exec_retry_on_contention(&exec);
+               if (err || done) {
+                       drm_exec_fini(&exec);
+                       if (err && xe_vm_validate_should_retry(&exec, err, &end))
+                               err = -EAGAIN;
+                       goto out_unlock_outer;
+               }
+       }
+       err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
+       if (err)
+               goto out_unlock;
+       rebind_fence = xe_vm_rebind(vm, true);
+       if (IS_ERR(rebind_fence)) {
+               err = PTR_ERR(rebind_fence);
+               goto out_unlock;
+       }
+       if (rebind_fence) {
+               dma_fence_wait(rebind_fence, false);
+               dma_fence_put(rebind_fence);
+       }
+       /* Wait on munmap style VM unbinds */
+       wait = dma_resv_wait_timeout(xe_vm_resv(vm),
+                                    DMA_RESV_USAGE_KERNEL,
+                                    false, MAX_SCHEDULE_TIMEOUT);
+       if (wait <= 0) {
+               err = -ETIME;
+               goto out_unlock;
+       }
+ #define retry_required(__tries, __vm) \
+       (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
+       (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
+       __xe_vm_userptr_needs_repin(__vm))
+       down_read(&vm->userptr.notifier_lock);
+       if (retry_required(tries, vm)) {
+               up_read(&vm->userptr.notifier_lock);
+               err = -EAGAIN;
+               goto out_unlock;
+       }
+ #undef retry_required
+       spin_lock(&vm->xe->ttm.lru_lock);
+       ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
+       spin_unlock(&vm->xe->ttm.lru_lock);
+       /* Point of no return. */
+       arm_preempt_fences(vm, &preempt_fences);
+       resume_and_reinstall_preempt_fences(vm, &exec);
+       up_read(&vm->userptr.notifier_lock);
+ out_unlock:
+       drm_exec_fini(&exec);
+ out_unlock_outer:
+       if (err == -EAGAIN) {
+               trace_xe_vm_rebind_worker_retry(vm);
+               goto retry;
+       }
+       if (err) {
+               drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
+               xe_vm_kill(vm);
+       }
+       up_write(&vm->lock);
+       free_preempt_fences(&preempt_fences);
+       trace_xe_vm_rebind_worker_exit(vm);
+ }
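+ /*
+  * Descriptive summary of the worker above: under the write vm->lock it
+  * 1) repins any invalidated userptrs, 2) locks the VM resv plus all
+  * external BOs with drm_exec and validates whatever was evicted,
+  * 3) allocates new preempt fences, 4) issues rebinds and waits on them,
+  * 5) waits for munmap-style unbinds on the kernel resv, 6) re-checks
+  * userptr invalidation under the notifier lock (a trip restarts the loop
+  * via -EAGAIN), and only then 7) bulk-moves the LRU and arms/reinstalls
+  * the preempt fences as the point of no return.
+  */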
+ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
+                                  const struct mmu_notifier_range *range,
+                                  unsigned long cur_seq)
+ {
+       struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier);
+       struct xe_vm *vm = xe_vma_vm(vma);
+       struct dma_resv_iter cursor;
+       struct dma_fence *fence;
+       long err;
+       xe_assert(vm->xe, xe_vma_is_userptr(vma));
+       trace_xe_vma_userptr_invalidate(vma);
+       if (!mmu_notifier_range_blockable(range))
+               return false;
+       down_write(&vm->userptr.notifier_lock);
+       mmu_interval_set_seq(mni, cur_seq);
+       /* No need to stop gpu access if the userptr is not yet bound. */
+       if (!vma->userptr.initial_bind) {
+               up_write(&vm->userptr.notifier_lock);
+               return true;
+       }
+       /*
+        * Tell exec and rebind worker they need to repin and rebind this
+        * userptr.
+        */
+       if (!xe_vm_in_fault_mode(vm) &&
+           !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
+               spin_lock(&vm->userptr.invalidated_lock);
+               list_move_tail(&vma->userptr.invalidate_link,
+                              &vm->userptr.invalidated);
+               spin_unlock(&vm->userptr.invalidated_lock);
+       }
+       up_write(&vm->userptr.notifier_lock);
+       /*
+        * Preempt fences turn into schedule disables, pipeline these.
+        * Note that even in fault mode, we need to wait for binds and
+        * unbinds to complete, and those are attached as BOOKKEEP fences
+        * to the vm.
+        */
+       dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
+                           DMA_RESV_USAGE_BOOKKEEP);
+       dma_resv_for_each_fence_unlocked(&cursor, fence)
+               dma_fence_enable_sw_signaling(fence);
+       dma_resv_iter_end(&cursor);
+       err = dma_resv_wait_timeout(xe_vm_resv(vm),
+                                   DMA_RESV_USAGE_BOOKKEEP,
+                                   false, MAX_SCHEDULE_TIMEOUT);
+       XE_WARN_ON(err <= 0);
+       if (xe_vm_in_fault_mode(vm)) {
+               err = xe_vm_invalidate_vma(vma);
+               XE_WARN_ON(err);
+       }
+       trace_xe_vma_userptr_invalidate_complete(vma);
+       return true;
+ }
+ static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
+       .invalidate = vma_userptr_invalidate,
+ };
+ int xe_vm_userptr_pin(struct xe_vm *vm)
+ {
+       struct xe_vma *vma, *next;
+       int err = 0;
+       LIST_HEAD(tmp_evict);
+       lockdep_assert_held_write(&vm->lock);
+       /* Collect invalidated userptrs */
+       spin_lock(&vm->userptr.invalidated_lock);
+       list_for_each_entry_safe(vma, next, &vm->userptr.invalidated,
+                                userptr.invalidate_link) {
+               list_del_init(&vma->userptr.invalidate_link);
+               list_move_tail(&vma->combined_links.userptr,
+                              &vm->userptr.repin_list);
+       }
+       spin_unlock(&vm->userptr.invalidated_lock);
+       /* Pin and move to temporary list */
+       list_for_each_entry_safe(vma, next, &vm->userptr.repin_list,
+                                combined_links.userptr) {
+               err = xe_vma_userptr_pin_pages(vma);
+               if (err < 0)
+                       return err;
+               list_move_tail(&vma->combined_links.userptr, &vm->rebind_list);
+       }
+       return 0;
+ }
+ /**
+  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
+  * that need repinning.
+  * @vm: The VM.
+  *
+  * This function does an advisory check for whether the VM has userptrs that
+  * need repinning.
+  *
+  * Return: 0 if there are no indications of userptrs needing repinning,
+  * -EAGAIN if there are.
+  */
+ int xe_vm_userptr_check_repin(struct xe_vm *vm)
+ {
+       return (list_empty_careful(&vm->userptr.repin_list) &&
+               list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
+ }
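+ /*
+  * Minimal usage sketch, mirroring the rebind worker above: the advisory
+  * check is cheap and lock-free, the pin itself needs the write vm->lock.
+  *
+  *	if (xe_vm_userptr_check_repin(vm)) {
+  *		err = xe_vm_userptr_pin(vm);
+  *		if (err)
+  *			return err;
+  *	}
+  *	// lock and validate, then re-check __xe_vm_userptr_needs_repin()
+  *	// under vm->userptr.notifier_lock before committing; -EAGAIN retries.
+  */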
+ static struct dma_fence *
+ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
+              struct xe_sync_entry *syncs, u32 num_syncs,
+              bool first_op, bool last_op);
+ struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
+ {
+       struct dma_fence *fence = NULL;
+       struct xe_vma *vma, *next;
+       lockdep_assert_held(&vm->lock);
+       if (xe_vm_in_lr_mode(vm) && !rebind_worker)
+               return NULL;
+       xe_vm_assert_held(vm);
+       list_for_each_entry_safe(vma, next, &vm->rebind_list,
+                                combined_links.rebind) {
+               xe_assert(vm->xe, vma->tile_present);
+               list_del_init(&vma->combined_links.rebind);
+               dma_fence_put(fence);
+               if (rebind_worker)
+                       trace_xe_vma_rebind_worker(vma);
+               else
+                       trace_xe_vma_rebind_exec(vma);
+               fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
+               if (IS_ERR(fence))
+                       return fence;
+       }
+       return fence;
+ }
+ #define VMA_CREATE_FLAG_READ_ONLY     BIT(0)
+ #define VMA_CREATE_FLAG_IS_NULL               BIT(1)
+ static struct xe_vma *xe_vma_create(struct xe_vm *vm,
+                                   struct xe_bo *bo,
+                                   u64 bo_offset_or_userptr,
+                                   u64 start, u64 end,
+                                   u16 pat_index, unsigned int flags)
+ {
+       struct xe_vma *vma;
+       struct xe_tile *tile;
+       u8 id;
+       bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
+       bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
+       xe_assert(vm->xe, start < end);
+       xe_assert(vm->xe, end < vm->size);
+       if (!bo && !is_null)    /* userptr */
+               vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+       else
+               vma = kzalloc(sizeof(*vma) - sizeof(struct xe_userptr),
+                             GFP_KERNEL);
+       if (!vma) {
+               vma = ERR_PTR(-ENOMEM);
+               return vma;
+       }
+       INIT_LIST_HEAD(&vma->combined_links.rebind);
+       INIT_LIST_HEAD(&vma->gpuva.gem.entry);
+       vma->gpuva.vm = &vm->gpuvm;
+       vma->gpuva.va.addr = start;
+       vma->gpuva.va.range = end - start + 1;
+       if (read_only)
+               vma->gpuva.flags |= XE_VMA_READ_ONLY;
+       if (is_null)
+               vma->gpuva.flags |= DRM_GPUVA_SPARSE;
+       for_each_tile(tile, vm->xe, id)
+               vma->tile_mask |= 0x1 << id;
+       if (GRAPHICS_VER(vm->xe) >= 20 || vm->xe->info.platform == XE_PVC)
+               vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
+       vma->pat_index = pat_index;
+       if (bo) {
+               struct drm_gpuvm_bo *vm_bo;
+               xe_bo_assert_held(bo);
+               vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
+               if (IS_ERR(vm_bo)) {
+                       kfree(vma);
+                       return ERR_CAST(vm_bo);
+               }
+               drm_gpuvm_bo_extobj_add(vm_bo);
+               drm_gem_object_get(&bo->ttm.base);
+               vma->gpuva.gem.obj = &bo->ttm.base;
+               vma->gpuva.gem.offset = bo_offset_or_userptr;
+               drm_gpuva_link(&vma->gpuva, vm_bo);
+               drm_gpuvm_bo_put(vm_bo);
+       } else /* userptr or null */ {
+               if (!is_null) {
+                       u64 size = end - start + 1;
+                       int err;
+                       INIT_LIST_HEAD(&vma->userptr.invalidate_link);
+                       vma->gpuva.gem.offset = bo_offset_or_userptr;
+                       err = mmu_interval_notifier_insert(&vma->userptr.notifier,
+                                                          current->mm,
+                                                          xe_vma_userptr(vma), size,
+                                                          &vma_userptr_notifier_ops);
+                       if (err) {
+                               kfree(vma);
+                               vma = ERR_PTR(err);
+                               return vma;
+                       }
+                       vma->userptr.notifier_seq = LONG_MAX;
+               }
+               xe_vm_get(vm);
+       }
+       return vma;
+ }
+ static void xe_vma_destroy_late(struct xe_vma *vma)
+ {
+       struct xe_vm *vm = xe_vma_vm(vma);
+       struct xe_device *xe = vm->xe;
+       bool read_only = xe_vma_read_only(vma);
+       if (xe_vma_is_userptr(vma)) {
+               if (vma->userptr.sg) {
+                       dma_unmap_sgtable(xe->drm.dev,
+                                         vma->userptr.sg,
+                                         read_only ? DMA_TO_DEVICE :
+                                         DMA_BIDIRECTIONAL, 0);
+                       sg_free_table(vma->userptr.sg);
+                       vma->userptr.sg = NULL;
+               }
+               /*
+                * Since userptr pages are not pinned, we can't remove
+                * the notifier until we're sure the GPU is not accessing
+                * them anymore
+                */
+               mmu_interval_notifier_remove(&vma->userptr.notifier);
+               xe_vm_put(vm);
+       } else if (xe_vma_is_null(vma)) {
+               xe_vm_put(vm);
+       } else {
+               xe_bo_put(xe_vma_bo(vma));
+       }
+       kfree(vma);
+ }
+ static void vma_destroy_work_func(struct work_struct *w)
+ {
+       struct xe_vma *vma =
+               container_of(w, struct xe_vma, destroy_work);
+       xe_vma_destroy_late(vma);
+ }
+ static void vma_destroy_cb(struct dma_fence *fence,
+                          struct dma_fence_cb *cb)
+ {
+       struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
+       INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
+       queue_work(system_unbound_wq, &vma->destroy_work);
+ }
+ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
+ {
+       struct xe_vm *vm = xe_vma_vm(vma);
+       lockdep_assert_held_write(&vm->lock);
+       xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
+       if (xe_vma_is_userptr(vma)) {
+               xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
+               spin_lock(&vm->userptr.invalidated_lock);
+               list_del(&vma->userptr.invalidate_link);
+               spin_unlock(&vm->userptr.invalidated_lock);
+       } else if (!xe_vma_is_null(vma)) {
+               xe_bo_assert_held(xe_vma_bo(vma));
+               drm_gpuva_unlink(&vma->gpuva);
+       }
+       xe_vm_assert_held(vm);
+       if (fence) {
+               int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
+                                                vma_destroy_cb);
+               if (ret) {
+                       XE_WARN_ON(ret != -ENOENT);
+                       xe_vma_destroy_late(vma);
+               }
+       } else {
+               xe_vma_destroy_late(vma);
+       }
+ }
+ /**
+  * xe_vm_prepare_vma() - drm_exec utility to lock a vma
+  * @exec: The drm_exec object we're currently locking for.
+  * @vma: The vma for which we want to lock the vm resv and any attached
+  * object's resv.
+  * @num_shared: The number of dma-fence slots to pre-allocate in the
+  * objects' reservation objects.
+  *
+  * Return: 0 on success, negative error code on error. In particular
+  * may return -EDEADLK on WW transaction contention and -EINTR if
+  * an interruptible wait is terminated by a signal.
+  */
+ int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma,
+                     unsigned int num_shared)
+ {
+       struct xe_vm *vm = xe_vma_vm(vma);
+       struct xe_bo *bo = xe_vma_bo(vma);
+       int err;
+       XE_WARN_ON(!vm);
+       err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
+       if (!err && bo && !bo->vm)
+               err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared);
+       return err;
+ }
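+ /*
+  * Typical caller pattern, sketched (xe_vma_destroy_unlocked() below is a
+  * real instance; num_fences here is a placeholder for however many fence
+  * slots the caller needs):
+  *
+  *	struct drm_exec exec;
+  *	int err;
+  *
+  *	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+  *	drm_exec_until_all_locked(&exec) {
+  *		err = xe_vm_prepare_vma(&exec, vma, num_fences);
+  *		drm_exec_retry_on_contention(&exec);
+  *		if (err)
+  *			break;
+  *	}
+  *	// ... operate on the VMA under the locked resvs ...
+  *	drm_exec_fini(&exec);
+  */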
+ static void xe_vma_destroy_unlocked(struct xe_vma *vma)
+ {
+       struct drm_exec exec;
+       int err;
 -              drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
++      drm_exec_init(&exec, 0, 0);
+       drm_exec_until_all_locked(&exec) {
+               err = xe_vm_prepare_vma(&exec, vma, 0);
+               drm_exec_retry_on_contention(&exec);
+               if (XE_WARN_ON(err))
+                       break;
+       }
+       xe_vma_destroy(vma, NULL);
+       drm_exec_fini(&exec);
+ }
+ struct xe_vma *
+ xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
+ {
+       struct drm_gpuva *gpuva;
+       lockdep_assert_held(&vm->lock);
+       if (xe_vm_is_closed_or_banned(vm))
+               return NULL;
+       xe_assert(vm->xe, start + range <= vm->size);
+       gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
+       return gpuva ? gpuva_to_vma(gpuva) : NULL;
+ }
+ static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
+ {
+       int err;
+       xe_assert(vm->xe, xe_vma_vm(vma) == vm);
+       lockdep_assert_held(&vm->lock);
+       err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
+       XE_WARN_ON(err);        /* Shouldn't be possible */
+       return err;
+ }
+ static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
+ {
+       xe_assert(vm->xe, xe_vma_vm(vma) == vm);
+       lockdep_assert_held(&vm->lock);
+       drm_gpuva_remove(&vma->gpuva);
+       if (vm->usm.last_fault_vma == vma)
+               vm->usm.last_fault_vma = NULL;
+ }
+ static struct drm_gpuva_op *xe_vm_op_alloc(void)
+ {
+       struct xe_vma_op *op;
+       op = kzalloc(sizeof(*op), GFP_KERNEL);
+       if (unlikely(!op))
+               return NULL;
+       return &op->base;
+ }
+ static void xe_vm_free(struct drm_gpuvm *gpuvm);
+ static struct drm_gpuvm_ops gpuvm_ops = {
+       .op_alloc = xe_vm_op_alloc,
+       .vm_bo_validate = xe_gpuvm_validate,
+       .vm_free = xe_vm_free,
+ };
+ static u64 pde_encode_pat_index(struct xe_device *xe, u16 pat_index)
+ {
+       u64 pte = 0;
+       if (pat_index & BIT(0))
+               pte |= XE_PPGTT_PTE_PAT0;
+       if (pat_index & BIT(1))
+               pte |= XE_PPGTT_PTE_PAT1;
+       return pte;
+ }
+ static u64 pte_encode_pat_index(struct xe_device *xe, u16 pat_index,
+                               u32 pt_level)
+ {
+       u64 pte = 0;
+       if (pat_index & BIT(0))
+               pte |= XE_PPGTT_PTE_PAT0;
+       if (pat_index & BIT(1))
+               pte |= XE_PPGTT_PTE_PAT1;
+       if (pat_index & BIT(2)) {
+               if (pt_level)
+                       pte |= XE_PPGTT_PDE_PDPE_PAT2;
+               else
+                       pte |= XE_PPGTT_PTE_PAT2;
+       }
+       if (pat_index & BIT(3))
+               pte |= XELPG_PPGTT_PTE_PAT3;
+       if (pat_index & (BIT(4)))
+               pte |= XE2_PPGTT_PTE_PAT4;
+       return pte;
+ }
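+ /*
+  * Worked example of the PAT bit packing above (value chosen for
+  * illustration): pat_index == 0x6 (0b00110) at pt_level 0 yields
+  * XE_PPGTT_PTE_PAT1 | XE_PPGTT_PTE_PAT2 with PAT0/PAT3/PAT4 clear;
+  * at pt_level > 0, bit 2 selects XE_PPGTT_PDE_PDPE_PAT2 instead.
+  */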
+ static u64 pte_encode_ps(u32 pt_level)
+ {
+       XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
+       if (pt_level == 1)
+               return XE_PDE_PS_2M;
+       else if (pt_level == 2)
+               return XE_PDPE_PS_1G;
+       return 0;
+ }
+ static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
+                             const u16 pat_index)
+ {
+       struct xe_device *xe = xe_bo_device(bo);
+       u64 pde;
+       pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
+       pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
+       pde |= pde_encode_pat_index(xe, pat_index);
+       return pde;
+ }
+ static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
+                             u16 pat_index, u32 pt_level)
+ {
+       struct xe_device *xe = xe_bo_device(bo);
+       u64 pte;
+       pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
+       pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
+       pte |= pte_encode_pat_index(xe, pat_index, pt_level);
+       pte |= pte_encode_ps(pt_level);
+       if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
+               pte |= XE_PPGTT_PTE_DM;
+       return pte;
+ }
+ static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
+                              u16 pat_index, u32 pt_level)
+ {
+       struct xe_device *xe = xe_vma_vm(vma)->xe;
+       pte |= XE_PAGE_PRESENT;
+       if (likely(!xe_vma_read_only(vma)))
+               pte |= XE_PAGE_RW;
+       pte |= pte_encode_pat_index(xe, pat_index, pt_level);
+       pte |= pte_encode_ps(pt_level);
+       if (unlikely(xe_vma_is_null(vma)))
+               pte |= XE_PTE_NULL;
+       return pte;
+ }
+ static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
+                               u16 pat_index,
+                               u32 pt_level, bool devmem, u64 flags)
+ {
+       u64 pte;
+       /* Avoid passing random bits directly as flags */
+       xe_assert(xe, !(flags & ~XE_PTE_PS64));
+       pte = addr;
+       pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
+       pte |= pte_encode_pat_index(xe, pat_index, pt_level);
+       pte |= pte_encode_ps(pt_level);
+       if (devmem)
+               pte |= XE_PPGTT_PTE_DM;
+       pte |= flags;
+       return pte;
+ }
+ static const struct xe_pt_ops xelp_pt_ops = {
+       .pte_encode_bo = xelp_pte_encode_bo,
+       .pte_encode_vma = xelp_pte_encode_vma,
+       .pte_encode_addr = xelp_pte_encode_addr,
+       .pde_encode_bo = xelp_pde_encode_bo,
+ };
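+ /*
+  * Putting the helpers together, sketched for a hypothetical 2M leaf entry
+  * (pt_level == 1) of a VRAM-resident BO:
+  *
+  *	pte = xe_bo_addr(bo, offset, XE_PAGE_SIZE)
+  *	    | XE_PAGE_PRESENT | XE_PAGE_RW
+  *	    | pte_encode_pat_index(xe, pat_index, 1)
+  *	    | XE_PDE_PS_2M              // pte_encode_ps(1)
+  *	    | XE_PPGTT_PTE_DM;          // set because the BO is in VRAM
+  *
+  * which is what xelp_pte_encode_bo() returns for that case.
+  */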
+ static void vm_destroy_work_func(struct work_struct *w);
+ /**
+  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
+  * given tile and vm.
+  * @xe: xe device.
+  * @tile: tile to set up for.
+  * @vm: vm to set up for.
+  *
+  * Sets up a pagetable tree with one page-table per level and a single
+  * leaf PTE. All pagetable entries point to the single page-table or,
+  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
+  * writes become NOPs.
+  *
+  * Return: 0 on success, negative error code on error.
+  */
+ static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
+                               struct xe_vm *vm)
+ {
+       u8 id = tile->id;
+       int i;
+       for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
+               vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
+               if (IS_ERR(vm->scratch_pt[id][i]))
+                       return PTR_ERR(vm->scratch_pt[id][i]);
+               xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
+       }
+       return 0;
+ }
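+ /*
+  * Shape of the resulting tree, sketched: the root page-table's entries all
+  * point at scratch_pt[root_level - 1], each of whose entries points one
+  * level further down, until scratch_pt[MAX_HUGEPTE_LEVEL], whose entries
+  * are NULL huge PTEs:
+  *
+  *	pt_root -> scratch_pt[n-1] -> ... -> scratch_pt[MAX_HUGEPTE_LEVEL]
+  *	                                     (reads return 0, writes NOP)
+  *
+  * so unmapped ranges resolve without faulting whenever scratch is enabled.
+  */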
+ static void xe_vm_free_scratch(struct xe_vm *vm)
+ {
+       struct xe_tile *tile;
+       u8 id;
+       if (!xe_vm_has_scratch(vm))
+               return;
+       for_each_tile(tile, vm->xe, id) {
+               u32 i;
+               if (!vm->pt_root[id])
+                       continue;
+               for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
+                       if (vm->scratch_pt[id][i])
+                               xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
+       }
+ }
+ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
+ {
+       struct drm_gem_object *vm_resv_obj;
+       struct xe_vm *vm;
+       int err, number_tiles = 0;
+       struct xe_tile *tile;
+       u8 id;
+       vm = kzalloc(sizeof(*vm), GFP_KERNEL);
+       if (!vm)
+               return ERR_PTR(-ENOMEM);
+       vm->xe = xe;
+       vm->size = 1ull << xe->info.va_bits;
+       vm->flags = flags;
+       init_rwsem(&vm->lock);
+       INIT_LIST_HEAD(&vm->rebind_list);
+       INIT_LIST_HEAD(&vm->userptr.repin_list);
+       INIT_LIST_HEAD(&vm->userptr.invalidated);
+       init_rwsem(&vm->userptr.notifier_lock);
+       spin_lock_init(&vm->userptr.invalidated_lock);
+       INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
+       INIT_LIST_HEAD(&vm->preempt.exec_queues);
+       vm->preempt.min_run_period_ms = 10;     /* FIXME: Wire up to uAPI */
+       for_each_tile(tile, xe, id)
+               xe_range_fence_tree_init(&vm->rftree[id]);
+       vm->pt_ops = &xelp_pt_ops;
+       if (!(flags & XE_VM_FLAG_MIGRATION))
+               xe_device_mem_access_get(xe);
+       vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
+       if (!vm_resv_obj) {
+               err = -ENOMEM;
+               goto err_no_resv;
+       }
+       drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
+                      vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
+       drm_gem_object_put(vm_resv_obj);
+       err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
+       if (err)
+               goto err_close;
+       if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
+               vm->flags |= XE_VM_FLAG_64K;
+       for_each_tile(tile, xe, id) {
+               if (flags & XE_VM_FLAG_MIGRATION &&
+                   tile->id != XE_VM_FLAG_TILE_ID(flags))
+                       continue;
+               vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
+               if (IS_ERR(vm->pt_root[id])) {
+                       err = PTR_ERR(vm->pt_root[id]);
+                       vm->pt_root[id] = NULL;
+                       goto err_unlock_close;
+               }
+       }
+       if (xe_vm_has_scratch(vm)) {
+               for_each_tile(tile, xe, id) {
+                       if (!vm->pt_root[id])
+                               continue;
+                       err = xe_vm_create_scratch(xe, tile, vm);
+                       if (err)
+                               goto err_unlock_close;
+               }
+               vm->batch_invalidate_tlb = true;
+       }
+       if (flags & XE_VM_FLAG_LR_MODE) {
+               INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
+               vm->flags |= XE_VM_FLAG_LR_MODE;
+               vm->batch_invalidate_tlb = false;
+       }
+       /* Fill pt_root after allocating scratch tables */
+       for_each_tile(tile, xe, id) {
+               if (!vm->pt_root[id])
+                       continue;
+               xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
+       }
+       dma_resv_unlock(xe_vm_resv(vm));
+       /* Kernel migration VM shouldn't have a circular loop. */
+       if (!(flags & XE_VM_FLAG_MIGRATION)) {
+               for_each_tile(tile, xe, id) {
+                       struct xe_gt *gt = tile->primary_gt;
+                       struct xe_vm *migrate_vm;
+                       struct xe_exec_queue *q;
+                       u32 create_flags = EXEC_QUEUE_FLAG_VM;
+                       if (!vm->pt_root[id])
+                               continue;
+                       migrate_vm = xe_migrate_get_vm(tile->migrate);
+                       q = xe_exec_queue_create_class(xe, gt, migrate_vm,
+                                                      XE_ENGINE_CLASS_COPY,
+                                                      create_flags);
+                       xe_vm_put(migrate_vm);
+                       if (IS_ERR(q)) {
+                               err = PTR_ERR(q);
+                               goto err_close;
+                       }
+                       vm->q[id] = q;
+                       number_tiles++;
+               }
+       }
+       if (number_tiles > 1)
+               vm->composite_fence_ctx = dma_fence_context_alloc(1);
+       mutex_lock(&xe->usm.lock);
+       if (flags & XE_VM_FLAG_FAULT_MODE)
+               xe->usm.num_vm_in_fault_mode++;
+       else if (!(flags & XE_VM_FLAG_MIGRATION))
+               xe->usm.num_vm_in_non_fault_mode++;
+       mutex_unlock(&xe->usm.lock);
+       trace_xe_vm_create(vm);
+       return vm;
+ err_unlock_close:
+       dma_resv_unlock(xe_vm_resv(vm));
+ err_close:
+       xe_vm_close_and_put(vm);
+       return ERR_PTR(err);
+ err_no_resv:
+       for_each_tile(tile, xe, id)
+               xe_range_fence_tree_fini(&vm->rftree[id]);
+       kfree(vm);
+       if (!(flags & XE_VM_FLAG_MIGRATION))
+               xe_device_mem_access_put(xe);
+       return ERR_PTR(err);
+ }
+ static void xe_vm_close(struct xe_vm *vm)
+ {
+       down_write(&vm->lock);
+       vm->size = 0;
+       up_write(&vm->lock);
+ }
+ void xe_vm_close_and_put(struct xe_vm *vm)
+ {
+       LIST_HEAD(contested);
+       struct xe_device *xe = vm->xe;
+       struct xe_tile *tile;
+       struct xe_vma *vma, *next_vma;
+       struct drm_gpuva *gpuva, *next;
+       u8 id;
+       xe_assert(xe, !vm->preempt.num_exec_queues);
+       xe_vm_close(vm);
+       if (xe_vm_in_preempt_fence_mode(vm))
+               flush_work(&vm->preempt.rebind_work);
+       down_write(&vm->lock);
+       for_each_tile(tile, xe, id) {
+               if (vm->q[id])
+                       xe_exec_queue_last_fence_put(vm->q[id], vm);
+       }
+       up_write(&vm->lock);
+       for_each_tile(tile, xe, id) {
+               if (vm->q[id]) {
+                       xe_exec_queue_kill(vm->q[id]);
+                       xe_exec_queue_put(vm->q[id]);
+                       vm->q[id] = NULL;
+               }
+       }
+       down_write(&vm->lock);
+       xe_vm_lock(vm, false);
+       drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
+               vma = gpuva_to_vma(gpuva);
+               if (xe_vma_has_no_bo(vma)) {
+                       down_read(&vm->userptr.notifier_lock);
+                       vma->gpuva.flags |= XE_VMA_DESTROYED;
+                       up_read(&vm->userptr.notifier_lock);
+               }
+               xe_vm_remove_vma(vm, vma);
+               /* easy case, remove from VMA? */
+               if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
+                       list_del_init(&vma->combined_links.rebind);
+                       xe_vma_destroy(vma, NULL);
+                       continue;
+               }
+               list_move_tail(&vma->combined_links.destroy, &contested);
+               vma->gpuva.flags |= XE_VMA_DESTROYED;
+       }
+       /*
+        * All vm operations will add shared fences to resv.
+        * The only exception is eviction for a shared object,
+        * but even so, the unbind when evicted would still
+        * install a fence to resv. Hence it's safe to
+        * destroy the pagetables immediately.
+        */
+       xe_vm_free_scratch(vm);
+       for_each_tile(tile, xe, id) {
+               if (vm->pt_root[id]) {
+                       xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
+                       vm->pt_root[id] = NULL;
+               }
+       }
+       xe_vm_unlock(vm);
+       /*
+        * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
+        * Since we hold a refcount to the bo, we can remove and free
+        * the members safely without locking.
+        */
+       list_for_each_entry_safe(vma, next_vma, &contested,
+                                combined_links.destroy) {
+               list_del_init(&vma->combined_links.destroy);
+               xe_vma_destroy_unlocked(vma);
+       }
+       up_write(&vm->lock);
+       mutex_lock(&xe->usm.lock);
+       if (vm->flags & XE_VM_FLAG_FAULT_MODE)
+               xe->usm.num_vm_in_fault_mode--;
+       else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
+               xe->usm.num_vm_in_non_fault_mode--;
+       mutex_unlock(&xe->usm.lock);
+       for_each_tile(tile, xe, id)
+               xe_range_fence_tree_fini(&vm->rftree[id]);
+       xe_vm_put(vm);
+ }
+ static void vm_destroy_work_func(struct work_struct *w)
+ {
+       struct xe_vm *vm =
+               container_of(w, struct xe_vm, destroy_work);
+       struct xe_device *xe = vm->xe;
+       struct xe_tile *tile;
+       u8 id;
+       void *lookup;
+       /* xe_vm_close_and_put was not called? */
+       xe_assert(xe, !vm->size);
+       if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
+               xe_device_mem_access_put(xe);
+               if (xe->info.has_asid && vm->usm.asid) {
+                       mutex_lock(&xe->usm.lock);
+                       lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
+                       xe_assert(xe, lookup == vm);
+                       mutex_unlock(&xe->usm.lock);
+               }
+       }
+       for_each_tile(tile, xe, id)
+               XE_WARN_ON(vm->pt_root[id]);
+       trace_xe_vm_free(vm);
+       dma_fence_put(vm->rebind_fence);
+       kfree(vm);
+ }
+ static void xe_vm_free(struct drm_gpuvm *gpuvm)
+ {
+       struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
+       /* To destroy the VM we need to be able to sleep */
+       queue_work(system_unbound_wq, &vm->destroy_work);
+ }
+ struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
+ {
+       struct xe_vm *vm;
+       mutex_lock(&xef->vm.lock);
+       vm = xa_load(&xef->vm.xa, id);
+       if (vm)
+               xe_vm_get(vm);
+       mutex_unlock(&xef->vm.lock);
+       return vm;
+ }
+ u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
+ {
+       return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
+                                        tile_to_xe(tile)->pat.idx[XE_CACHE_WB]);
+ }
+ static struct xe_exec_queue *
+ to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
+ {
+       return q ? q : vm->q[0];
+ }
+ static struct dma_fence *
+ xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
+                struct xe_sync_entry *syncs, u32 num_syncs,
+                bool first_op, bool last_op)
+ {
+       struct xe_vm *vm = xe_vma_vm(vma);
+       struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
+       struct xe_tile *tile;
+       struct dma_fence *fence = NULL;
+       struct dma_fence **fences = NULL;
+       struct dma_fence_array *cf = NULL;
+       int cur_fence = 0, i;
+       int number_tiles = hweight8(vma->tile_present);
+       int err;
+       u8 id;
+       trace_xe_vma_unbind(vma);
+       if (number_tiles > 1) {
+               fences = kmalloc_array(number_tiles, sizeof(*fences),
+                                      GFP_KERNEL);
+               if (!fences)
+                       return ERR_PTR(-ENOMEM);
+       }
+       for_each_tile(tile, vm->xe, id) {
+               if (!(vma->tile_present & BIT(id)))
+                       goto next;
+               fence = __xe_pt_unbind_vma(tile, vma, q ? q : vm->q[id],
+                                          first_op ? syncs : NULL,
+                                          first_op ? num_syncs : 0);
+               if (IS_ERR(fence)) {
+                       err = PTR_ERR(fence);
+                       goto err_fences;
+               }
+               if (fences)
+                       fences[cur_fence++] = fence;
+ next:
+               if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
+                       q = list_next_entry(q, multi_gt_list);
+       }
+       if (fences) {
+               cf = dma_fence_array_create(number_tiles, fences,
+                                           vm->composite_fence_ctx,
+                                           vm->composite_fence_seqno++,
+                                           false);
+               if (!cf) {
+                       --vm->composite_fence_seqno;
+                       err = -ENOMEM;
+                       goto err_fences;
+               }
+       }
+       fence = cf ? &cf->base : !fence ?
+               xe_exec_queue_last_fence_get(wait_exec_queue, vm) : fence;
+       if (last_op) {
+               for (i = 0; i < num_syncs; i++)
+                       xe_sync_entry_signal(&syncs[i], NULL, fence);
+       }
+       return fence;
+ err_fences:
+       if (fences) {
+               while (cur_fence)
+                       dma_fence_put(fences[--cur_fence]);
+               kfree(fences);
+       }
+       return ERR_PTR(err);
+ }
+ static struct dma_fence *
+ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
+              struct xe_sync_entry *syncs, u32 num_syncs,
+              bool first_op, bool last_op)
+ {
+       struct xe_tile *tile;
+       struct dma_fence *fence;
+       struct dma_fence **fences = NULL;
+       struct dma_fence_array *cf = NULL;
+       struct xe_vm *vm = xe_vma_vm(vma);
+       int cur_fence = 0, i;
+       int number_tiles = hweight8(vma->tile_mask);
+       int err;
+       u8 id;
+       trace_xe_vma_bind(vma);
+       if (number_tiles > 1) {
+               fences = kmalloc_array(number_tiles, sizeof(*fences),
+                                      GFP_KERNEL);
+               if (!fences)
+                       return ERR_PTR(-ENOMEM);
+       }
+       for_each_tile(tile, vm->xe, id) {
+               if (!(vma->tile_mask & BIT(id)))
+                       goto next;
+               fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id],
+                                        first_op ? syncs : NULL,
+                                        first_op ? num_syncs : 0,
+                                        vma->tile_present & BIT(id));
+               if (IS_ERR(fence)) {
+                       err = PTR_ERR(fence);
+                       goto err_fences;
+               }
+               if (fences)
+                       fences[cur_fence++] = fence;
+ next:
+               if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
+                       q = list_next_entry(q, multi_gt_list);
+       }
+       if (fences) {
+               cf = dma_fence_array_create(number_tiles, fences,
+                                           vm->composite_fence_ctx,
+                                           vm->composite_fence_seqno++,
+                                           false);
+               if (!cf) {
+                       --vm->composite_fence_seqno;
+                       err = -ENOMEM;
+                       goto err_fences;
+               }
+       }
+       if (last_op) {
+               for (i = 0; i < num_syncs; i++)
+                       xe_sync_entry_signal(&syncs[i], NULL,
+                                            cf ? &cf->base : fence);
+       }
+       return cf ? &cf->base : fence;
+ err_fences:
+       if (fences) {
+               while (cur_fence)
+                       dma_fence_put(fences[--cur_fence]);
+               kfree(fences);
+       }
+       return ERR_PTR(err);
+ }
+ static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
+                       struct xe_exec_queue *q, struct xe_sync_entry *syncs,
+                       u32 num_syncs, bool immediate, bool first_op,
+                       bool last_op)
+ {
+       struct dma_fence *fence;
+       struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
+       xe_vm_assert_held(vm);
+       if (immediate) {
+               fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op,
+                                      last_op);
+               if (IS_ERR(fence))
+                       return PTR_ERR(fence);
+       } else {
+               int i;
+               xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
+               fence = xe_exec_queue_last_fence_get(wait_exec_queue, vm);
+               if (last_op) {
+                       for (i = 0; i < num_syncs; i++)
+                               xe_sync_entry_signal(&syncs[i], NULL, fence);
+               }
+       }
+       if (last_op)
+               xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
+       dma_fence_put(fence);
+       return 0;
+ }
+ static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
+                     struct xe_bo *bo, struct xe_sync_entry *syncs,
+                     u32 num_syncs, bool immediate, bool first_op,
+                     bool last_op)
+ {
+       int err;
+       xe_vm_assert_held(vm);
+       xe_bo_assert_held(bo);
+       if (bo && immediate) {
+               err = xe_bo_validate(bo, vm, true);
+               if (err)
+                       return err;
+       }
+       return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
+                           last_op);
+ }
+ static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
+                       struct xe_exec_queue *q, struct xe_sync_entry *syncs,
+                       u32 num_syncs, bool first_op, bool last_op)
+ {
+       struct dma_fence *fence;
+       struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
+       xe_vm_assert_held(vm);
+       xe_bo_assert_held(xe_vma_bo(vma));
+       fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, first_op, last_op);
+       if (IS_ERR(fence))
+               return PTR_ERR(fence);
+       xe_vma_destroy(vma, fence);
+       if (last_op)
+               xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
+       dma_fence_put(fence);
+       return 0;
+ }
+ #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
+                                   DRM_XE_VM_CREATE_FLAG_LR_MODE | \
+                                   DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
+ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
+                      struct drm_file *file)
+ {
+       struct xe_device *xe = to_xe_device(dev);
+       struct xe_file *xef = to_xe_file(file);
+       struct drm_xe_vm_create *args = data;
+       struct xe_tile *tile;
+       struct xe_vm *vm;
+       u32 id, asid;
+       int err;
+       u32 flags = 0;
+       if (XE_IOCTL_DBG(xe, args->extensions))
+               return -EINVAL;
+       if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
+               args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
+       if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
+                        !xe->info.has_usm))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
+                        args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
+                        args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
+                        xe_device_in_non_fault_mode(xe)))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) &&
+                        xe_device_in_fault_mode(xe)))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, args->extensions))
+               return -EINVAL;
+       if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
+               flags |= XE_VM_FLAG_SCRATCH_PAGE;
+       if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
+               flags |= XE_VM_FLAG_LR_MODE;
+       if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
+               flags |= XE_VM_FLAG_FAULT_MODE;
+       vm = xe_vm_create(xe, flags);
+       if (IS_ERR(vm))
+               return PTR_ERR(vm);
+       mutex_lock(&xef->vm.lock);
+       err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
+       mutex_unlock(&xef->vm.lock);
+       if (err) {
+               xe_vm_close_and_put(vm);
+               return err;
+       }
+       if (xe->info.has_asid) {
+               mutex_lock(&xe->usm.lock);
+               err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
+                                     XA_LIMIT(1, XE_MAX_ASID - 1),
+                                     &xe->usm.next_asid, GFP_KERNEL);
+               mutex_unlock(&xe->usm.lock);
+               if (err < 0) {
+                       xe_vm_close_and_put(vm);
+                       return err;
+               }
+               err = 0;
+               vm->usm.asid = asid;
+       }
+       args->vm_id = id;
+       vm->xef = xef;
+       /* Record BO memory for VM pagetable created against client */
+       for_each_tile(tile, xe, id)
+               if (vm->pt_root[id])
+                       xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
+ #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
+       /* Warning: Security issue - never enable by default */
+       args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
+ #endif
+       return 0;
+ }
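+ /*
+  * Hypothetical userspace sketch of this ioctl (assumes an open render-node
+  * fd and the uapi definitions from <drm/xe_drm.h>; nothing here is taken
+  * from this file):
+  *
+  *	struct drm_xe_vm_create create = {
+  *		.flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
+  *	};
+  *
+  *	if (ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create))
+  *		return -errno;
+  *
+  *	// create.vm_id now identifies the VM for later bind/exec ioctls and
+  *	// is released through DRM_IOCTL_XE_VM_DESTROY (drm_xe_vm_destroy).
+  */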
+ int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
+                       struct drm_file *file)
+ {
+       struct xe_device *xe = to_xe_device(dev);
+       struct xe_file *xef = to_xe_file(file);
+       struct drm_xe_vm_destroy *args = data;
+       struct xe_vm *vm;
+       int err = 0;
+       if (XE_IOCTL_DBG(xe, args->pad) ||
+           XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
+               return -EINVAL;
+       mutex_lock(&xef->vm.lock);
+       vm = xa_load(&xef->vm.xa, args->vm_id);
+       if (XE_IOCTL_DBG(xe, !vm))
+               err = -ENOENT;
+       else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
+               err = -EBUSY;
+       else
+               xa_erase(&xef->vm.xa, args->vm_id);
+       mutex_unlock(&xef->vm.lock);
+       if (!err)
+               xe_vm_close_and_put(vm);
+       return err;
+ }
+ static const u32 region_to_mem_type[] = {
+       XE_PL_TT,
+       XE_PL_VRAM0,
+       XE_PL_VRAM1,
+ };
+ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
+                         struct xe_exec_queue *q, u32 region,
+                         struct xe_sync_entry *syncs, u32 num_syncs,
+                         bool first_op, bool last_op)
+ {
+       struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
+       int err;
+       xe_assert(vm->xe, region < ARRAY_SIZE(region_to_mem_type));
+       if (!xe_vma_has_no_bo(vma)) {
+               err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
+               if (err)
+                       return err;
+       }
+       if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
+               return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
+                                 true, first_op, last_op);
+       } else {
+               int i;
+               /* Nothing to do, signal fences now */
+               if (last_op) {
+                       for (i = 0; i < num_syncs; i++) {
+                               struct dma_fence *fence =
+                                       xe_exec_queue_last_fence_get(wait_exec_queue, vm);
+                               xe_sync_entry_signal(&syncs[i], NULL, fence);
+                       }
+               }
+               return 0;
+       }
+ }
+ static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
+                            bool post_commit)
+ {
+       down_read(&vm->userptr.notifier_lock);
+       vma->gpuva.flags |= XE_VMA_DESTROYED;
+       up_read(&vm->userptr.notifier_lock);
+       if (post_commit)
+               xe_vm_remove_vma(vm, vma);
+ }
+ #undef ULL
+ #define ULL   unsigned long long
+ #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
+ static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
+ {
+       struct xe_vma *vma;
+       switch (op->op) {
+       case DRM_GPUVA_OP_MAP:
+               vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
+                      (ULL)op->map.va.addr, (ULL)op->map.va.range);
+               break;
+       case DRM_GPUVA_OP_REMAP:
+               vma = gpuva_to_vma(op->remap.unmap->va);
+               vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
+                      (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
+                      op->remap.unmap->keep ? 1 : 0);
+               if (op->remap.prev)
+                       vm_dbg(&xe->drm,
+                              "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
+                              (ULL)op->remap.prev->va.addr,
+                              (ULL)op->remap.prev->va.range);
+               if (op->remap.next)
+                       vm_dbg(&xe->drm,
+                              "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
+                              (ULL)op->remap.next->va.addr,
+                              (ULL)op->remap.next->va.range);
+               break;
+       case DRM_GPUVA_OP_UNMAP:
+               vma = gpuva_to_vma(op->unmap.va);
+               vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
+                      (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
+                      op->unmap.keep ? 1 : 0);
+               break;
+       case DRM_GPUVA_OP_PREFETCH:
+               vma = gpuva_to_vma(op->prefetch.va);
+               vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
+                      (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
+               break;
+       default:
+               drm_warn(&xe->drm, "NOT POSSIBLE");
+       }
+ }
+ #else
+ static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
+ {
+ }
+ #endif
+ /*
+  * Create the operations list from IOCTL arguments and set up operation
+  * fields so that the parse and commit steps are decoupled from the IOCTL
+  * arguments. This step can fail.
+  */
+ static struct drm_gpuva_ops *
+ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
+                        u64 bo_offset_or_userptr, u64 addr, u64 range,
+                        u32 operation, u32 flags,
+                        u32 prefetch_region, u16 pat_index)
+ {
+       struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
+       struct drm_gpuva_ops *ops;
+       struct drm_gpuva_op *__op;
+       struct xe_vma_op *op;
+       struct drm_gpuvm_bo *vm_bo;
+       int err;
+       lockdep_assert_held_write(&vm->lock);
+       vm_dbg(&vm->xe->drm,
+              "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
+              operation, (ULL)addr, (ULL)range,
+              (ULL)bo_offset_or_userptr);
+       switch (operation) {
+       case DRM_XE_VM_BIND_OP_MAP:
+       case DRM_XE_VM_BIND_OP_MAP_USERPTR:
+               ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
+                                                 obj, bo_offset_or_userptr);
+               break;
+       case DRM_XE_VM_BIND_OP_UNMAP:
+               ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
+               break;
+       case DRM_XE_VM_BIND_OP_PREFETCH:
+               ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
+               break;
+       case DRM_XE_VM_BIND_OP_UNMAP_ALL:
+               xe_assert(vm->xe, bo);
+               err = xe_bo_lock(bo, true);
+               if (err)
+                       return ERR_PTR(err);
+               vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj);
+               if (!vm_bo)
+                       break;
+               ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
+               drm_gpuvm_bo_put(vm_bo);
+               xe_bo_unlock(bo);
+               break;
+       default:
+               drm_warn(&vm->xe->drm, "NOT POSSIBLE");
+               ops = ERR_PTR(-EINVAL);
+       }
+       if (IS_ERR(ops))
+               return ops;
+ #ifdef TEST_VM_ASYNC_OPS_ERROR
+       if (operation & FORCE_ASYNC_OP_ERROR) {
+               op = list_first_entry_or_null(&ops->list, struct xe_vma_op,
+                                             base.entry);
+               if (op)
+                       op->inject_error = true;
+       }
+ #endif
+       drm_gpuva_for_each_op(__op, ops) {
+               struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
+               if (__op->op == DRM_GPUVA_OP_MAP) {
+                       op->map.immediate =
+                               flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
+                       op->map.read_only =
+                               flags & DRM_XE_VM_BIND_FLAG_READONLY;
+                       op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
+                       op->map.pat_index = pat_index;
+               } else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
+                       op->prefetch.region = prefetch_region;
+               }
+               print_op(vm->xe, __op);
+       }
+       return ops;
+ }
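+ /*
+  * Illustrative example of the op lists drm_gpuvm hands back (addresses
+  * made up): a DRM_XE_VM_BIND_OP_MAP of 0x40000-0x7ffff over an existing
+  * VMA spanning 0x00000-0xfffff produces
+  *
+  *	DRM_GPUVA_OP_REMAP  - unmaps the old VMA while keeping prev
+  *	                      (0x00000-0x3ffff) and next (0x80000-0xfffff)
+  *	                      remnants
+  *	DRM_GPUVA_OP_MAP    - the new VMA covering 0x40000-0x7ffff
+  *
+  * whereas an UNMAP whose range exactly matches one VMA yields a single
+  * DRM_GPUVA_OP_UNMAP.
+  */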
+ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
+                             u16 pat_index, unsigned int flags)
+ {
+       struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
+       struct drm_exec exec;
+       struct xe_vma *vma;
+       int err;
+       lockdep_assert_held_write(&vm->lock);
+       if (bo) {
 -      drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
++              drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+               drm_exec_until_all_locked(&exec) {
+                       err = 0;
+                       if (!bo->vm) {
+                               err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
+                               drm_exec_retry_on_contention(&exec);
+                       }
+                       if (!err) {
+                               err = drm_exec_lock_obj(&exec, &bo->ttm.base);
+                               drm_exec_retry_on_contention(&exec);
+                       }
+                       if (err) {
+                               drm_exec_fini(&exec);
+                               return ERR_PTR(err);
+                       }
+               }
+       }
+       vma = xe_vma_create(vm, bo, op->gem.offset,
+                           op->va.addr, op->va.addr +
+                           op->va.range - 1, pat_index, flags);
+       if (bo)
+               drm_exec_fini(&exec);
+       if (xe_vma_is_userptr(vma)) {
+               err = xe_vma_userptr_pin_pages(vma);
+               if (err) {
+                       prep_vma_destroy(vm, vma, false);
+                       xe_vma_destroy_unlocked(vma);
+                       return ERR_PTR(err);
+               }
+       } else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
+               err = add_preempt_fences(vm, bo);
+               if (err) {
+                       prep_vma_destroy(vm, vma, false);
+                       xe_vma_destroy_unlocked(vma);
+                       return ERR_PTR(err);
+               }
+       }
+       return vma;
+ }
+ static u64 xe_vma_max_pte_size(struct xe_vma *vma)
+ {
+       if (vma->gpuva.flags & XE_VMA_PTE_1G)
+               return SZ_1G;
+       else if (vma->gpuva.flags & XE_VMA_PTE_2M)
+               return SZ_2M;
+       return SZ_4K;
+ }
+ static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
+ {
+       switch (size) {
+       case SZ_1G:
+               vma->gpuva.flags |= XE_VMA_PTE_1G;
+               break;
+       case SZ_2M:
+               vma->gpuva.flags |= XE_VMA_PTE_2M;
+               break;
+       }
+       return SZ_4K;
+ }
+ static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
+ {
+       int err = 0;
+       lockdep_assert_held_write(&vm->lock);
+       switch (op->base.op) {
+       case DRM_GPUVA_OP_MAP:
+               err |= xe_vm_insert_vma(vm, op->map.vma);
+               if (!err)
+                       op->flags |= XE_VMA_OP_COMMITTED;
+               break;
+       case DRM_GPUVA_OP_REMAP:
+       {
+               u8 tile_present =
+                       gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
+               prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
+                                true);
+               op->flags |= XE_VMA_OP_COMMITTED;
+               if (op->remap.prev) {
+                       err |= xe_vm_insert_vma(vm, op->remap.prev);
+                       if (!err)
+                               op->flags |= XE_VMA_OP_PREV_COMMITTED;
+                       if (!err && op->remap.skip_prev) {
+                               op->remap.prev->tile_present =
+                                       tile_present;
+                               op->remap.prev = NULL;
+                       }
+               }
+               if (op->remap.next) {
+                       err |= xe_vm_insert_vma(vm, op->remap.next);
+                       if (!err)
+                               op->flags |= XE_VMA_OP_NEXT_COMMITTED;
+                       if (!err && op->remap.skip_next) {
+                               op->remap.next->tile_present =
+                                       tile_present;
+                               op->remap.next = NULL;
+                       }
+               }
+               /* Adjust for partial unbind after removing VMA from VM */
+               if (!err) {
+                       op->base.remap.unmap->va->va.addr = op->remap.start;
+                       op->base.remap.unmap->va->va.range = op->remap.range;
+               }
+               break;
+       }
+       case DRM_GPUVA_OP_UNMAP:
+               prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
+               op->flags |= XE_VMA_OP_COMMITTED;
+               break;
+       case DRM_GPUVA_OP_PREFETCH:
+               op->flags |= XE_VMA_OP_COMMITTED;
+               break;
+       default:
+               drm_warn(&vm->xe->drm, "NOT POSSIBLE");
+       }
+       return err;
+ }
+ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
+                                  struct drm_gpuva_ops *ops,
+                                  struct xe_sync_entry *syncs, u32 num_syncs,
+                                  struct list_head *ops_list, bool last)
+ {
+       struct xe_vma_op *last_op = NULL;
+       struct drm_gpuva_op *__op;
+       int err = 0;
+       lockdep_assert_held_write(&vm->lock);
+       drm_gpuva_for_each_op(__op, ops) {
+               struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
+               struct xe_vma *vma;
+               bool first = list_empty(ops_list);
+               unsigned int flags = 0;
+               INIT_LIST_HEAD(&op->link);
+               list_add_tail(&op->link, ops_list);
+               if (first) {
+                       op->flags |= XE_VMA_OP_FIRST;
+                       op->num_syncs = num_syncs;
+                       op->syncs = syncs;
+               }
+               op->q = q;
+               switch (op->base.op) {
+               case DRM_GPUVA_OP_MAP:
+               {
+                       flags |= op->map.read_only ?
+                               VMA_CREATE_FLAG_READ_ONLY : 0;
+                       flags |= op->map.is_null ?
+                               VMA_CREATE_FLAG_IS_NULL : 0;
+                       vma = new_vma(vm, &op->base.map, op->map.pat_index,
+                                     flags);
+                       if (IS_ERR(vma))
+                               return PTR_ERR(vma);
+                       op->map.vma = vma;
+                       break;
+               }
+               case DRM_GPUVA_OP_REMAP:
+               {
+                       struct xe_vma *old =
+                               gpuva_to_vma(op->base.remap.unmap->va);
+                       op->remap.start = xe_vma_start(old);
+                       op->remap.range = xe_vma_size(old);
+                       if (op->base.remap.prev) {
+                               flags |= op->base.remap.unmap->va->flags &
+                                       XE_VMA_READ_ONLY ?
+                                       VMA_CREATE_FLAG_READ_ONLY : 0;
+                               flags |= op->base.remap.unmap->va->flags &
+                                       DRM_GPUVA_SPARSE ?
+                                       VMA_CREATE_FLAG_IS_NULL : 0;
+                               vma = new_vma(vm, op->base.remap.prev,
+                                             old->pat_index, flags);
+                               if (IS_ERR(vma))
+                                       return PTR_ERR(vma);
+                               op->remap.prev = vma;
+                               /*
+                                * Userptr creates a new SG mapping so
+                                * we must also rebind.
+                                */
+                               op->remap.skip_prev = !xe_vma_is_userptr(old) &&
+                                       IS_ALIGNED(xe_vma_end(vma),
+                                                  xe_vma_max_pte_size(old));
+                               if (op->remap.skip_prev) {
+                                       xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
+                                       op->remap.range -=
+                                               xe_vma_end(vma) -
+                                               xe_vma_start(old);
+                                       op->remap.start = xe_vma_end(vma);
+                               }
+                       }
+                       if (op->base.remap.next) {
+                               flags |= op->base.remap.unmap->va->flags &
+                                       XE_VMA_READ_ONLY ?
+                                       VMA_CREATE_FLAG_READ_ONLY : 0;
+                               flags |= op->base.remap.unmap->va->flags &
+                                       DRM_GPUVA_SPARSE ?
+                                       VMA_CREATE_FLAG_IS_NULL : 0;
+                               vma = new_vma(vm, op->base.remap.next,
+                                             old->pat_index, flags);
+                               if (IS_ERR(vma))
+                                       return PTR_ERR(vma);
+                               op->remap.next = vma;
+                               /*
+                                * Userptr creates a new SG mapping so
+                                * we must also rebind.
+                                */
+                               op->remap.skip_next = !xe_vma_is_userptr(old) &&
+                                       IS_ALIGNED(xe_vma_start(vma),
+                                                  xe_vma_max_pte_size(old));
+                               if (op->remap.skip_next) {
+                                       xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
+                                       op->remap.range -=
+                                               xe_vma_end(old) -
+                                               xe_vma_start(vma);
+                               }
+                       }
+                       break;
+               }
+               case DRM_GPUVA_OP_UNMAP:
+               case DRM_GPUVA_OP_PREFETCH:
+                       /* Nothing to do */
+                       break;
+               default:
+                       drm_warn(&vm->xe->drm, "NOT POSSIBLE");
+               }
+               last_op = op;
+               err = xe_vma_op_commit(vm, op);
+               if (err)
+                       return err;
+       }
+       /* FIXME: Unhandled corner case */
+       XE_WARN_ON(!last_op && last && !list_empty(ops_list));
+       if (!last_op)
+               return 0;
+       last_op->ops = ops;
+       if (last) {
+               last_op->flags |= XE_VMA_OP_LAST;
+               last_op->num_syncs = num_syncs;
+               last_op->syncs = syncs;
+       }
+       return 0;
+ }
+ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
+                     struct xe_vma *vma, struct xe_vma_op *op)
+ {
+       int err;
+       lockdep_assert_held_write(&vm->lock);
+       err = xe_vm_prepare_vma(exec, vma, 1);
+       if (err)
+               return err;
+       xe_vm_assert_held(vm);
+       xe_bo_assert_held(xe_vma_bo(vma));
+       switch (op->base.op) {
+       case DRM_GPUVA_OP_MAP:
+               err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
+                                op->syncs, op->num_syncs,
+                                op->map.immediate || !xe_vm_in_fault_mode(vm),
+                                op->flags & XE_VMA_OP_FIRST,
+                                op->flags & XE_VMA_OP_LAST);
+               break;
+       case DRM_GPUVA_OP_REMAP:
+       {
+               bool prev = !!op->remap.prev;
+               bool next = !!op->remap.next;
+               if (!op->remap.unmap_done) {
+                       if (prev || next)
+                               vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
+                       err = xe_vm_unbind(vm, vma, op->q, op->syncs,
+                                          op->num_syncs,
+                                          op->flags & XE_VMA_OP_FIRST,
+                                          op->flags & XE_VMA_OP_LAST &&
+                                          !prev && !next);
+                       if (err)
+                               break;
+                       op->remap.unmap_done = true;
+               }
+               if (prev) {
+                       op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
+                       err = xe_vm_bind(vm, op->remap.prev, op->q,
+                                        xe_vma_bo(op->remap.prev), op->syncs,
+                                        op->num_syncs, true, false,
+                                        op->flags & XE_VMA_OP_LAST && !next);
+                       op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
+                       if (err)
+                               break;
+                       op->remap.prev = NULL;
+               }
+               if (next) {
+                       op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
+                       err = xe_vm_bind(vm, op->remap.next, op->q,
+                                        xe_vma_bo(op->remap.next),
+                                        op->syncs, op->num_syncs,
+                                        true, false,
+                                        op->flags & XE_VMA_OP_LAST);
+                       op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
+                       if (err)
+                               break;
+                       op->remap.next = NULL;
+               }
+               break;
+       }
+       case DRM_GPUVA_OP_UNMAP:
+               err = xe_vm_unbind(vm, vma, op->q, op->syncs,
+                                  op->num_syncs, op->flags & XE_VMA_OP_FIRST,
+                                  op->flags & XE_VMA_OP_LAST);
+               break;
+       case DRM_GPUVA_OP_PREFETCH:
+               err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
+                                    op->syncs, op->num_syncs,
+                                    op->flags & XE_VMA_OP_FIRST,
+                                    op->flags & XE_VMA_OP_LAST);
+               break;
+       default:
+               drm_warn(&vm->xe->drm, "NOT POSSIBLE");
+       }
+       if (err)
+               trace_xe_vma_fail(vma);
+       return err;
+ }
+ static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
+                              struct xe_vma_op *op)
+ {
+       struct drm_exec exec;
+       int err;
+ retry_userptr:
++      drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+       drm_exec_until_all_locked(&exec) {
+               err = op_execute(&exec, vm, vma, op);
+               drm_exec_retry_on_contention(&exec);
+               if (err)
+                       break;
+       }
+       drm_exec_fini(&exec);
+       if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
+               lockdep_assert_held_write(&vm->lock);
+               err = xe_vma_userptr_pin_pages(vma);
+               if (!err)
+                       goto retry_userptr;
+               trace_xe_vma_fail(vma);
+       }
+       return err;
+ }
+ static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
+ {
+       int ret = 0;
+       lockdep_assert_held_write(&vm->lock);
+ #ifdef TEST_VM_ASYNC_OPS_ERROR
+       if (op->inject_error) {
+               op->inject_error = false;
+               return -ENOMEM;
+       }
+ #endif
+       switch (op->base.op) {
+       case DRM_GPUVA_OP_MAP:
+               ret = __xe_vma_op_execute(vm, op->map.vma, op);
+               break;
+       case DRM_GPUVA_OP_REMAP:
+       {
+               struct xe_vma *vma;
+               if (!op->remap.unmap_done)
+                       vma = gpuva_to_vma(op->base.remap.unmap->va);
+               else if (op->remap.prev)
+                       vma = op->remap.prev;
+               else
+                       vma = op->remap.next;
+               ret = __xe_vma_op_execute(vm, vma, op);
+               break;
+       }
+       case DRM_GPUVA_OP_UNMAP:
+               ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
+                                         op);
+               break;
+       case DRM_GPUVA_OP_PREFETCH:
+               ret = __xe_vma_op_execute(vm,
+                                         gpuva_to_vma(op->base.prefetch.va),
+                                         op);
+               break;
+       default:
+               drm_warn(&vm->xe->drm, "NOT POSSIBLE");
+       }
+       return ret;
+ }
+ static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
+ {
+       bool last = op->flags & XE_VMA_OP_LAST;
+       if (last) {
+               while (op->num_syncs--)
+                       xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
+               kfree(op->syncs);
+               if (op->q)
+                       xe_exec_queue_put(op->q);
+       }
+       if (!list_empty(&op->link))
+               list_del(&op->link);
+       if (op->ops)
+               drm_gpuva_ops_free(&vm->gpuvm, op->ops);
+       if (last)
+               xe_vm_put(vm);
+ }
+ static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
+                            bool post_commit, bool prev_post_commit,
+                            bool next_post_commit)
+ {
+       lockdep_assert_held_write(&vm->lock);
+       switch (op->base.op) {
+       case DRM_GPUVA_OP_MAP:
+               if (op->map.vma) {
+                       prep_vma_destroy(vm, op->map.vma, post_commit);
+                       xe_vma_destroy_unlocked(op->map.vma);
+               }
+               break;
+       case DRM_GPUVA_OP_UNMAP:
+       {
+               struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
+               if (vma) {
+                       down_read(&vm->userptr.notifier_lock);
+                       vma->gpuva.flags &= ~XE_VMA_DESTROYED;
+                       up_read(&vm->userptr.notifier_lock);
+                       if (post_commit)
+                               xe_vm_insert_vma(vm, vma);
+               }
+               break;
+       }
+       case DRM_GPUVA_OP_REMAP:
+       {
+               struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
+               if (op->remap.prev) {
+                       prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
+                       xe_vma_destroy_unlocked(op->remap.prev);
+               }
+               if (op->remap.next) {
+                       prep_vma_destroy(vm, op->remap.next, next_post_commit);
+                       xe_vma_destroy_unlocked(op->remap.next);
+               }
+               if (vma) {
+                       down_read(&vm->userptr.notifier_lock);
+                       vma->gpuva.flags &= ~XE_VMA_DESTROYED;
+                       up_read(&vm->userptr.notifier_lock);
+                       if (post_commit)
+                               xe_vm_insert_vma(vm, vma);
+               }
+               break;
+       }
+       case DRM_GPUVA_OP_PREFETCH:
+               /* Nothing to do */
+               break;
+       default:
+               drm_warn(&vm->xe->drm, "NOT POSSIBLE");
+       }
+ }
+ static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
+                                    struct drm_gpuva_ops **ops,
+                                    int num_ops_list)
+ {
+       int i;
+       for (i = num_ops_list - 1; i >= 0; --i) {
+               struct drm_gpuva_ops *__ops = ops[i];
+               struct drm_gpuva_op *__op;
+               if (!__ops)
+                       continue;
+               drm_gpuva_for_each_op_reverse(__op, __ops) {
+                       struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
+                       xe_vma_op_unwind(vm, op,
+                                        op->flags & XE_VMA_OP_COMMITTED,
+                                        op->flags & XE_VMA_OP_PREV_COMMITTED,
+                                        op->flags & XE_VMA_OP_NEXT_COMMITTED);
+               }
+               drm_gpuva_ops_free(&vm->gpuvm, __ops);
+       }
+ }
+ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
+                                    struct list_head *ops_list)
+ {
+       struct xe_vma_op *op, *next;
+       int err;
+       lockdep_assert_held_write(&vm->lock);
+       list_for_each_entry_safe(op, next, ops_list, link) {
+               err = xe_vma_op_execute(vm, op);
+               if (err) {
+                       drm_warn(&vm->xe->drm, "VM op(%d) failed with %d",
+                                op->base.op, err);
+                       /*
+                        * FIXME: Killing VM rather than proper error handling
+                        */
+                       xe_vm_kill(vm);
+                       return -ENOSPC;
+               }
+               xe_vma_op_cleanup(vm, op);
+       }
+       return 0;
+ }
+ #ifdef TEST_VM_ASYNC_OPS_ERROR
+ #define SUPPORTED_FLAGS       \
+       (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \
+        DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
+ #else
+ #define SUPPORTED_FLAGS       \
+       (DRM_XE_VM_BIND_FLAG_READONLY | \
+        DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
+        0xffff)
+ #endif
+ #define XE_64K_PAGE_MASK 0xffffull
+ #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
+ #define MAX_BINDS     512     /* FIXME: Picking random upper limit */
+ static int vm_bind_ioctl_check_args(struct xe_device *xe,
+                                   struct drm_xe_vm_bind *args,
+                                   struct drm_xe_vm_bind_op **bind_ops)
+ {
+       int err;
+       int i;
+       if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
+           XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
+               return -EINVAL;
+       if (XE_IOCTL_DBG(xe, args->extensions) ||
+           XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS))
+               return -EINVAL;
+       if (args->num_binds > 1) {
+               u64 __user *bind_user =
+                       u64_to_user_ptr(args->vector_of_binds);
+               *bind_ops = kmalloc(sizeof(struct drm_xe_vm_bind_op) *
+                                   args->num_binds, GFP_KERNEL);
+               if (!*bind_ops)
+                       return -ENOMEM;
+               err = __copy_from_user(*bind_ops, bind_user,
+                                      sizeof(struct drm_xe_vm_bind_op) *
+                                      args->num_binds);
+               if (XE_IOCTL_DBG(xe, err)) {
+                       err = -EFAULT;
+                       goto free_bind_ops;
+               }
+       } else {
+               *bind_ops = &args->bind;
+       }
+       for (i = 0; i < args->num_binds; ++i) {
+               u64 range = (*bind_ops)[i].range;
+               u64 addr = (*bind_ops)[i].addr;
+               u32 op = (*bind_ops)[i].op;
+               u32 flags = (*bind_ops)[i].flags;
+               u32 obj = (*bind_ops)[i].obj;
+               u64 obj_offset = (*bind_ops)[i].obj_offset;
+               u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
+               bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
+               u16 pat_index = (*bind_ops)[i].pat_index;
+               u16 coh_mode;
+               if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
+                       err = -EINVAL;
+                       goto free_bind_ops;
+               }
+               pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
+               (*bind_ops)[i].pat_index = pat_index;
+               coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
+               if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
+                       err = -EINVAL;
+                       goto free_bind_ops;
+               }
+               if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
+                       err = -EINVAL;
+                       goto free_bind_ops;
+               }
+               if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
+                   XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
+                   XE_IOCTL_DBG(xe, obj && is_null) ||
+                   XE_IOCTL_DBG(xe, obj_offset && is_null) ||
+                   XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
+                                is_null) ||
+                   XE_IOCTL_DBG(xe, !obj &&
+                                op == DRM_XE_VM_BIND_OP_MAP &&
+                                !is_null) ||
+                   XE_IOCTL_DBG(xe, !obj &&
+                                op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
+                   XE_IOCTL_DBG(xe, addr &&
+                                op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
+                   XE_IOCTL_DBG(xe, range &&
+                                op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
+                   XE_IOCTL_DBG(xe, obj &&
+                                op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
+                   XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
+                                op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
+                   XE_IOCTL_DBG(xe, obj &&
+                                op == DRM_XE_VM_BIND_OP_PREFETCH) ||
+                   XE_IOCTL_DBG(xe, prefetch_region &&
+                                op != DRM_XE_VM_BIND_OP_PREFETCH) ||
+                   XE_IOCTL_DBG(xe, !(BIT(prefetch_region) &
+                                      xe->info.mem_region_mask)) ||
+                   XE_IOCTL_DBG(xe, obj &&
+                                op == DRM_XE_VM_BIND_OP_UNMAP)) {
+                       err = -EINVAL;
+                       goto free_bind_ops;
+               }
+               if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
+                   XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
+                   XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
+                   XE_IOCTL_DBG(xe, !range &&
+                                op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
+                       err = -EINVAL;
+                       goto free_bind_ops;
+               }
+       }
+       return 0;
+ free_bind_ops:
+       if (args->num_binds > 1)
+               kfree(*bind_ops);
+       return err;
+ }
+ static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
+                                      struct xe_exec_queue *q,
+                                      struct xe_sync_entry *syncs,
+                                      int num_syncs)
+ {
+       struct dma_fence *fence;
+       int i, err = 0;
+       fence = xe_sync_in_fence_get(syncs, num_syncs,
+                                    to_wait_exec_queue(vm, q), vm);
+       if (IS_ERR(fence))
+               return PTR_ERR(fence);
+       for (i = 0; i < num_syncs; i++)
+               xe_sync_entry_signal(&syncs[i], NULL, fence);
+       xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
+                                    fence);
+       dma_fence_put(fence);
+       return err;
+ }
+ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
+ {
+       struct xe_device *xe = to_xe_device(dev);
+       struct xe_file *xef = to_xe_file(file);
+       struct drm_xe_vm_bind *args = data;
+       struct drm_xe_sync __user *syncs_user;
+       struct xe_bo **bos = NULL;
+       struct drm_gpuva_ops **ops = NULL;
+       struct xe_vm *vm;
+       struct xe_exec_queue *q = NULL;
+       u32 num_syncs;
+       struct xe_sync_entry *syncs = NULL;
+       struct drm_xe_vm_bind_op *bind_ops;
+       LIST_HEAD(ops_list);
+       int err;
+       int i;
+       err = vm_bind_ioctl_check_args(xe, args, &bind_ops);
+       if (err)
+               return err;
+       if (args->exec_queue_id) {
+               q = xe_exec_queue_lookup(xef, args->exec_queue_id);
+               if (XE_IOCTL_DBG(xe, !q)) {
+                       err = -ENOENT;
+                       goto free_objs;
+               }
+               if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
+                       err = -EINVAL;
+                       goto put_exec_queue;
+               }
+       }
+       vm = xe_vm_lookup(xef, args->vm_id);
+       if (XE_IOCTL_DBG(xe, !vm)) {
+               err = -EINVAL;
+               goto put_exec_queue;
+       }
+       err = down_write_killable(&vm->lock);
+       if (err)
+               goto put_vm;
+       if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
+               err = -ENOENT;
+               goto release_vm_lock;
+       }
+       for (i = 0; i < args->num_binds; ++i) {
+               u64 range = bind_ops[i].range;
+               u64 addr = bind_ops[i].addr;
+               if (XE_IOCTL_DBG(xe, range > vm->size) ||
+                   XE_IOCTL_DBG(xe, addr > vm->size - range)) {
+                       err = -EINVAL;
+                       goto release_vm_lock;
+               }
+       }
+       if (args->num_binds) {
+               bos = kcalloc(args->num_binds, sizeof(*bos), GFP_KERNEL);
+               if (!bos) {
+                       err = -ENOMEM;
+                       goto release_vm_lock;
+               }
+               ops = kcalloc(args->num_binds, sizeof(*ops), GFP_KERNEL);
+               if (!ops) {
+                       err = -ENOMEM;
+                       goto release_vm_lock;
+               }
+       }
+       for (i = 0; i < args->num_binds; ++i) {
+               struct drm_gem_object *gem_obj;
+               u64 range = bind_ops[i].range;
+               u64 addr = bind_ops[i].addr;
+               u32 obj = bind_ops[i].obj;
+               u64 obj_offset = bind_ops[i].obj_offset;
+               u16 pat_index = bind_ops[i].pat_index;
+               u16 coh_mode;
+               if (!obj)
+                       continue;
+               gem_obj = drm_gem_object_lookup(file, obj);
+               if (XE_IOCTL_DBG(xe, !gem_obj)) {
+                       err = -ENOENT;
+                       goto put_obj;
+               }
+               bos[i] = gem_to_xe_bo(gem_obj);
+               if (XE_IOCTL_DBG(xe, range > bos[i]->size) ||
+                   XE_IOCTL_DBG(xe, obj_offset >
+                                bos[i]->size - range)) {
+                       err = -EINVAL;
+                       goto put_obj;
+               }
+               if (bos[i]->flags & XE_BO_INTERNAL_64K) {
+                       if (XE_IOCTL_DBG(xe, obj_offset &
+                                        XE_64K_PAGE_MASK) ||
+                           XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
+                           XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
+                               err = -EINVAL;
+                               goto put_obj;
+                       }
+               }
+               coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
+               if (bos[i]->cpu_caching) {
+                       if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
+                                        bos[i]->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
+                               err = -EINVAL;
+                               goto put_obj;
+                       }
+               } else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
+                       /*
+                        * Imported dma-buf from a different device should
+                        * require 1way or 2way coherency since we don't know
+                        * how it was mapped on the CPU. Just assume it is
+                        * potentially cached on the CPU side.
+                        */
+                       err = -EINVAL;
+                       goto put_obj;
+               }
+       }
+       if (args->num_syncs) {
+               syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
+               if (!syncs) {
+                       err = -ENOMEM;
+                       goto put_obj;
+               }
+       }
+       syncs_user = u64_to_user_ptr(args->syncs);
+       for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
+               err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
+                                         &syncs_user[num_syncs],
+                                         (xe_vm_in_lr_mode(vm) ?
+                                          SYNC_PARSE_FLAG_LR_MODE : 0) |
+                                         (!args->num_binds ?
+                                          SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
+               if (err)
+                       goto free_syncs;
+       }
+       if (!args->num_binds) {
+               err = -ENODATA;
+               goto free_syncs;
+       }
+       for (i = 0; i < args->num_binds; ++i) {
+               u64 range = bind_ops[i].range;
+               u64 addr = bind_ops[i].addr;
+               u32 op = bind_ops[i].op;
+               u32 flags = bind_ops[i].flags;
+               u64 obj_offset = bind_ops[i].obj_offset;
+               u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
+               u16 pat_index = bind_ops[i].pat_index;
+               ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
+                                                 addr, range, op, flags,
+                                                 prefetch_region, pat_index);
+               if (IS_ERR(ops[i])) {
+                       err = PTR_ERR(ops[i]);
+                       ops[i] = NULL;
+                       goto unwind_ops;
+               }
+               err = vm_bind_ioctl_ops_parse(vm, q, ops[i], syncs, num_syncs,
+                                             &ops_list,
+                                             i == args->num_binds - 1);
+               if (err)
+                       goto unwind_ops;
+       }
+       /* Nothing to do */
+       if (list_empty(&ops_list)) {
+               err = -ENODATA;
+               goto unwind_ops;
+       }
+       xe_vm_get(vm);
+       if (q)
+               xe_exec_queue_get(q);
+       err = vm_bind_ioctl_ops_execute(vm, &ops_list);
+       up_write(&vm->lock);
+       if (q)
+               xe_exec_queue_put(q);
+       xe_vm_put(vm);
+       for (i = 0; bos && i < args->num_binds; ++i)
+               xe_bo_put(bos[i]);
+       kfree(bos);
+       kfree(ops);
+       if (args->num_binds > 1)
+               kfree(bind_ops);
+       return err;
+ unwind_ops:
+       vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
+ free_syncs:
+       if (err == -ENODATA)
+               err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
+       while (num_syncs--)
+               xe_sync_entry_cleanup(&syncs[num_syncs]);
+       kfree(syncs);
+ put_obj:
+       for (i = 0; i < args->num_binds; ++i)
+               xe_bo_put(bos[i]);
+ release_vm_lock:
+       up_write(&vm->lock);
+ put_vm:
+       xe_vm_put(vm);
+ put_exec_queue:
+       if (q)
+               xe_exec_queue_put(q);
+ free_objs:
+       kfree(bos);
+       kfree(ops);
+       if (args->num_binds > 1)
+               kfree(bind_ops);
+       return err;
+ }
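For orientation, a hedged userspace sketch of driving the bind ioctl above with a single MAP operation. The struct and field names mirror the checks in this file; the <drm/xe_drm.h> header, the DRM_IOCTL_XE_VM_BIND macro name, and the choice of pat_index are assumptions that must be verified against the final uAPI. Not part of this merge.

/* Hedged userspace sketch; assumes <drm/xe_drm.h> defines the uAPI below. */
#include <string.h>
#include <sys/ioctl.h>
#include <drm/xe_drm.h>

static int example_bind_bo(int fd, __u32 vm_id, __u32 bo_handle,
			   __u64 gpu_addr, __u64 size, __u16 pat_index)
{
	struct drm_xe_vm_bind args;

	memset(&args, 0, sizeof(args));	/* pad/reserved/extensions must be 0 */
	args.vm_id = vm_id;
	args.num_binds = 1;		/* a single op goes in args.bind */
	args.bind.op = DRM_XE_VM_BIND_OP_MAP;
	args.bind.obj = bo_handle;	/* GEM handle; required for MAP */
	args.bind.obj_offset = 0;	/* page aligned, within the BO */
	args.bind.addr = gpu_addr;	/* page aligned, inside the VM range */
	args.bind.range = size;		/* page aligned, non-zero */
	args.bind.pat_index = pat_index; /* platform specific; assumption */

	return ioctl(fd, DRM_IOCTL_XE_VM_BIND, &args);
}

With exec_queue_id and num_syncs left at zero, the bind uses the VM's default queue and no sync objects, which matches the zero checks performed in vm_bind_ioctl_check_args() above.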
+ /**
+  * xe_vm_lock() - Lock the vm's dma_resv object
+  * @vm: The struct xe_vm whose lock is to be locked
+  * @intr: Whether to perform any wait interruptible
+  *
+  * Return: 0 on success, -EINTR if @intr is true and the wait for a
+  * contended lock was interrupted. If @intr is false, the function
+  * always returns 0.
+  */
+ int xe_vm_lock(struct xe_vm *vm, bool intr)
+ {
+       if (intr)
+               return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
+       return dma_resv_lock(xe_vm_resv(vm), NULL);
+ }
+ /**
+  * xe_vm_unlock() - Unlock the vm's dma_resv object
+  * @vm: The struct xe_vm whose lock is to be released.
+  *
+  * Unlock the vm's dma_resv object that was locked by xe_vm_lock().
+  */
+ void xe_vm_unlock(struct xe_vm *vm)
+ {
+       dma_resv_unlock(xe_vm_resv(vm));
+ }
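A brief, hypothetical caller sketch showing how the two helpers above are intended to pair; the helper name and the work done under the lock are placeholders, not part of the driver.

/* Hypothetical caller; assumes xe_vm.h is included. */
static int example_with_vm_locked(struct xe_vm *vm)
{
	int err;

	/* Interruptible wait: may return -EINTR if a signal is pending. */
	err = xe_vm_lock(vm, true);
	if (err)
		return err;

	/* ... touch state protected by the VM's dma_resv here ... */

	xe_vm_unlock(vm);
	return 0;
}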
+ /**
+  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
+  * @vma: VMA to invalidate
+  *
+  * Walks the list of page table leaves, zeroing the entries owned by this
+  * VMA, invalidates the TLBs, and blocks until the TLB invalidation is
+  * complete.
+  *
+  * Returns 0 for success, negative error code otherwise.
+  */
+ int xe_vm_invalidate_vma(struct xe_vma *vma)
+ {
+       struct xe_device *xe = xe_vma_vm(vma)->xe;
+       struct xe_tile *tile;
+       u32 tile_needs_invalidate = 0;
+       int seqno[XE_MAX_TILES_PER_DEVICE];
+       u8 id;
+       int ret;
+       xe_assert(xe, xe_vm_in_fault_mode(xe_vma_vm(vma)));
+       xe_assert(xe, !xe_vma_is_null(vma));
+       trace_xe_vma_usm_invalidate(vma);
+       /* Check that we don't race with page-table updates */
+       if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+               if (xe_vma_is_userptr(vma)) {
+                       WARN_ON_ONCE(!mmu_interval_check_retry
+                                    (&vma->userptr.notifier,
+                                     vma->userptr.notifier_seq));
+                       WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
+                                                            DMA_RESV_USAGE_BOOKKEEP));
+               } else {
+                       xe_bo_assert_held(xe_vma_bo(vma));
+               }
+       }
+       for_each_tile(tile, xe, id) {
+               if (xe_pt_zap_ptes(tile, vma)) {
+                       tile_needs_invalidate |= BIT(id);
+                       xe_device_wmb(xe);
+                       /*
+                        * FIXME: We potentially need to invalidate multiple
+                        * GTs within the tile
+                        */
+                       seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
+                       if (seqno[id] < 0)
+                               return seqno[id];
+               }
+       }
+       for_each_tile(tile, xe, id) {
+               if (tile_needs_invalidate & BIT(id)) {
+                       ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
+                       if (ret < 0)
+                               return ret;
+               }
+       }
+       vma->usm.tile_invalidated = vma->tile_mask;
+       return 0;
+ }
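The function above deliberately runs two passes: it first zaps the PTEs and kicks off a TLB invalidation on every tile that had live mappings, and only then waits on all of the returned seqnos, so the invalidations on different tiles overlap. A minimal sketch of that issue-then-wait shape, where struct inval_unit, has_live_mappings(), issue_inval() and wait_inval() are hypothetical stand-ins rather than xe APIs:

/* Hypothetical sketch of the issue-then-wait pattern; none of these
 * types or helpers are real xe APIs. */
#define MAX_UNITS 8

struct inval_unit;				/* stand-in for a tile/GT */

bool has_live_mappings(struct inval_unit *u);
int issue_inval(struct inval_unit *u);		/* returns seqno or -errno */
int wait_inval(struct inval_unit *u, int seqno);

static int invalidate_all(struct inval_unit **units, int nr)
{
	int seqno[MAX_UNITS];
	unsigned long pending = 0;
	int i, ret;

	/* Pass 1: start an invalidation on every unit that needs one. */
	for (i = 0; i < nr; i++) {
		if (!has_live_mappings(units[i]))
			continue;
		seqno[i] = issue_inval(units[i]);
		if (seqno[i] < 0)
			return seqno[i];
		pending |= 1UL << i;
	}

	/* Pass 2: only now block, so the in-flight invalidations overlap. */
	for (i = 0; i < nr; i++) {
		if (!(pending & (1UL << i)))
			continue;
		ret = wait_inval(units[i], seqno[i]);
		if (ret < 0)
			return ret;
	}
	return 0;
}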
+ int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
+ {
+       struct drm_gpuva *gpuva;
+       bool is_vram;
+       uint64_t addr;
+       if (!down_read_trylock(&vm->lock)) {
+               drm_printf(p, " Failed to acquire VM lock to dump capture");
+               return 0;
+       }
+       if (vm->pt_root[gt_id]) {
+               addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
+               is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
+               drm_printf(p, " VM root: A:0x%llx %s\n", addr,
+                          is_vram ? "VRAM" : "SYS");
+       }
+       drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
+               struct xe_vma *vma = gpuva_to_vma(gpuva);
+               bool is_userptr = xe_vma_is_userptr(vma);
+               bool is_null = xe_vma_is_null(vma);
+               if (is_null) {
+                       addr = 0;
+               } else if (is_userptr) {
+                       struct xe_res_cursor cur;
+                       if (vma->userptr.sg) {
+                               xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE,
+                                               &cur);
+                               addr = xe_res_dma(&cur);
+                       } else {
+                               addr = 0;
+                       }
+               } else {
+                       addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
+                       is_vram = xe_bo_is_vram(xe_vma_bo(vma));
+               }
+               drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
+                          xe_vma_start(vma), xe_vma_end(vma) - 1,
+                          xe_vma_size(vma),
+                          addr, is_null ? "NULL" : is_userptr ? "USR" :
+                          is_vram ? "VRAM" : "SYS");
+       }
+       up_read(&vm->lock);
+       return 0;
+ }
Simple merge