diff --git a/drivers/Kconfig b/drivers/Kconfig
index 7bdad836fc6207727300e79c2d6f7db485baf80a..257af611384d849fde7bca73e1ae91dec5072ec8 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -245,4 +245,6 @@ source "drivers/cdx/Kconfig"
 
 source "drivers/dpll/Kconfig"
 
+source "drivers/rknpu/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 3bf5cab4b45191e4ff9bb657a186e958b306e348..29feb5cd62d88fce265e3609a5648577e586e18d 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -193,6 +193,7 @@ obj-$(CONFIG_COUNTER)		+= counter/
 obj-$(CONFIG_MOST)		+= most/
 obj-$(CONFIG_PECI)		+= peci/
 obj-$(CONFIG_HTE)		+= hte/
+obj-$(CONFIG_ROCKCHIP_RKNPU)	+= rknpu/
 obj-$(CONFIG_DRM_ACCEL)		+= accel/
 obj-$(CONFIG_CDX_BUS)		+= cdx/
 obj-$(CONFIG_DPLL)		+= dpll/
diff --git a/drivers/rknpu/Kconfig b/drivers/rknpu/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..c3343eece9c6db2adcaefa1f45ffdb960b530ed7
--- /dev/null
+++ b/drivers/rknpu/Kconfig
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: GPL-2.0
+menu "RKNPU"
+	depends on ARCH_ROCKCHIP
+
+config ROCKCHIP_RKNPU
+	tristate "Rockchip RKNPU driver"
+	depends on DRM || DMABUF_HEAPS_ROCKCHIP_CMA_HEAP
+	help
+	  Enable support for the Rockchip neural processing unit (RKNPU).
+
+if ROCKCHIP_RKNPU
+
+config ROCKCHIP_RKNPU_DEBUG_FS
+	bool "RKNPU debugfs"
+	depends on DEBUG_FS
+	default y
+	help
+	  Enable debugfs to debug RKNPU usage.
+
+config ROCKCHIP_RKNPU_PROC_FS
+	bool "RKNPU procfs"
+	depends on PROC_FS
+	help
+	  Enable procfs to debug RKNPU usage.
+
+config ROCKCHIP_RKNPU_FENCE
+	bool "RKNPU fence"
+	depends on SYNC_FILE
+	help
+	  Enable fence support for RKNPU.
+
+config ROCKCHIP_RKNPU_SRAM
+	bool "RKNPU SRAM"
+	depends on NO_GKI
+	help
+	  Enable RKNPU SRAM support.
+
+choice
+	prompt "RKNPU memory manager"
+	default ROCKCHIP_RKNPU_DRM_GEM
+	help
+	  Select RKNPU memory manager
+
+config ROCKCHIP_RKNPU_DRM_GEM
+	bool "RKNPU DRM GEM"
+	depends on DRM
+	help
+	  Enable RKNPU memory manager by DRM GEM.
+
+config ROCKCHIP_RKNPU_DMA_HEAP
+	bool "RKNPU DMA heap"
+	depends on DMABUF_HEAPS_ROCKCHIP_CMA_HEAP
+	help
+	  Enable RKNPU memory manager by DMA Heap.
+
+endchoice
+
+endif
+
+endmenu
diff --git a/drivers/rknpu/Makefile b/drivers/rknpu/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..41dacc93157ceec1d16f90b6a7cc971c207f563b
--- /dev/null
+++ b/drivers/rknpu/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_ROCKCHIP_RKNPU) += rknpu.o
+
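+# Depending on the kbuild version and on out-of-tree (O=) builds, $(src) may
+# or may not already be prefixed with $(srctree); both include forms below are
+# kept so the driver headers are found either way.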
+ccflags-y += -I$(srctree)/$(src)/include
+ccflags-y += -I$(src)/include
+ccflags-y += -Werror
+
+rknpu-y += rknpu_drv.o
+rknpu-y += rknpu_reset.o
+rknpu-y += rknpu_job.o
+rknpu-y += rknpu_debugger.o
+rknpu-$(CONFIG_ROCKCHIP_RKNPU_SRAM) += rknpu_mm.o
+rknpu-$(CONFIG_ROCKCHIP_RKNPU_FENCE) += rknpu_fence.o
+rknpu-$(CONFIG_ROCKCHIP_RKNPU_DRM_GEM) += rknpu_gem.o
+rknpu-$(CONFIG_ROCKCHIP_RKNPU_DMA_HEAP) += rknpu_mem.o
diff --git a/drivers/rknpu/include/rknpu_debugger.h b/drivers/rknpu/include/rknpu_debugger.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f4420d443e1e0c4646b48d6285be3a621fc4e31
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_debugger.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_DEBUGGER_H_
+#define __LINUX_RKNPU_DEBUGGER_H_
+
+#include <linux/seq_file.h>
+
+/*
+ * struct rknpu_debugger - rknpu debugger information
+ *
+ * This structure represents a debugger to be created by the rknpu driver
+ * or core.
+ */
+struct rknpu_debugger {
+#ifdef CONFIG_ROCKCHIP_RKNPU_DEBUG_FS
+	/* Directory of debugfs file */
+	struct dentry *debugfs_dir;
+	struct list_head debugfs_entry_list;
+	struct mutex debugfs_lock;
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_PROC_FS
+	/* Directory of procfs file */
+	struct proc_dir_entry *procfs_dir;
+	struct list_head procfs_entry_list;
+	struct mutex procfs_lock;
+#endif
+};
+
+/*
+ * struct rknpu_debugger_list - debugfs/procfs info list entry
+ *
+ * This structure represents a debugfs/procfs file to be created by the npu
+ * driver or core.
+ */
+struct rknpu_debugger_list {
+	/* File name */
+	const char *name;
+	/*
+	 * Show callback. &seq_file->private will be set to the &struct
+	 * rknpu_debugger_node corresponding to the instance of this info
+	 * on a given &struct rknpu_debugger.
+	 */
+	int (*show)(struct seq_file *seq, void *data);
+	/*
+	 * Write callback. &seq_file->private will be set to the &struct
+	 * rknpu_debugger_node corresponding to the instance of this info
+	 * on a given &struct rknpu_debugger.
+	 */
+	ssize_t (*write)(struct file *file, const char __user *ubuf, size_t len,
+			 loff_t *offp);
+	/* Procfs/Debugfs private data. */
+	void *data;
+};
+
+/*
+ * struct rknpu_debugger_node - Nodes for debugfs/procfs
+ *
+ * This structure represents each instance of procfs/debugfs created from the
+ * template.
+ */
+struct rknpu_debugger_node {
+	struct rknpu_debugger *debugger;
+
+	/* template for this node. */
+	const struct rknpu_debugger_list *info_ent;
+
+	/* Each Procfs/Debugfs file. */
+#ifdef CONFIG_ROCKCHIP_RKNPU_DEBUG_FS
+	struct dentry *dent;
+#endif
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_PROC_FS
+	struct proc_dir_entry *pent;
+#endif
+
+	struct list_head list;
+};
+
+struct rknpu_device;
+
+int rknpu_debugger_init(struct rknpu_device *rknpu_dev);
+int rknpu_debugger_remove(struct rknpu_device *rknpu_dev);
+
+#endif /* __LINUX_RKNPU_DEBUGGER_H_ */
diff --git a/drivers/rknpu/include/rknpu_drv.h b/drivers/rknpu/include/rknpu_drv.h
new file mode 100644
index 0000000000000000000000000000000000000000..816dd16de219a63c8af65f53b67ab172b2db3e40
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_drv.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_DRV_H_
+#define __LINUX_RKNPU_DRV_H_
+
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/kref.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/regulator/consumer.h>
+#include <linux/version.h>
+#include <linux/hrtimer.h>
+#include <linux/miscdevice.h>
+
+#ifndef FPGA_PLATFORM
+#if KERNEL_VERSION(5, 10, 0) <= LINUX_VERSION_CODE
+#include <soc/rockchip/rockchip_opp_select.h>
+#endif
+#endif
+
+#include "rknpu_job.h"
+#include "rknpu_fence.h"
+#include "rknpu_debugger.h"
+#include "rknpu_mm.h"
+
+#define DRIVER_NAME "rknpu"
+#define DRIVER_DESC "RKNPU driver"
+#define DRIVER_DATE "20221110"
+#define DRIVER_MAJOR 0
+#define DRIVER_MINOR 8
+#define DRIVER_PATCHLEVEL 3
+
+#define LOG_TAG "RKNPU"
+
+/* load sampling interval: 1000 ms, expressed in nanoseconds */
+#define RKNPU_LOAD_INTERVAL 1000000000
+
+#define LOG_INFO(fmt, args...) pr_info(LOG_TAG ": " fmt, ##args)
+#if KERNEL_VERSION(5, 5, 0) <= LINUX_VERSION_CODE
+#define LOG_WARN(fmt, args...) pr_warn(LOG_TAG ": " fmt, ##args)
+#else
+#define LOG_WARN(fmt, args...) pr_warning(LOG_TAG ": " fmt, ##args)
+#endif
+#define LOG_DEBUG(fmt, args...) pr_devel(LOG_TAG ": " fmt, ##args)
+#define LOG_ERROR(fmt, args...) pr_err(LOG_TAG ": " fmt, ##args)
+
+#define LOG_DEV_INFO(dev, fmt, args...) dev_info(dev, LOG_TAG ": " fmt, ##args)
+#define LOG_DEV_WARN(dev, fmt, args...) dev_warn(dev, LOG_TAG ": " fmt, ##args)
+#define LOG_DEV_DEBUG(dev, fmt, args...) dev_dbg(dev, LOG_TAG ": " fmt, ##args)
+#define LOG_DEV_ERROR(dev, fmt, args...) dev_err(dev, LOG_TAG ": " fmt, ##args)
+
+struct rknpu_reset_data {
+	const char *srst_a_name;
+	const char *srst_h_name;
+};
+
+struct rknpu_config {
+	__u32 bw_priority_addr;
+	__u32 bw_priority_length;
+	__u64 dma_mask;
+	__u32 pc_data_amount_scale;
+	__u32 pc_task_number_bits;
+	__u32 pc_task_number_mask;
+	__u32 pc_task_status_offset;
+	__u32 bw_enable;
+	const struct rknpu_irqs_data *irqs;
+	const struct rknpu_reset_data *resets;
+	int num_irqs;
+	int num_resets;
+};
+
+struct rknpu_timer {
+	__u32 busy_time;
+	__u32 busy_time_record;
+};
+
+struct rknpu_subcore_data {
+	struct list_head todo_list;
+	wait_queue_head_t job_done_wq;
+	struct rknpu_job *job;
+	int64_t task_num;
+	struct rknpu_timer timer;
+};
+
+/*
+ * RKNPU device
+ *
+ * @base: IO mapped base address for device
+ * @dev: Device instance
+ * @drm_dev: DRM device instance
+ */
+struct rknpu_device {
+	void __iomem *base[RKNPU_MAX_CORES];
+	struct device *dev;
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+	struct drm_device *drm_dev;
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+	struct miscdevice miscdev;
+	struct rk_dma_heap *heap;
+#endif
+	atomic_t sequence;
+	spinlock_t lock;
+	spinlock_t irq_lock;
+	struct mutex power_lock;
+	struct mutex reset_lock;
+	struct rknpu_subcore_data subcore_datas[RKNPU_MAX_CORES];
+	const struct rknpu_config *config;
+	void __iomem *bw_priority_base;
+	struct rknpu_fence_context *fence_ctx;
+	bool iommu_en;
+	struct reset_control *srst_a[RKNPU_MAX_CORES];
+	struct reset_control *srst_h[RKNPU_MAX_CORES];
+	struct clk_bulk_data *clks;
+	int num_clks;
+	struct regulator *vdd;
+	struct regulator *mem;
+	struct thermal_cooling_device *devfreq_cooling;
+	struct devfreq *devfreq;
+	unsigned long ondemand_freq;
+#ifndef FPGA_PLATFORM
+#if KERNEL_VERSION(5, 10, 0) <= LINUX_VERSION_CODE
+	struct rockchip_opp_info opp_info;
+#endif
+#endif
+	unsigned long current_freq;
+	unsigned long current_volt;
+	int bypass_irq_handler;
+	int bypass_soft_reset;
+	bool soft_reseting;
+	struct device *genpd_dev_npu0;
+	struct device *genpd_dev_npu1;
+	struct device *genpd_dev_npu2;
+	bool multiple_domains;
+	atomic_t power_refcount;
+	atomic_t cmdline_power_refcount;
+	struct delayed_work power_off_work;
+	struct workqueue_struct *power_off_wq;
+	struct rknpu_debugger debugger;
+	struct hrtimer timer;
+	ktime_t kt;
+	phys_addr_t sram_start;
+	phys_addr_t sram_end;
+	uint32_t sram_size;
+	void __iomem *sram_base_io;
+	struct rknpu_mm *sram_mm;
+	unsigned long power_put_delay;
+};
+
+int rknpu_power_get(struct rknpu_device *rknpu_dev);
+int rknpu_power_put(struct rknpu_device *rknpu_dev);
+
+#endif /* __LINUX_RKNPU_DRV_H_ */
diff --git a/drivers/rknpu/include/rknpu_fence.h b/drivers/rknpu/include/rknpu_fence.h
new file mode 100644
index 0000000000000000000000000000000000000000..164f6de4116b33c19ad6a7ab772d39b9c786f981
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_fence.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_FENCE_H_
+#define __LINUX_RKNPU_FENCE_H_
+
+#include "rknpu_job.h"
+
+struct rknpu_fence_context {
+	unsigned int context;
+	unsigned int seqno;
+	spinlock_t spinlock;
+};
+
+int rknpu_fence_context_alloc(struct rknpu_device *rknpu_dev);
+
+int rknpu_fence_alloc(struct rknpu_job *job);
+
+int rknpu_fence_get_fd(struct rknpu_job *job);
+
+#endif /* __LINUX_RKNPU_FENCE_H_ */
diff --git a/drivers/rknpu/include/rknpu_gem.h b/drivers/rknpu/include/rknpu_gem.h
new file mode 100644
index 0000000000000000000000000000000000000000..954586607b16562090e9fe6fb3a0f6d530a247e1
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_gem.h
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_GEM_H
+#define __LINUX_RKNPU_GEM_H
+
+#include <linux/mm_types.h>
+#include <linux/version.h>
+
+#include <drm/drm_device.h>
+#include <drm/drm_vma_manager.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_mode.h>
+
+#if KERNEL_VERSION(4, 14, 0) > LINUX_VERSION_CODE
+#include <drm/drm_mem_util.h>
+#endif
+
+#include "rknpu_mm.h"
+
+#define to_rknpu_obj(x) container_of(x, struct rknpu_gem_object, base)
+
+/*
+ * rknpu drm buffer structure.
+ *
+ * @base: a gem object.
+ *	- a new handle to this gem object would be created
+ *	by drm_gem_handle_create().
+ * @flags: memory type and cache attributes of the allocated buffer.
+ * @size: size requested from user space, in bytes; this size is
+ *	page-aligned internally.
+ * @cookie: cookie returned by dma_alloc_attrs().
+ * @kv_addr: kernel virtual address of the allocated memory region.
+ * @dma_addr: bus address (accessed by dma) of the allocated memory region.
+ *	- this is a physical address without an IOMMU and a device
+ *	address with an IOMMU.
+ * @pages: Array of backing pages.
+ * @sgt: Imported sg_table.
+ *
+ * P.S. this object is exposed to user space via its gem handle, so
+ *	user space can access the buffer through that handle.
+ */
+struct rknpu_gem_object {
+	struct drm_gem_object base;
+	unsigned int flags;
+	unsigned long size;
+	unsigned long sram_size;
+	struct rknpu_mm_obj *sram_obj;
+	dma_addr_t iova_start;
+	unsigned long iova_size;
+	void *cookie;
+	void __iomem *kv_addr;
+	dma_addr_t dma_addr;
+	unsigned long dma_attrs;
+	unsigned long num_pages;
+	struct page **pages;
+	struct sg_table *sgt;
+	struct drm_mm_node mm_node;
+};
+
+/* create a new buffer with gem object */
+struct rknpu_gem_object *rknpu_gem_object_create(struct drm_device *dev,
+						 unsigned int flags,
+						 unsigned long size,
+						 unsigned long sram_size);
+
+/* destroy a buffer with gem object */
+void rknpu_gem_object_destroy(struct rknpu_gem_object *rknpu_obj);
+
+/* request gem object creation and buffer allocation as the size */
+int rknpu_gem_create_ioctl(struct drm_device *dev, void *data,
+			   struct drm_file *file_priv);
+
+/* get fake-offset of gem object that can be used with mmap. */
+int rknpu_gem_map_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv);
+
+int rknpu_gem_destroy_ioctl(struct drm_device *dev, void *data,
+			    struct drm_file *file_priv);
+
+/*
+ * get rknpu drm object,
+ * gem object reference count would be increased.
+ */
+static inline void rknpu_gem_object_get(struct drm_gem_object *obj)
+{
+#if KERNEL_VERSION(4, 13, 0) < LINUX_VERSION_CODE
+	drm_gem_object_get(obj);
+#else
+	drm_gem_object_reference(obj);
+#endif
+}
+
+/*
+ * put rknpu drm object acquired from rknpu_gem_object_get(),
+ * gem object reference count would be decreased.
+ */
+static inline void rknpu_gem_object_put(struct drm_gem_object *obj)
+{
+#if KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE
+	drm_gem_object_put(obj);
+#elif KERNEL_VERSION(4, 13, 0) < LINUX_VERSION_CODE
+	drm_gem_object_put_unlocked(obj);
+#else
+	drm_gem_object_unreference_unlocked(obj);
+#endif
+}
+
+/*
+ * get rknpu drm object from a gem handle; this function could be used by
+ * other drivers such as 2d/3d acceleration drivers. Note that the reference
+ * taken by the lookup is dropped before returning, so the returned pointer
+ * is borrowed and only valid while the handle keeps the object alive.
+ */
+static inline struct rknpu_gem_object *
+rknpu_gem_object_find(struct drm_file *filp, unsigned int handle)
+{
+	struct drm_gem_object *obj;
+
+	obj = drm_gem_object_lookup(filp, handle);
+	if (!obj) {
+		// DRM_ERROR("failed to lookup gem object.\n");
+		return NULL;
+	}
+
+	rknpu_gem_object_put(obj);
+
+	return to_rknpu_obj(obj);
+}
+
+/* get buffer information to memory region allocated by gem. */
+int rknpu_gem_get_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv);
+
+/* free gem object. */
+void rknpu_gem_free_object(struct drm_gem_object *obj);
+
+/* create memory region for drm framebuffer. */
+int rknpu_gem_dumb_create(struct drm_file *file_priv, struct drm_device *dev,
+			  struct drm_mode_create_dumb *args);
+
+#if KERNEL_VERSION(4, 19, 0) > LINUX_VERSION_CODE
+/* map memory region for drm framebuffer to user space. */
+int rknpu_gem_dumb_map_offset(struct drm_file *file_priv,
+			      struct drm_device *dev, uint32_t handle,
+			      uint64_t *offset);
+#endif
+
+/* page fault handler and mmap fault address(virtual) to physical memory. */
+#if KERNEL_VERSION(4, 15, 0) <= LINUX_VERSION_CODE
+vm_fault_t rknpu_gem_fault(struct vm_fault *vmf);
+#elif KERNEL_VERSION(4, 14, 0) <= LINUX_VERSION_CODE
+int rknpu_gem_fault(struct vm_fault *vmf);
+#else
+int rknpu_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+#endif
+
+/* set vm_flags and we can change the vm attribute to other one at here. */
+int rknpu_gem_mmap(struct file *filp, struct vm_area_struct *vma);
+
+/* low-level interface prime helpers */
+#if KERNEL_VERSION(4, 13, 0) <= LINUX_VERSION_CODE
+struct drm_gem_object *rknpu_gem_prime_import(struct drm_device *dev,
+					      struct dma_buf *dma_buf);
+#endif
+struct sg_table *rknpu_gem_prime_get_sg_table(struct drm_gem_object *obj);
+struct drm_gem_object *
+rknpu_gem_prime_import_sg_table(struct drm_device *dev,
+				struct dma_buf_attachment *attach,
+				struct sg_table *sgt);
+void *rknpu_gem_prime_vmap(struct drm_gem_object *obj);
+void rknpu_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr);
+int rknpu_gem_prime_mmap(struct drm_gem_object *obj,
+			 struct vm_area_struct *vma);
+
+int rknpu_gem_sync_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv);
+
+static inline void *rknpu_gem_alloc_page(size_t nr_pages)
+{
+#if KERNEL_VERSION(4, 13, 0) <= LINUX_VERSION_CODE
+	return kvmalloc_array(nr_pages, sizeof(struct page *),
+			      GFP_KERNEL | __GFP_ZERO);
+#else
+	return drm_calloc_large(nr_pages, sizeof(struct page *));
+#endif
+}
+
+static inline void rknpu_gem_free_page(void *pages)
+{
+#if KERNEL_VERSION(4, 13, 0) <= LINUX_VERSION_CODE
+	kvfree(pages);
+#else
+	drm_free_large(pages);
+#endif
+}
+
+#endif
diff --git a/drivers/rknpu/include/rknpu_ioctl.h b/drivers/rknpu/include/rknpu_ioctl.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc7225fb7b473d0db491dc4517f76bbc68601497
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_ioctl.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_IOCTL_H
+#define __LINUX_RKNPU_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#if !defined(__KERNEL__)
+#define __user
+#endif
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+#define RKNPU_OFFSET_VERSION 0x0
+#define RKNPU_OFFSET_VERSION_NUM 0x4
+#define RKNPU_OFFSET_PC_OP_EN 0x8
+#define RKNPU_OFFSET_PC_DATA_ADDR 0x10
+#define RKNPU_OFFSET_PC_DATA_AMOUNT 0x14
+#define RKNPU_OFFSET_PC_TASK_CONTROL 0x30
+#define RKNPU_OFFSET_PC_DMA_BASE_ADDR 0x34
+
+#define RKNPU_OFFSET_INT_MASK 0x20
+#define RKNPU_OFFSET_INT_CLEAR 0x24
+#define RKNPU_OFFSET_INT_STATUS 0x28
+#define RKNPU_OFFSET_INT_RAW_STATUS 0x2c
+
+#define RKNPU_OFFSET_CLR_ALL_RW_AMOUNT 0x8010
+#define RKNPU_OFFSET_DT_WR_AMOUNT 0x8034
+#define RKNPU_OFFSET_DT_RD_AMOUNT 0x8038
+#define RKNPU_OFFSET_WT_RD_AMOUNT 0x803c
+
+#define RKNPU_OFFSET_ENABLE_MASK 0xf008
+
+#define RKNPU_INT_CLEAR 0x1ffff
+
+#define RKNPU_PC_DATA_EXTRA_AMOUNT 4
+
+#define RKNPU_STR_HELPER(x) #x
+
+#define RKNPU_GET_DRV_VERSION_STRING(MAJOR, MINOR, PATCHLEVEL)                 \
+	RKNPU_STR_HELPER(MAJOR)                                                \
+	"." RKNPU_STR_HELPER(MINOR) "." RKNPU_STR_HELPER(PATCHLEVEL)
+#define RKNPU_GET_DRV_VERSION_CODE(MAJOR, MINOR, PATCHLEVEL)                   \
+	((MAJOR) * 10000 + (MINOR) * 100 + (PATCHLEVEL))
+#define RKNPU_GET_DRV_VERSION_MAJOR(CODE) ((CODE) / 10000)
+#define RKNPU_GET_DRV_VERSION_MINOR(CODE) (((CODE) % 10000) / 100)
+#define RKNPU_GET_DRV_VERSION_PATCHLEVEL(CODE) ((CODE) % 100)
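+/* Worked example: RKNPU_GET_DRV_VERSION_CODE(0, 8, 3) == 803, and
+ * RKNPU_GET_DRV_VERSION_STRING(0, 8, 3) expands to "0.8.3".
+ */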
+
+/* memory type definitions. */
+enum e_rknpu_mem_type {
+	/* physically continuous memory and used as default. */
+	RKNPU_MEM_CONTIGUOUS = 0 << 0,
+	/* physically non-continuous memory. */
+	RKNPU_MEM_NON_CONTIGUOUS = 1 << 0,
+	/* non-cacheable mapping and used as default. */
+	RKNPU_MEM_NON_CACHEABLE = 0 << 1,
+	/* cacheable mapping. */
+	RKNPU_MEM_CACHEABLE = 1 << 1,
+	/* write-combine mapping. */
+	RKNPU_MEM_WRITE_COMBINE = 1 << 2,
+	/* dma attr kernel mapping */
+	RKNPU_MEM_KERNEL_MAPPING = 1 << 3,
+	/* iommu mapping */
+	RKNPU_MEM_IOMMU = 1 << 4,
+	/* zero mapping */
+	RKNPU_MEM_ZEROING = 1 << 5,
+	/* allocate secure buffer */
+	RKNPU_MEM_SECURE = 1 << 6,
+	/* allocate from non-dma32 zone */
+	RKNPU_MEM_NON_DMA32 = 1 << 7,
+	/* request SRAM */
+	RKNPU_MEM_TRY_ALLOC_SRAM = 1 << 8,
+	RKNPU_MEM_MASK = RKNPU_MEM_NON_CONTIGUOUS | RKNPU_MEM_CACHEABLE |
+			 RKNPU_MEM_WRITE_COMBINE | RKNPU_MEM_KERNEL_MAPPING |
+			 RKNPU_MEM_IOMMU | RKNPU_MEM_ZEROING |
+			 RKNPU_MEM_SECURE | RKNPU_MEM_NON_DMA32 |
+			 RKNPU_MEM_TRY_ALLOC_SRAM
+};
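+/* (RKNPU_MEM_CONTIGUOUS and RKNPU_MEM_NON_CACHEABLE are the zero-valued
+ * defaults, so they contribute no bit to RKNPU_MEM_MASK.)
+ */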
+
+/* sync mode definitions. */
+enum e_rknpu_mem_sync_mode {
+	RKNPU_MEM_SYNC_TO_DEVICE = 1 << 0,
+	RKNPU_MEM_SYNC_FROM_DEVICE = 1 << 1,
+	RKNPU_MEM_SYNC_MASK =
+		RKNPU_MEM_SYNC_TO_DEVICE | RKNPU_MEM_SYNC_FROM_DEVICE
+};
+
+/* job mode definitions. */
+enum e_rknpu_job_mode {
+	RKNPU_JOB_SLAVE = 0 << 0,
+	RKNPU_JOB_PC = 1 << 0,
+	RKNPU_JOB_BLOCK = 0 << 1,
+	RKNPU_JOB_NONBLOCK = 1 << 1,
+	RKNPU_JOB_PINGPONG = 1 << 2,
+	RKNPU_JOB_FENCE_IN = 1 << 3,
+	RKNPU_JOB_FENCE_OUT = 1 << 4,
+	RKNPU_JOB_MASK = RKNPU_JOB_PC | RKNPU_JOB_NONBLOCK |
+			 RKNPU_JOB_PINGPONG | RKNPU_JOB_FENCE_IN |
+			 RKNPU_JOB_FENCE_OUT
+};
+
+/* action definitions */
+enum e_rknpu_action {
+	RKNPU_GET_HW_VERSION = 0,
+	RKNPU_GET_DRV_VERSION = 1,
+	RKNPU_GET_FREQ = 2,
+	RKNPU_SET_FREQ = 3,
+	RKNPU_GET_VOLT = 4,
+	RKNPU_SET_VOLT = 5,
+	RKNPU_ACT_RESET = 6,
+	RKNPU_GET_BW_PRIORITY = 7,
+	RKNPU_SET_BW_PRIORITY = 8,
+	RKNPU_GET_BW_EXPECT = 9,
+	RKNPU_SET_BW_EXPECT = 10,
+	RKNPU_GET_BW_TW = 11,
+	RKNPU_SET_BW_TW = 12,
+	RKNPU_ACT_CLR_TOTAL_RW_AMOUNT = 13,
+	RKNPU_GET_DT_WR_AMOUNT = 14,
+	RKNPU_GET_DT_RD_AMOUNT = 15,
+	RKNPU_GET_WT_RD_AMOUNT = 16,
+	RKNPU_GET_TOTAL_RW_AMOUNT = 17,
+	RKNPU_GET_IOMMU_EN = 18,
+	RKNPU_SET_PROC_NICE = 19,
+	RKNPU_POWER_ON = 20,
+	RKNPU_POWER_OFF = 21,
+	RKNPU_GET_TOTAL_SRAM_SIZE = 22,
+	RKNPU_GET_FREE_SRAM_SIZE = 23,
+};
+
+/**
+ * User-desired buffer creation information structure.
+ *
+ * @handle: The handle of the created GEM object.
+ * @flags: user request for setting memory type or cache attributes.
+ * @size: user-desired memory allocation size.
+ *	- this size value would be page-aligned internally.
+ * @obj_addr: address of RKNPU memory object.
+ * @dma_addr: dma address that is accessed by the rknpu.
+ * @sram_size: user-desired sram memory allocation size.
+ *  - this size value would be page-aligned internally.
+ */
+struct rknpu_mem_create {
+	__u32 handle;
+	__u32 flags;
+	__u64 size;
+	__u64 obj_addr;
+	__u64 dma_addr;
+	__u64 sram_size;
+};
+
+/**
+ * A structure for getting a fake-offset that can be used with mmap.
+ *
+ * @handle: handle of gem object.
+ * @reserved: just padding to be 64-bit aligned.
+ * @offset: a fake-offset of gem object.
+ */
+struct rknpu_mem_map {
+	__u32 handle;
+	__u32 reserved;
+	__u64 offset;
+};
+
+/**
+ * For destroying DMA buffer
+ *
+ * @handle:	handle of the buffer.
+ * @reserved: reserved for padding.
+ * @obj_addr: rknpu_mem_object addr.
+ */
+struct rknpu_mem_destroy {
+	__u32 handle;
+	__u32 reserved;
+	__u64 obj_addr;
+};
+
+/**
+ * For synchronizing DMA buffer
+ *
+ * @flags: user request for setting memory type or cache attributes.
+ * @reserved: reserved for padding.
+ * @obj_addr: address of RKNPU memory object.
+ * @offset: offset in bytes from start address of buffer.
+ * @size: size of memory region.
+ *
+ */
+struct rknpu_mem_sync {
+	__u32 flags;
+	__u32 reserved;
+	__u64 obj_addr;
+	__u64 offset;
+	__u64 size;
+};
+
+/**
+ * struct rknpu_task structure for task information
+ *
+ * @flags: flags for task
+ * @op_idx: operator index
+ * @enable_mask: enable mask
+ * @int_mask: interrupt mask
+ * @int_clear: interrupt clear
+ * @int_status: interrupt status
+ * @regcfg_amount: register config number
+ * @regcfg_offset: offset for register config
+ * @regcmd_addr: address for register command
+ *
+ */
+struct rknpu_task {
+	__u32 flags;
+	__u32 op_idx;
+	__u32 enable_mask;
+	__u32 int_mask;
+	__u32 int_clear;
+	__u32 int_status;
+	__u32 regcfg_amount;
+	__u32 regcfg_offset;
+	__u64 regcmd_addr;
+} __packed;
+
+/**
+ * struct rknpu_subcore_task structure for subcore task index
+ *
+ * @task_start: task start index
+ * @task_number: task number
+ *
+ */
+struct rknpu_subcore_task {
+	__u32 task_start;
+	__u32 task_number;
+};
+
+/**
+ * struct rknpu_submit structure for job submit
+ *
+ * @flags: flags for job submit
+ * @timeout: submit timeout
+ * @task_start: task start index
+ * @task_number: task number
+ * @task_counter: task counter
+ * @priority: submit priority
+ * @task_obj_addr: address of task object
+ * @regcfg_obj_addr: address of register config object
+ * @task_base_addr: task base address
+ * @user_data: (optional) user data
+ * @core_mask: core mask of rknpu
+ * @fence_fd: dma fence fd
+ * @subcore_task: subcore task
+ *
+ */
+struct rknpu_submit {
+	__u32 flags;
+	__u32 timeout;
+	__u32 task_start;
+	__u32 task_number;
+	__u32 task_counter;
+	__s32 priority;
+	__u64 task_obj_addr;
+	__u64 regcfg_obj_addr;
+	__u64 task_base_addr;
+	__u64 user_data;
+	__u32 core_mask;
+	__s32 fence_fd;
+	struct rknpu_subcore_task subcore_task[5];
+};
+
+/**
+ * struct rknpu_action structure for action (GET, SET or ACT)
+ *
+ * @flags: flags for action
+ * @value: GET or SET value
+ *
+ */
+struct rknpu_action {
+	__u32 flags;
+	__u32 value;
+};
+
+#define RKNPU_ACTION 0x00
+#define RKNPU_SUBMIT 0x01
+#define RKNPU_MEM_CREATE 0x02
+#define RKNPU_MEM_MAP 0x03
+#define RKNPU_MEM_DESTROY 0x04
+#define RKNPU_MEM_SYNC 0x05
+
+#define RKNPU_IOC_MAGIC 'r'
+#define RKNPU_IOW(nr, type) _IOW(RKNPU_IOC_MAGIC, nr, type)
+#define RKNPU_IOR(nr, type) _IOR(RKNPU_IOC_MAGIC, nr, type)
+#define RKNPU_IOWR(nr, type) _IOWR(RKNPU_IOC_MAGIC, nr, type)
+
+#include <drm/drm.h>
+
+#define DRM_IOCTL_RKNPU_ACTION                                                 \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_ACTION, struct rknpu_action)
+#define DRM_IOCTL_RKNPU_SUBMIT                                                 \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_SUBMIT, struct rknpu_submit)
+#define DRM_IOCTL_RKNPU_MEM_CREATE                                             \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_CREATE, struct rknpu_mem_create)
+#define DRM_IOCTL_RKNPU_MEM_MAP                                                \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_MAP, struct rknpu_mem_map)
+#define DRM_IOCTL_RKNPU_MEM_DESTROY                                            \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_DESTROY, struct rknpu_mem_destroy)
+#define DRM_IOCTL_RKNPU_MEM_SYNC                                               \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_SYNC, struct rknpu_mem_sync)
+
+#define IOCTL_RKNPU_ACTION RKNPU_IOWR(RKNPU_ACTION, struct rknpu_action)
+#define IOCTL_RKNPU_SUBMIT RKNPU_IOWR(RKNPU_SUBMIT, struct rknpu_submit)
+#define IOCTL_RKNPU_MEM_CREATE                                                 \
+	RKNPU_IOWR(RKNPU_MEM_CREATE, struct rknpu_mem_create)
+#define IOCTL_RKNPU_MEM_MAP RKNPU_IOWR(RKNPU_MEM_MAP, struct rknpu_mem_map)
+#define IOCTL_RKNPU_MEM_DESTROY                                                \
+	RKNPU_IOWR(RKNPU_MEM_DESTROY, struct rknpu_mem_destroy)
+#define IOCTL_RKNPU_MEM_SYNC RKNPU_IOWR(RKNPU_MEM_SYNC, struct rknpu_mem_sync)
+
+#endif
diff --git a/drivers/rknpu/include/rknpu_job.h b/drivers/rknpu/include/rknpu_job.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ef52d439277e1f8c5797a71172ccb5aeeb02ae8
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_job.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_JOB_H_
+#define __LINUX_RKNPU_JOB_H_
+
+#include <linux/spinlock.h>
+#include <linux/dma-fence.h>
+#include <linux/irq.h>
+
+#include <drm/drm_device.h>
+
+#include "rknpu_ioctl.h"
+
+#define RKNPU_MAX_CORES 3
+
+#define RKNPU_JOB_DONE (1 << 0)
+#define RKNPU_JOB_ASYNC (1 << 1)
+#define RKNPU_JOB_DETACHED (1 << 2)
+
+#define RKNPU_CORE_AUTO_MASK 0x00
+#define RKNPU_CORE0_MASK 0x01
+#define RKNPU_CORE1_MASK 0x02
+#define RKNPU_CORE2_MASK 0x04
+
+struct rknpu_job {
+	struct rknpu_device *rknpu_dev;
+	struct list_head head[RKNPU_MAX_CORES];
+	struct work_struct cleanup_work;
+	bool in_queue[RKNPU_MAX_CORES];
+	bool irq_entry[RKNPU_MAX_CORES];
+	unsigned int flags;
+	int ret;
+	struct rknpu_submit *args;
+	bool args_owner;
+	struct rknpu_task *first_task;
+	struct rknpu_task *last_task;
+	uint32_t int_mask[RKNPU_MAX_CORES];
+	uint32_t int_status[RKNPU_MAX_CORES];
+	struct dma_fence *fence;
+	ktime_t timestamp;
+	uint32_t use_core_num;
+	uint32_t run_count;
+	uint32_t interrupt_count;
+	ktime_t hw_recoder_time;
+};
+
+irqreturn_t rknpu_core0_irq_handler(int irq, void *data);
+irqreturn_t rknpu_core1_irq_handler(int irq, void *data);
+irqreturn_t rknpu_core2_irq_handler(int irq, void *data);
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+int rknpu_submit_ioctl(struct drm_device *dev, void *data,
+		       struct drm_file *file_priv);
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+int rknpu_submit_ioctl(struct rknpu_device *rknpu_dev, unsigned long data);
+#endif
+
+int rknpu_get_hw_version(struct rknpu_device *rknpu_dev, uint32_t *version);
+
+int rknpu_get_bw_priority(struct rknpu_device *rknpu_dev, uint32_t *priority,
+			  uint32_t *expect, uint32_t *tw);
+
+int rknpu_set_bw_priority(struct rknpu_device *rknpu_dev, uint32_t priority,
+			  uint32_t expect, uint32_t tw);
+
+int rknpu_clear_rw_amount(struct rknpu_device *rknpu_dev);
+
+int rknpu_get_rw_amount(struct rknpu_device *rknpu_dev, uint32_t *dt_wr,
+			uint32_t *dt_rd, uint32_t *wt_rd);
+
+int rknpu_get_total_rw_amount(struct rknpu_device *rknpu_dev, uint32_t *amount);
+
+#endif /* __LINUX_RKNPU_JOB_H_ */
diff --git a/drivers/rknpu/include/rknpu_mem.h b/drivers/rknpu/include/rknpu_mem.h
new file mode 100644
index 0000000000000000000000000000000000000000..925535c85f067973828dd065aa394ede2d454934
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_mem.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_MEM_H
+#define __LINUX_RKNPU_MEM_H
+
+#include <linux/mm_types.h>
+#include <linux/version.h>
+
+/*
+ * rknpu DMA buffer structure.
+ *
+ * @flags: memory type and cache attributes of the allocated buffer.
+ * @size: size requested from user space, in bytes; this size is
+ *	page-aligned internally.
+ * @kv_addr: kernel virtual address of the allocated memory region.
+ * @dma_addr: bus address (accessed by dma) of the allocated memory region.
+ *	- this is a physical address without an IOMMU and a device
+ *	address with an IOMMU.
+ * @pages: array of backing pages.
+ * @sgt: imported sg_table.
+ * @dmabuf: the dma-buf backing this memory object.
+ * @owner: set when this memory was allocated internally by the driver.
+ */
+struct rknpu_mem_object {
+	unsigned long flags;
+	unsigned long size;
+	void __iomem *kv_addr;
+	dma_addr_t dma_addr;
+	struct page **pages;
+	struct sg_table *sgt;
+	struct dma_buf *dmabuf;
+	unsigned int owner;
+};
+
+struct rknpu_device;
+
+int rknpu_mem_create_ioctl(struct rknpu_device *rknpu_dev, unsigned long data);
+int rknpu_mem_destroy_ioctl(struct rknpu_device *rknpu_dev, unsigned long data);
+int rknpu_mem_sync_ioctl(struct rknpu_device *rknpu_dev, unsigned long data);
+
+#endif
diff --git a/drivers/rknpu/include/rknpu_mm.h b/drivers/rknpu/include/rknpu_mm.h
new file mode 100644
index 0000000000000000000000000000000000000000..84a8c393f1be326ce643766f8afa4494bd159efc
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_mm.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_MM_H
+#define __LINUX_RKNPU_MM_H
+
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/iommu.h>
+#include <linux/iova.h>
+
+#include "rknpu_drv.h"
+
+struct rknpu_mm {
+	void *bitmap;
+	struct mutex lock;
+	unsigned int chunk_size;
+	unsigned int total_chunks;
+	unsigned int free_chunks;
+};
+
+struct rknpu_mm_obj {
+	uint32_t range_start;
+	uint32_t range_end;
+};
+
+int rknpu_mm_create(unsigned int mem_size, unsigned int chunk_size,
+		    struct rknpu_mm **mm);
+
+void rknpu_mm_destroy(struct rknpu_mm *mm);
+
+int rknpu_mm_alloc(struct rknpu_mm *mm, unsigned int size,
+		   struct rknpu_mm_obj **mm_obj);
+
+int rknpu_mm_free(struct rknpu_mm *mm, struct rknpu_mm_obj *mm_obj);
+
+int rknpu_mm_dump(struct seq_file *m, void *data);
+
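+/*
+ * Typical allocator lifecycle (a sketch; sizes are illustrative only):
+ *
+ *   struct rknpu_mm *mm;
+ *   struct rknpu_mm_obj *obj;
+ *
+ *   rknpu_mm_create(SZ_1M, SZ_4K, &mm);   // bitmap of 256 4 KiB chunks
+ *   rknpu_mm_alloc(mm, SZ_64K, &obj);     // fills obj->range_start/range_end
+ *   rknpu_mm_free(mm, obj);
+ *   rknpu_mm_destroy(mm);
+ */
+
+/*
+ * The definitions below mirror the kernel's private iommu_dma_cookie layout
+ * in drivers/iommu/dma-iommu.c so the driver can reach the iova_domain
+ * behind an iommu_domain; they must match the running kernel.
+ */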
+enum iommu_dma_cookie_type {
+	IOMMU_DMA_IOVA_COOKIE,
+	IOMMU_DMA_MSI_COOKIE,
+};
+
+struct rknpu_iommu_dma_cookie {
+	enum iommu_dma_cookie_type type;
+
+	/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
+	struct iova_domain iovad;
+};
+
+dma_addr_t rknpu_iommu_dma_alloc_iova(struct iommu_domain *domain, size_t size,
+				      u64 dma_limit, struct device *dev);
+
+void rknpu_iommu_dma_free_iova(struct rknpu_iommu_dma_cookie *cookie,
+			       dma_addr_t iova, size_t size);
+
+#endif
diff --git a/drivers/rknpu/include/rknpu_reset.h b/drivers/rknpu/include/rknpu_reset.h
new file mode 100644
index 0000000000000000000000000000000000000000..b80e29b321b0ef857328bf83827b5c00475ef847
--- /dev/null
+++ b/drivers/rknpu/include/rknpu_reset.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_RESET_H
+#define __LINUX_RKNPU_RESET_H
+
+#include <linux/reset.h>
+
+#include "rknpu_drv.h"
+
+int rknpu_reset_get(struct rknpu_device *rknpu_dev);
+
+int rknpu_soft_reset(struct rknpu_device *rknpu_dev);
+
+#endif
diff --git a/drivers/rknpu/rknpu_debugger.c b/drivers/rknpu/rknpu_debugger.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e4b5239bbf5fead554b9ac576f3d2db47f74fca
--- /dev/null
+++ b/drivers/rknpu/rknpu_debugger.c
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/syscalls.h>
+#include <linux/debugfs.h>
+#include <linux/proc_fs.h>
+#include <linux/devfreq.h>
+#include <linux/clk.h>
+#include <asm/div64.h>
+
+#ifndef FPGA_PLATFORM
+#ifdef CONFIG_PM_DEVFREQ
+#include <../drivers/devfreq/governor.h>
+#endif
+#endif
+
+#include "rknpu_drv.h"
+#include "rknpu_mm.h"
+#include "rknpu_reset.h"
+#include "rknpu_debugger.h"
+
+#define RKNPU_DEBUGGER_ROOT_NAME "rknpu"
+
+#if defined(CONFIG_ROCKCHIP_RKNPU_DEBUG_FS) ||                                 \
+	defined(CONFIG_ROCKCHIP_RKNPU_PROC_FS)
+static int rknpu_version_show(struct seq_file *m, void *data)
+{
+	seq_printf(m, "%s: v%d.%d.%d\n", DRIVER_DESC, DRIVER_MAJOR,
+		   DRIVER_MINOR, DRIVER_PATCHLEVEL);
+
+	return 0;
+}
+
+static int rknpu_load_show(struct seq_file *m, void *data)
+{
+	struct rknpu_debugger_node *node = m->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+	struct rknpu_subcore_data *subcore_data = NULL;
+	unsigned long flags;
+	int i;
+	int load;
+	uint64_t busy_time_total, div_value;
+
+	seq_puts(m, "NPU load: ");
+	for (i = 0; i < rknpu_dev->config->num_irqs; i++) {
+		subcore_data = &rknpu_dev->subcore_datas[i];
+
+		if (rknpu_dev->config->num_irqs > 1)
+			seq_printf(m, " Core%d: ", i);
+
+		spin_lock_irqsave(&rknpu_dev->irq_lock, flags);
+
+		busy_time_total = subcore_data->timer.busy_time_record;
+
+		spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+
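+		/*
+		 * busy_time_record appears to be accumulated in microseconds
+		 * while RKNPU_LOAD_INTERVAL is in nanoseconds, so dividing by
+		 * interval / 100000 (== interval_us / 100) turns it into a
+		 * percentage.
+		 */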
+		div_value = (RKNPU_LOAD_INTERVAL / 100000);
+		do_div(busy_time_total, div_value);
+		load = busy_time_total;
+
+		if (rknpu_dev->config->num_irqs > 1)
+			seq_printf(m, "%2.d%%,", load);
+		else
+			seq_printf(m, "%2.d%%", load);
+	}
+	seq_puts(m, "\n");
+
+	return 0;
+}
+
+static int rknpu_power_show(struct seq_file *m, void *data)
+{
+	struct rknpu_debugger_node *node = m->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+
+	if (atomic_read(&rknpu_dev->power_refcount) > 0)
+		seq_puts(m, "on\n");
+	else
+		seq_puts(m, "off\n");
+
+	return 0;
+}
+
+static ssize_t rknpu_power_set(struct file *file, const char __user *ubuf,
+			       size_t len, loff_t *offp)
+{
+	struct seq_file *priv = file->private_data;
+	struct rknpu_debugger_node *node = priv->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+	char buf[8];
+
+	if (len == 0 || len > sizeof(buf) - 1)
+		return -EINVAL;
+	if (copy_from_user(buf, ubuf, len))
+		return -EFAULT;
+	buf[len - 1] = '\0';
+
+	if (strcmp(buf, "on") == 0) {
+		atomic_inc(&rknpu_dev->cmdline_power_refcount);
+		rknpu_power_get(rknpu_dev);
+		LOG_INFO("rknpu power is on!");
+	} else if (strcmp(buf, "off") == 0) {
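+		/*
+		 * Fold the references still held through this node back into
+		 * power_refcount and drop the final vote via rknpu_power_put().
+		 */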
+		if (atomic_read(&rknpu_dev->power_refcount) > 0 &&
+		    atomic_dec_if_positive(
+			    &rknpu_dev->cmdline_power_refcount) >= 0) {
+			atomic_sub(
+				atomic_read(&rknpu_dev->cmdline_power_refcount),
+				&rknpu_dev->power_refcount);
+			atomic_set(&rknpu_dev->cmdline_power_refcount, 0);
+			rknpu_power_put(rknpu_dev);
+		}
+		if (atomic_read(&rknpu_dev->power_refcount) <= 0)
+			LOG_INFO("rknpu power is off!");
+	} else {
+		LOG_ERROR("invalid parameter for rknpu power node!");
+	}
+
+	return len;
+}
+
+static int rknpu_power_put_delay_show(struct seq_file *m, void *data)
+{
+	struct rknpu_debugger_node *node = m->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+
+	seq_printf(m, "%lu\n", rknpu_dev->power_put_delay);
+
+	return 0;
+}
+
+static ssize_t rknpu_power_put_delay_set(struct file *file,
+					 const char __user *ubuf, size_t len,
+					 loff_t *offp)
+{
+	struct seq_file *priv = file->private_data;
+	struct rknpu_debugger_node *node = priv->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+	char buf[16];
+	unsigned long power_put_delay = 0;
+	int ret = 0;
+
+	if (len == 0 || len > sizeof(buf) - 1)
+		return -EINVAL;
+	if (copy_from_user(buf, ubuf, len))
+		return -EFAULT;
+	buf[len - 1] = '\0';
+
+	ret = kstrtoul(buf, 10, &power_put_delay);
+	if (ret) {
+		LOG_ERROR("failed to parse power put delay string: %s\n", buf);
+		return ret;
+	}
+
+	rknpu_dev->power_put_delay = power_put_delay;
+
+	LOG_INFO("set rknpu power put delay time %lums\n",
+		 rknpu_dev->power_put_delay);
+
+	return len;
+}
+
+static int rknpu_freq_show(struct seq_file *m, void *data)
+{
+	struct rknpu_debugger_node *node = m->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+	unsigned long current_freq = 0;
+
+	rknpu_power_get(rknpu_dev);
+
+	current_freq = clk_get_rate(rknpu_dev->clks[0].clk);
+
+	rknpu_power_put(rknpu_dev);
+
+	seq_printf(m, "%lu\n", current_freq);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_DEVFREQ
+static ssize_t rknpu_freq_set(struct file *file, const char __user *ubuf,
+			      size_t len, loff_t *offp)
+{
+	struct seq_file *priv = file->private_data;
+	struct rknpu_debugger_node *node = priv->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+	unsigned long current_freq = 0;
+	char buf[16];
+	unsigned long freq = 0;
+	int ret = 0;
+
+	if (len == 0 || len > sizeof(buf) - 1)
+		return -EINVAL;
+	if (copy_from_user(buf, ubuf, len))
+		return -EFAULT;
+	buf[len - 1] = '\0';
+
+	ret = kstrtoul(buf, 10, &freq);
+	if (ret) {
+		LOG_ERROR("failed to parse freq string: %s\n", buf);
+		return ret;
+	}
+
+	if (!rknpu_dev->devfreq)
+		return -ENODEV;
+
+	rknpu_power_get(rknpu_dev);
+
+	current_freq = clk_get_rate(rknpu_dev->clks[0].clk);
+	if (freq != current_freq) {
+		rknpu_dev->ondemand_freq = freq;
+		mutex_lock(&rknpu_dev->devfreq->lock);
+		update_devfreq(rknpu_dev->devfreq);
+		mutex_unlock(&rknpu_dev->devfreq->lock);
+	}
+
+	rknpu_power_put(rknpu_dev);
+
+	return len;
+}
+#else
+static ssize_t rknpu_freq_set(struct file *file, const char __user *ubuf,
+			      size_t len, loff_t *offp)
+{
+	return -ENODEV;
+}
+#endif
+
+static int rknpu_volt_show(struct seq_file *m, void *data)
+{
+	struct rknpu_debugger_node *node = m->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+	unsigned long current_volt = 0;
+
+	current_volt = regulator_get_voltage(rknpu_dev->vdd);
+
+	seq_printf(m, "%lu\n", current_volt);
+
+	return 0;
+}
+
+static int rknpu_reset_show(struct seq_file *m, void *data)
+{
+	struct rknpu_debugger_node *node = m->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+
+	if (!rknpu_dev->bypass_soft_reset)
+		seq_puts(m, "on\n");
+	else
+		seq_puts(m, "off\n");
+
+	return 0;
+}
+
+static ssize_t rknpu_reset_set(struct file *file, const char __user *ubuf,
+			       size_t len, loff_t *offp)
+{
+	struct seq_file *priv = file->private_data;
+	struct rknpu_debugger_node *node = priv->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+	char buf[8];
+
+	if (len == 0 || len > sizeof(buf) - 1)
+		return -EINVAL;
+	if (copy_from_user(buf, ubuf, len))
+		return -EFAULT;
+	buf[len - 1] = '\0';
+
+	if (strcmp(buf, "1") == 0 &&
+	    atomic_read(&rknpu_dev->power_refcount) > 0)
+		rknpu_soft_reset(rknpu_dev);
+	else if (strcmp(buf, "on") == 0)
+		rknpu_dev->bypass_soft_reset = 0;
+	else if (strcmp(buf, "off") == 0)
+		rknpu_dev->bypass_soft_reset = 1;
+
+	return len;
+}
+
+static struct rknpu_debugger_list rknpu_debugger_root_list[] = {
+	{ "version", rknpu_version_show, NULL, NULL },
+	{ "load", rknpu_load_show, NULL, NULL },
+	{ "power", rknpu_power_show, rknpu_power_set, NULL },
+	{ "freq", rknpu_freq_show, rknpu_freq_set, NULL },
+	{ "volt", rknpu_volt_show, NULL, NULL },
+	{ "delayms", rknpu_power_put_delay_show, rknpu_power_put_delay_set,
+	  NULL },
+	{ "reset", rknpu_reset_show, rknpu_reset_set, NULL },
+#ifdef CONFIG_ROCKCHIP_RKNPU_SRAM
+	{ "mm", rknpu_mm_dump, NULL, NULL },
+#endif
+};
+
+static ssize_t rknpu_debugger_write(struct file *file, const char __user *ubuf,
+				    size_t len, loff_t *offp)
+{
+	struct seq_file *priv = file->private_data;
+	struct rknpu_debugger_node *node = priv->private;
+
+	if (node->info_ent->write)
+		return node->info_ent->write(file, ubuf, len, offp);
+	else
+		return len;
+}
+
+static int rknpu_debugfs_open(struct inode *inode, struct file *file)
+{
+	struct rknpu_debugger_node *node = inode->i_private;
+
+	return single_open(file, node->info_ent->show, node);
+}
+
+static const struct file_operations rknpu_debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = rknpu_debugfs_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+	.write = rknpu_debugger_write,
+};
+#endif /* #if defined(CONFIG_ROCKCHIP_RKNPU_DEBUG_FS) || defined(CONFIG_ROCKCHIP_RKNPU_PROC_FS) */
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DEBUG_FS
+static int rknpu_debugfs_remove_files(struct rknpu_debugger *debugger)
+{
+	struct rknpu_debugger_node *pos, *q;
+	struct list_head *entry_list;
+
+	mutex_lock(&debugger->debugfs_lock);
+
+	/* Delete debugfs entry list */
+	entry_list = &debugger->debugfs_entry_list;
+	list_for_each_entry_safe(pos, q, entry_list, list) {
+		if (pos->dent == NULL)
+			continue;
+		list_del(&pos->list);
+		kfree(pos);
+		pos = NULL;
+	}
+
+	/* Delete all debugfs node in this directory */
+	debugfs_remove_recursive(debugger->debugfs_dir);
+	debugger->debugfs_dir = NULL;
+
+	mutex_unlock(&debugger->debugfs_lock);
+
+	return 0;
+}
+
+static int rknpu_debugfs_create_files(const struct rknpu_debugger_list *files,
+				      int count, struct dentry *root,
+				      struct rknpu_debugger *debugger)
+{
+	int i;
+	struct dentry *ent;
+	struct rknpu_debugger_node *tmp;
+
+	for (i = 0; i < count; i++) {
+		tmp = kmalloc(sizeof(struct rknpu_debugger_node), GFP_KERNEL);
+		if (tmp == NULL) {
+			LOG_ERROR(
+				"Cannot alloc node path /sys/kernel/debug/%pd/%s\n",
+				root, files[i].name);
+			goto MALLOC_FAIL;
+		}
+
+		tmp->info_ent = &files[i];
+		tmp->debugger = debugger;
+
+		ent = debugfs_create_file(files[i].name, S_IFREG | S_IRUGO,
+					  root, tmp, &rknpu_debugfs_fops);
+		if (IS_ERR_OR_NULL(ent)) {
+			LOG_ERROR("Cannot create /sys/kernel/debug/%pd/%s\n",
+				  root, files[i].name);
+			goto CREATE_FAIL;
+		}
+
+		tmp->dent = ent;
+
+		mutex_lock(&debugger->debugfs_lock);
+		list_add_tail(&tmp->list, &debugger->debugfs_entry_list);
+		mutex_unlock(&debugger->debugfs_lock);
+	}
+
+	return 0;
+
+CREATE_FAIL:
+	kfree(tmp);
+MALLOC_FAIL:
+	rknpu_debugfs_remove_files(debugger);
+
+	return -1;
+}
+
+static int rknpu_debugfs_remove(struct rknpu_debugger *debugger)
+{
+	rknpu_debugfs_remove_files(debugger);
+
+	return 0;
+}
+
+static int rknpu_debugfs_init(struct rknpu_debugger *debugger)
+{
+	int ret;
+
+	debugger->debugfs_dir =
+		debugfs_create_dir(RKNPU_DEBUGGER_ROOT_NAME, NULL);
+	if (IS_ERR_OR_NULL(debugger->debugfs_dir)) {
+		LOG_ERROR("failed on mkdir /sys/kernel/debug/%s\n",
+			  RKNPU_DEBUGGER_ROOT_NAME);
+		debugger->debugfs_dir = NULL;
+		return -EIO;
+	}
+
+	ret = rknpu_debugfs_create_files(rknpu_debugger_root_list,
+					 ARRAY_SIZE(rknpu_debugger_root_list),
+					 debugger->debugfs_dir, debugger);
+	if (ret) {
+		LOG_ERROR(
+			"Could not install rknpu_debugger_root_list debugfs\n");
+		goto CREATE_FAIL;
+	}
+
+	return 0;
+
+CREATE_FAIL:
+	rknpu_debugfs_remove(debugger);
+
+	return ret;
+}
+#endif /* #ifdef CONFIG_ROCKCHIP_RKNPU_DEBUG_FS */
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_PROC_FS
+static int rknpu_procfs_open(struct inode *inode, struct file *file)
+{
+#if KERNEL_VERSION(5, 17, 0) <= LINUX_VERSION_CODE
+	struct rknpu_debugger_node *node = pde_data(inode);
+#else
+	struct rknpu_debugger_node *node = PDE_DATA(inode);
+#endif
+
+	return single_open(file, node->info_ent->show, node);
+}
+
+static const struct proc_ops rknpu_procfs_fops = {
+	.proc_open = rknpu_procfs_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+	.proc_write = rknpu_debugger_write,
+};
+
+static int rknpu_procfs_remove_files(struct rknpu_debugger *debugger)
+{
+	struct rknpu_debugger_node *pos, *q;
+	struct list_head *entry_list;
+
+	mutex_lock(&debugger->procfs_lock);
+
+	/* Delete procfs entry list */
+	entry_list = &debugger->procfs_entry_list;
+	list_for_each_entry_safe(pos, q, entry_list, list) {
+		if (pos->pent == NULL)
+			continue;
+		list_del(&pos->list);
+		kfree(pos);
+		pos = NULL;
+	}
+
+	/* Delete all procfs node in this directory */
+	proc_remove(debugger->procfs_dir);
+	debugger->procfs_dir = NULL;
+
+	mutex_unlock(&debugger->procfs_lock);
+
+	return 0;
+}
+
+static int rknpu_procfs_create_files(const struct rknpu_debugger_list *files,
+				     int count, struct proc_dir_entry *root,
+				     struct rknpu_debugger *debugger)
+{
+	int i;
+	struct proc_dir_entry *ent;
+	struct rknpu_debugger_node *tmp;
+
+	for (i = 0; i < count; i++) {
+		tmp = kmalloc(sizeof(struct rknpu_debugger_node), GFP_KERNEL);
+		if (tmp == NULL) {
+			LOG_ERROR("Cannot alloc node path for /proc/%s/%s\n",
+				  RKNPU_DEBUGGER_ROOT_NAME, files[i].name);
+			goto MALLOC_FAIL;
+		}
+
+		tmp->info_ent = &files[i];
+		tmp->debugger = debugger;
+
+		ent = proc_create_data(files[i].name, S_IFREG | S_IRUGO, root,
+				       &rknpu_procfs_fops, tmp);
+		if (!ent) {
+			LOG_ERROR("Cannot create /proc/%s/%s\n",
+				  RKNPU_DEBUGGER_ROOT_NAME, files[i].name);
+			goto CREATE_FAIL;
+		}
+
+		tmp->pent = ent;
+
+		mutex_lock(&debugger->procfs_lock);
+		list_add_tail(&tmp->list, &debugger->procfs_entry_list);
+		mutex_unlock(&debugger->procfs_lock);
+	}
+
+	return 0;
+
+CREATE_FAIL:
+	kfree(tmp);
+MALLOC_FAIL:
+	rknpu_procfs_remove_files(debugger);
+	return -1;
+}
+
+static int rknpu_procfs_remove(struct rknpu_debugger *debugger)
+{
+	rknpu_procfs_remove_files(debugger);
+
+	return 0;
+}
+
+static int rknpu_procfs_init(struct rknpu_debugger *debugger)
+{
+	int ret;
+
+	debugger->procfs_dir = proc_mkdir(RKNPU_DEBUGGER_ROOT_NAME, NULL);
+	if (IS_ERR_OR_NULL(debugger->procfs_dir)) {
+		pr_err("failed on mkdir /proc/%s\n", RKNPU_DEBUGGER_ROOT_NAME);
+		debugger->procfs_dir = NULL;
+		return -EIO;
+	}
+
+	ret = rknpu_procfs_create_files(rknpu_debugger_root_list,
+					ARRAY_SIZE(rknpu_debugger_root_list),
+					debugger->procfs_dir, debugger);
+	if (ret) {
+		pr_err("Could not install rknpu_debugger_root_list procfs\n");
+		goto CREATE_FAIL;
+	}
+
+	return 0;
+
+CREATE_FAIL:
+	rknpu_procfs_remove(debugger);
+
+	return ret;
+}
+#endif /* #ifdef CONFIG_ROCKCHIP_RKNPU_PROC_FS */
+
+int rknpu_debugger_init(struct rknpu_device *rknpu_dev)
+{
+#ifdef CONFIG_ROCKCHIP_RKNPU_DEBUG_FS
+	mutex_init(&rknpu_dev->debugger.debugfs_lock);
+	INIT_LIST_HEAD(&rknpu_dev->debugger.debugfs_entry_list);
+	rknpu_debugfs_init(&rknpu_dev->debugger);
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_PROC_FS
+	mutex_init(&rknpu_dev->debugger.procfs_lock);
+	INIT_LIST_HEAD(&rknpu_dev->debugger.procfs_entry_list);
+	rknpu_procfs_init(&rknpu_dev->debugger);
+#endif
+	return 0;
+}
+
+int rknpu_debugger_remove(struct rknpu_device *rknpu_dev)
+{
+#ifdef CONFIG_ROCKCHIP_RKNPU_DEBUG_FS
+	rknpu_debugfs_remove(&rknpu_dev->debugger);
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_PROC_FS
+	rknpu_procfs_remove(&rknpu_dev->debugger);
+#endif
+	return 0;
+}
diff --git a/drivers/rknpu/rknpu_drv.c b/drivers/rknpu/rknpu_drv.c
new file mode 100644
index 0000000000000000000000000000000000000000..7690c5a5804892aa3a918bab75dc9ea19a22e479
--- /dev/null
+++ b/drivers/rknpu/rknpu_drv.c
@@ -0,0 +1,1491 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/dma-mapping.h>
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/time.h>
+#include <linux/uaccess.h>
+#include <linux/ktime.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <linux/devfreq_cooling.h>
+#include <linux/regmap.h>
+#include <linux/of_address.h>
+
+#ifndef FPGA_PLATFORM
+#include <soc/rockchip/rockchip_opp_select.h>
+#ifdef CONFIG_PM_DEVFREQ
+#include <../drivers/devfreq/governor.h>
+#endif
+#endif
+
+#include "rknpu_ioctl.h"
+#include "rknpu_reset.h"
+#include "rknpu_fence.h"
+#include "rknpu_drv.h"
+#include "rknpu_gem.h"
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+#include <drm/drm_device.h>
+#include <drm/drm_ioctl.h>
+#include <drm/drm_file.h>
+#include <drm/drm_drv.h>
+#include "rknpu_gem.h"
+#endif
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+#include <linux/rk-dma-heap.h>
+#include "rknpu_mem.h"
+#endif
+
+#define POWER_DOWN_FREQ 200000000
+#define NPU_MMU_DISABLED_POLL_PERIOD_US 1000
+#define NPU_MMU_DISABLED_POLL_TIMEOUT_US 20000
+
+static int bypass_irq_handler;
+module_param(bypass_irq_handler, int, 0644);
+MODULE_PARM_DESC(bypass_irq_handler,
+		 "set to 1 to bypass the RKNPU irq handler (disabled by default)");
+
+static int bypass_soft_reset;
+module_param(bypass_soft_reset, int, 0644);
+MODULE_PARM_DESC(bypass_soft_reset,
+		 "set to 1 to bypass the RKNPU soft reset (disabled by default)");
+
+struct rknpu_irqs_data {
+	const char *name;
+	irqreturn_t (*irq_hdl)(int irq, void *ctx);
+};
+
+static const struct rknpu_irqs_data rknpu_irqs[] = {
+	{ "npu_irq", rknpu_core0_irq_handler }
+};
+
+static const struct rknpu_irqs_data rk3588_npu_irqs[] = {
+	{ "npu0_irq", rknpu_core0_irq_handler },
+	{ "npu1_irq", rknpu_core1_irq_handler },
+	{ "npu2_irq", rknpu_core2_irq_handler }
+};
+
+static const struct rknpu_irqs_data rv110x_npu_irqs[] = {
+	{ "npu_irq", rknpu_core0_irq_handler }
+};
+
+static const struct rknpu_reset_data rknpu_resets[] = {
+	{ "srst_a", "srst_h" }
+};
+
+static const struct rknpu_reset_data rk3588_npu_resets[] = {
+	{ "srst_a0", "srst_h0" },
+	{ "srst_a1", "srst_h1" },
+	{ "srst_a2", "srst_h2" }
+};
+
+static const struct rknpu_reset_data rv110x_npu_resets[] = {
+	{ "srst_a", "srst_h" }
+};
+
+static const struct rknpu_config rk356x_rknpu_config = {
+	.bw_priority_addr = 0xfe180008,
+	.bw_priority_length = 0x10,
+	.dma_mask = DMA_BIT_MASK(32),
+	.pc_data_amount_scale = 1,
+	.pc_task_number_bits = 12,
+	.pc_task_number_mask = 0xfff,
+	.pc_task_status_offset = 0x3c,
+	.bw_enable = 1,
+	.irqs = rknpu_irqs,
+	.resets = rknpu_resets,
+	.num_irqs = ARRAY_SIZE(rknpu_irqs),
+	.num_resets = ARRAY_SIZE(rknpu_resets)
+};
+
+static const struct rknpu_config rk3588_rknpu_config = {
+	.bw_priority_addr = 0x0,
+	.bw_priority_length = 0x0,
+	.dma_mask = DMA_BIT_MASK(40),
+	.pc_data_amount_scale = 2,
+	.pc_task_number_bits = 12,
+	.pc_task_number_mask = 0xfff,
+	.pc_task_status_offset = 0x3c,
+	.bw_enable = 0,
+	.irqs = rk3588_npu_irqs,
+	.resets = rk3588_npu_resets,
+	.num_irqs = ARRAY_SIZE(rk3588_npu_irqs),
+	.num_resets = ARRAY_SIZE(rk3588_npu_resets)
+};
+
+static const struct rknpu_config rv1106_rknpu_config = {
+	.bw_priority_addr = 0x0,
+	.bw_priority_length = 0x0,
+	.dma_mask = DMA_BIT_MASK(32),
+	.pc_data_amount_scale = 2,
+	.pc_task_number_bits = 16,
+	.pc_task_number_mask = 0xffff,
+	.pc_task_status_offset = 0x3c,
+	.bw_enable = 1,
+	.irqs = rv110x_npu_irqs,
+	.resets = rv110x_npu_resets,
+	.num_irqs = ARRAY_SIZE(rv110x_npu_irqs),
+	.num_resets = ARRAY_SIZE(rv110x_npu_resets)
+};
+
+static const struct rknpu_config rk3562_rknpu_config = {
+	.bw_priority_addr = 0x0,
+	.bw_priority_length = 0x0,
+	.dma_mask = DMA_BIT_MASK(40),
+	.pc_data_amount_scale = 2,
+	.pc_task_number_bits = 16,
+	.pc_task_number_mask = 0xffff,
+	.pc_task_status_offset = 0x48,
+	.bw_enable = 1,
+	.irqs = rknpu_irqs,
+	.resets = rknpu_resets,
+	.num_irqs = ARRAY_SIZE(rknpu_irqs),
+	.num_resets = ARRAY_SIZE(rknpu_resets)
+};
+
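+/* In each config above, pc_task_number_mask == (1 << pc_task_number_bits) - 1. */
+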
+/* driver probe and init */
+static const struct of_device_id rknpu_of_match[] = {
+	{
+		.compatible = "rockchip,rknpu",
+		.data = &rk356x_rknpu_config,
+	},
+	{
+		.compatible = "rockchip,rk3568-rknpu",
+		.data = &rk356x_rknpu_config,
+	},
+	{
+		.compatible = "rockchip,rk3588-rknpu",
+		.data = &rk3588_rknpu_config,
+	},
+	{
+		.compatible = "rockchip,rv1106-rknpu",
+		.data = &rv1106_rknpu_config,
+	},
+	{
+		.compatible = "rockchip,rk3562-rknpu",
+		.data = &rk3562_rknpu_config,
+	},
+	{},
+};
+
+static int rknpu_get_drv_version(uint32_t *version)
+{
+	*version = RKNPU_GET_DRV_VERSION_CODE(DRIVER_MAJOR, DRIVER_MINOR,
+					      DRIVER_PATCHLEVEL);
+	return 0;
+}
+
+static int rknpu_power_on(struct rknpu_device *rknpu_dev);
+static int rknpu_power_off(struct rknpu_device *rknpu_dev);
+
+static void rknpu_power_off_delay_work(struct work_struct *power_off_work)
+{
+	struct rknpu_device *rknpu_dev =
+		container_of(to_delayed_work(power_off_work),
+			     struct rknpu_device, power_off_work);
+	mutex_lock(&rknpu_dev->power_lock);
+	if (atomic_dec_if_positive(&rknpu_dev->power_refcount) == 0)
+		rknpu_power_off(rknpu_dev);
+	mutex_unlock(&rknpu_dev->power_lock);
+}
+
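+/*
+ * Power handling is reference counted: the first rknpu_power_get() powers the
+ * NPU on and the last matching put powers it off, either immediately via
+ * rknpu_power_put() or, through rknpu_power_put_delay(), after power_put_delay
+ * milliseconds so that back-to-back jobs reuse the powered-on state.
+ */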
+int rknpu_power_get(struct rknpu_device *rknpu_dev)
+{
+	int ret = 0;
+
+	cancel_delayed_work(&rknpu_dev->power_off_work);
+	mutex_lock(&rknpu_dev->power_lock);
+	if (atomic_inc_return(&rknpu_dev->power_refcount) == 1)
+		ret = rknpu_power_on(rknpu_dev);
+	mutex_unlock(&rknpu_dev->power_lock);
+
+	return ret;
+}
+
+int rknpu_power_put(struct rknpu_device *rknpu_dev)
+{
+	int ret = 0;
+
+	mutex_lock(&rknpu_dev->power_lock);
+	if (atomic_dec_if_positive(&rknpu_dev->power_refcount) == 0)
+		ret = rknpu_power_off(rknpu_dev);
+	mutex_unlock(&rknpu_dev->power_lock);
+
+	return ret;
+}
+
+static int rknpu_power_put_delay(struct rknpu_device *rknpu_dev)
+{
+	mutex_lock(&rknpu_dev->power_lock);
+	if (atomic_read(&rknpu_dev->power_refcount) == 1)
+		queue_delayed_work(
+			rknpu_dev->power_off_wq, &rknpu_dev->power_off_work,
+			msecs_to_jiffies(rknpu_dev->power_put_delay));
+	else
+		atomic_dec_if_positive(&rknpu_dev->power_refcount);
+	mutex_unlock(&rknpu_dev->power_lock);
+	return 0;
+}
+
+static int rknpu_action(struct rknpu_device *rknpu_dev,
+			struct rknpu_action *args)
+{
+	int ret = -EINVAL;
+
+	switch (args->flags) {
+	case RKNPU_GET_HW_VERSION:
+		ret = rknpu_get_hw_version(rknpu_dev, &args->value);
+		break;
+	case RKNPU_GET_DRV_VERSION:
+		ret = rknpu_get_drv_version(&args->value);
+		break;
+	case RKNPU_GET_FREQ:
+#ifndef FPGA_PLATFORM
+		args->value = clk_get_rate(rknpu_dev->clks[0].clk);
+#endif
+		ret = 0;
+		break;
+	case RKNPU_SET_FREQ:
+		break;
+	case RKNPU_GET_VOLT:
+#ifndef FPGA_PLATFORM
+		/* vdd is optional, guard against a NULL regulator */
+		args->value = rknpu_dev->vdd ?
+				      regulator_get_voltage(rknpu_dev->vdd) :
+				      0;
+#endif
+		ret = 0;
+		break;
+	case RKNPU_SET_VOLT:
+		break;
+	case RKNPU_ACT_RESET:
+		ret = rknpu_soft_reset(rknpu_dev);
+		break;
+	case RKNPU_GET_BW_PRIORITY:
+		ret = rknpu_get_bw_priority(rknpu_dev, &args->value, NULL,
+					    NULL);
+		break;
+	case RKNPU_SET_BW_PRIORITY:
+		ret = rknpu_set_bw_priority(rknpu_dev, args->value, 0, 0);
+		break;
+	case RKNPU_GET_BW_EXPECT:
+		ret = rknpu_get_bw_priority(rknpu_dev, NULL, &args->value,
+					    NULL);
+		break;
+	case RKNPU_SET_BW_EXPECT:
+		ret = rknpu_set_bw_priority(rknpu_dev, 0, args->value, 0);
+		break;
+	case RKNPU_GET_BW_TW:
+		ret = rknpu_get_bw_priority(rknpu_dev, NULL, NULL,
+					    &args->value);
+		break;
+	case RKNPU_SET_BW_TW:
+		ret = rknpu_set_bw_priority(rknpu_dev, 0, 0, args->value);
+		break;
+	case RKNPU_ACT_CLR_TOTAL_RW_AMOUNT:
+		ret = rknpu_clear_rw_amount(rknpu_dev);
+		break;
+	case RKNPU_GET_DT_WR_AMOUNT:
+		ret = rknpu_get_rw_amount(rknpu_dev, &args->value, NULL, NULL);
+		break;
+	case RKNPU_GET_DT_RD_AMOUNT:
+		ret = rknpu_get_rw_amount(rknpu_dev, NULL, &args->value, NULL);
+		break;
+	case RKNPU_GET_WT_RD_AMOUNT:
+		ret = rknpu_get_rw_amount(rknpu_dev, NULL, NULL, &args->value);
+		break;
+	case RKNPU_GET_TOTAL_RW_AMOUNT:
+		ret = rknpu_get_total_rw_amount(rknpu_dev, &args->value);
+		break;
+	case RKNPU_GET_IOMMU_EN:
+		args->value = rknpu_dev->iommu_en;
+		ret = 0;
+		break;
+	case RKNPU_SET_PROC_NICE:
+		set_user_nice(current, *(int32_t *)&args->value);
+		ret = 0;
+		break;
+	case RKNPU_GET_TOTAL_SRAM_SIZE:
+		if (rknpu_dev->sram_mm)
+			args->value = rknpu_dev->sram_mm->total_chunks *
+				      rknpu_dev->sram_mm->chunk_size;
+		else
+			args->value = 0;
+		ret = 0;
+		break;
+	case RKNPU_GET_FREE_SRAM_SIZE:
+		if (rknpu_dev->sram_mm)
+			args->value = rknpu_dev->sram_mm->free_chunks *
+				      rknpu_dev->sram_mm->chunk_size;
+		else
+			args->value = 0;
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+static int rknpu_open(struct inode *inode, struct file *file)
+{
+	return nonseekable_open(inode, file);
+}
+
+static int rknpu_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static int rknpu_action_ioctl(struct rknpu_device *rknpu_dev,
+			      unsigned long data)
+{
+	struct rknpu_action args;
+	int ret = -EINVAL;
+
+	if (unlikely(copy_from_user(&args, (struct rknpu_action __user *)data,
+				    sizeof(struct rknpu_action)))) {
+		LOG_ERROR("%s: copy_from_user failed\n", __func__);
+		ret = -EFAULT;
+		return ret;
+	}
+
+	ret = rknpu_action(rknpu_dev, &args);
+
+	if (unlikely(copy_to_user((struct rknpu_action __user *)data, &args,
+				  sizeof(struct rknpu_action)))) {
+		LOG_ERROR("%s: copy_to_user failed\n", __func__);
+		ret = -EFAULT;
+		return ret;
+	}
+
+	return ret;
+}
+
+static long rknpu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long ret = -EINVAL;
+	struct rknpu_device *rknpu_dev =
+		container_of(file->private_data, struct rknpu_device, miscdev);
+
+	rknpu_power_get(rknpu_dev);
+
+	switch (cmd) {
+	case IOCTL_RKNPU_ACTION:
+		ret = rknpu_action_ioctl(rknpu_dev, arg);
+		break;
+	case IOCTL_RKNPU_SUBMIT:
+		ret = rknpu_submit_ioctl(rknpu_dev, arg);
+		break;
+	case IOCTL_RKNPU_MEM_CREATE:
+		ret = rknpu_mem_create_ioctl(rknpu_dev, arg);
+		break;
+	case IOCTL_RKNPU_MEM_MAP:
+		break;
+	case IOCTL_RKNPU_MEM_DESTROY:
+		ret = rknpu_mem_destroy_ioctl(rknpu_dev, arg);
+		break;
+	case IOCTL_RKNPU_MEM_SYNC:
+		ret = rknpu_mem_sync_ioctl(rknpu_dev, arg);
+		break;
+	default:
+		break;
+	}
+
+	rknpu_power_put_delay(rknpu_dev);
+
+	return ret;
+}
+
+const struct file_operations rknpu_fops = {
+	.owner = THIS_MODULE,
+	.open = rknpu_open,
+	.release = rknpu_release,
+	.unlocked_ioctl = rknpu_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = rknpu_ioctl,
+#endif
+};
+#endif
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+static const struct vm_operations_struct rknpu_gem_vm_ops = {
+	.fault = rknpu_gem_fault,
+	.open = drm_gem_vm_open,
+	.close = drm_gem_vm_close,
+};
+
+static int rknpu_action_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file_priv)
+{
+	struct rknpu_device *rknpu_dev = dev_get_drvdata(dev->dev);
+
+	return rknpu_action(rknpu_dev, (struct rknpu_action *)data);
+}
+
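+/*
+ * Wrap a DRM ioctl handler so the NPU is powered for the duration of the
+ * call, with a delayed power-off queued afterwards.
+ */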
+#define RKNPU_IOCTL(func)                                                      \
+	static int __##func(struct drm_device *dev, void *data,                \
+			    struct drm_file *file_priv)                        \
+	{                                                                      \
+		struct rknpu_device *rknpu_dev = dev_get_drvdata(dev->dev);    \
+		int ret = -EINVAL;                                             \
+		rknpu_power_get(rknpu_dev);                                    \
+		ret = func(dev, data, file_priv);                              \
+		rknpu_power_put_delay(rknpu_dev);                              \
+		return ret;                                                    \
+	}
+
+RKNPU_IOCTL(rknpu_action_ioctl);
+RKNPU_IOCTL(rknpu_submit_ioctl);
+RKNPU_IOCTL(rknpu_gem_create_ioctl);
+RKNPU_IOCTL(rknpu_gem_map_ioctl);
+RKNPU_IOCTL(rknpu_gem_destroy_ioctl);
+RKNPU_IOCTL(rknpu_gem_sync_ioctl);
+
+static const struct drm_ioctl_desc rknpu_ioctls[] = {
+	DRM_IOCTL_DEF_DRV(RKNPU_ACTION, __rknpu_action_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(RKNPU_SUBMIT, __rknpu_submit_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(RKNPU_MEM_CREATE, __rknpu_gem_create_ioctl,
+			  DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(RKNPU_MEM_MAP, __rknpu_gem_map_ioctl,
+			  DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(RKNPU_MEM_DESTROY, __rknpu_gem_destroy_ioctl,
+			  DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(RKNPU_MEM_SYNC, __rknpu_gem_sync_ioctl,
+			  DRM_RENDER_ALLOW),
+};
+
+static const struct file_operations rknpu_drm_driver_fops = {
+	.owner = THIS_MODULE,
+	.open = drm_open,
+	.mmap = rknpu_gem_mmap,
+	.poll = drm_poll,
+	.read = drm_read,
+	.unlocked_ioctl = drm_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = drm_compat_ioctl,
+#endif
+	.release = drm_release,
+	.llseek = noop_llseek,
+};
+
+static struct drm_driver rknpu_drm_driver = {
+	.driver_features = DRIVER_GEM | DRIVER_RENDER,
+	.dumb_map_offset = drm_gem_dumb_map_offset,
+	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
+	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
+	.gem_prime_import = drm_gem_prime_import,
+	.gem_prime_import_sg_table = rknpu_gem_prime_import_sg_table,
+	.gem_prime_mmap = rknpu_gem_prime_mmap,
+	.ioctls = rknpu_ioctls,
+	.num_ioctls = ARRAY_SIZE(rknpu_ioctls),
+	.fops = &rknpu_drm_driver_fops,
+	.name = DRIVER_NAME,
+	.desc = DRIVER_DESC,
+	.date = DRIVER_DATE,
+	.major = DRIVER_MAJOR,
+	.minor = DRIVER_MINOR,
+	.patchlevel = DRIVER_PATCHLEVEL,
+};
+
+#endif
+
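+/*
+ * Periodic load accounting: every RKNPU_LOAD_INTERVAL the busy time
+ * accumulated by each subcore is snapshotted into busy_time_record and
+ * then reset, so readers of the load statistics see a stable
+ * per-interval value.
+ */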
+static enum hrtimer_restart hrtimer_handler(struct hrtimer *timer)
+{
+	struct rknpu_device *rknpu_dev =
+		container_of(timer, struct rknpu_device, timer);
+	struct rknpu_subcore_data *subcore_data = NULL;
+	struct rknpu_job *job = NULL;
+	ktime_t now = ktime_get();
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < rknpu_dev->config->num_irqs; i++) {
+		subcore_data = &rknpu_dev->subcore_datas[i];
+
+		spin_lock_irqsave(&rknpu_dev->irq_lock, flags);
+
+		job = subcore_data->job;
+		if (job) {
+			subcore_data->timer.busy_time +=
+				ktime_us_delta(now, job->hw_recoder_time);
+			job->hw_recoder_time = now;
+		}
+
+		subcore_data->timer.busy_time_record =
+			subcore_data->timer.busy_time;
+		subcore_data->timer.busy_time = 0;
+		spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+	}
+
+	hrtimer_forward_now(timer, rknpu_dev->kt);
+	return HRTIMER_RESTART;
+}
+
+static void rknpu_init_timer(struct rknpu_device *rknpu_dev)
+{
+	rknpu_dev->kt = ktime_set(0, RKNPU_LOAD_INTERVAL);
+	hrtimer_init(&rknpu_dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rknpu_dev->timer.function = hrtimer_handler;
+	hrtimer_start(&rknpu_dev->timer, rknpu_dev->kt, HRTIMER_MODE_REL);
+}
+
+static void rknpu_cancel_timer(struct rknpu_device *rknpu_dev)
+{
+	hrtimer_cancel(&rknpu_dev->timer);
+}
+
+static bool rknpu_is_iommu_enable(struct device *dev)
+{
+	struct device_node *iommu = NULL;
+
+	iommu = of_parse_phandle(dev->of_node, "iommus", 0);
+	if (!iommu) {
+		LOG_DEV_INFO(
+			dev,
+			"rknpu iommu device-tree entry not found, using non-iommu mode\n");
+		return false;
+	}
+
+	if (!of_device_is_available(iommu)) {
+		LOG_DEV_INFO(dev,
+			     "rknpu iommu is disabled, using non-iommu mode\n");
+		of_node_put(iommu);
+		return false;
+	}
+	of_node_put(iommu);
+
+	LOG_DEV_INFO(dev, "rknpu iommu is enabled, using iommu mode\n");
+
+	return true;
+}
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+static int rknpu_drm_probe(struct rknpu_device *rknpu_dev)
+{
+	struct device *dev = rknpu_dev->dev;
+	struct drm_device *drm_dev = NULL;
+	int ret = -EINVAL;
+
+	drm_dev = drm_dev_alloc(&rknpu_drm_driver, dev);
+	if (IS_ERR(drm_dev))
+		return PTR_ERR(drm_dev);
+
+	drm_dev->dev_private = rknpu_dev;
+	rknpu_dev->drm_dev = drm_dev;
+
+	/*
+	 * Register the DRM device; dev_private must already be valid, since
+	 * userspace can open the node and issue ioctls once registration
+	 * succeeds.
+	 */
+	ret = drm_dev_register(drm_dev, 0);
+	if (ret < 0)
+		goto err_free_drm;
+
+	return 0;
+
+err_free_drm:
+	drm_dev_put(drm_dev);
+
+	return ret;
+}
+
+static void rknpu_drm_remove(struct rknpu_device *rknpu_dev)
+{
+	struct drm_device *drm_dev = rknpu_dev->drm_dev;
+
+	drm_dev_unregister(drm_dev);
+
+	drm_dev_put(drm_dev);
+}
+#endif
+
+static int rknpu_power_on(struct rknpu_device *rknpu_dev)
+{
+	struct device *dev = rknpu_dev->dev;
+	int ret = -EINVAL;
+
+#ifndef FPGA_PLATFORM
+	if (rknpu_dev->vdd) {
+		ret = regulator_enable(rknpu_dev->vdd);
+		if (ret) {
+			LOG_DEV_ERROR(
+				dev,
+				"failed to enable vdd reg for rknpu, ret: %d\n",
+				ret);
+			return ret;
+		}
+	}
+
+	if (rknpu_dev->mem) {
+		ret = regulator_enable(rknpu_dev->mem);
+		if (ret) {
+			LOG_DEV_ERROR(
+				dev,
+				"failed to enable mem reg for rknpu, ret: %d\n",
+				ret);
+			return ret;
+		}
+	}
+#endif
+
+	ret = clk_bulk_prepare_enable(rknpu_dev->num_clks, rknpu_dev->clks);
+	if (ret) {
+		LOG_DEV_ERROR(dev, "failed to enable clk for rknpu, ret: %d\n",
+			      ret);
+		return ret;
+	}
+
+	if (rknpu_dev->multiple_domains) {
+		if (rknpu_dev->genpd_dev_npu0) {
+			ret = pm_runtime_get_sync(rknpu_dev->genpd_dev_npu0);
+			if (ret < 0) {
+				LOG_DEV_ERROR(
+					dev,
+					"failed to get pm runtime for npu0, ret: %d\n",
+					ret);
+				goto out;
+			}
+		}
+		if (rknpu_dev->genpd_dev_npu1) {
+			ret = pm_runtime_get_sync(rknpu_dev->genpd_dev_npu1);
+			if (ret < 0) {
+				LOG_DEV_ERROR(
+					dev,
+					"failed to get pm runtime for npu1, ret: %d\n",
+					ret);
+				goto out;
+			}
+		}
+		if (rknpu_dev->genpd_dev_npu2) {
+			ret = pm_runtime_get_sync(rknpu_dev->genpd_dev_npu2);
+			if (ret < 0) {
+				LOG_DEV_ERROR(
+					dev,
+					"failed to get pm runtime for npu2, ret: %d\n",
+					ret);
+				goto out;
+			}
+		}
+	}
+	ret = pm_runtime_get_sync(dev);
+	if (ret < 0) {
+		LOG_DEV_ERROR(dev,
+			      "failed to get pm runtime for rknpu, ret: %d\n",
+			      ret);
+		goto out;
+	}
+	/* pm_runtime_get_sync() returns 1 if the device was already active */
+	ret = 0;
+
+out:
+	return ret;
+}
+
+static int rknpu_power_off(struct rknpu_device *rknpu_dev)
+{
+	struct device *dev = rknpu_dev->dev;
+
+	pm_runtime_put_sync(dev);
+
+	if (rknpu_dev->multiple_domains) {
+		if (rknpu_dev->genpd_dev_npu2)
+			pm_runtime_put_sync(rknpu_dev->genpd_dev_npu2);
+		if (rknpu_dev->genpd_dev_npu1)
+			pm_runtime_put_sync(rknpu_dev->genpd_dev_npu1);
+		if (rknpu_dev->genpd_dev_npu0)
+			pm_runtime_put_sync(rknpu_dev->genpd_dev_npu0);
+	}
+
+	clk_bulk_disable_unprepare(rknpu_dev->num_clks, rknpu_dev->clks);
+
+#ifndef FPGA_PLATFORM
+	if (rknpu_dev->vdd)
+		regulator_disable(rknpu_dev->vdd);
+
+	if (rknpu_dev->mem)
+		regulator_disable(rknpu_dev->mem);
+#endif
+
+	return 0;
+}
+
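+/*
+ * devfreq target callback: when scaling up, the voltage is raised before
+ * the clock; when scaling down, the clock is lowered first and the
+ * voltage afterwards, so the NPU never runs faster than its supply
+ * allows.
+ */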
+static int npu_devfreq_target(struct device *dev, unsigned long *target_freq,
+			      u32 flags)
+{
+	struct rknpu_device *rknpu_dev = dev_get_drvdata(dev);
+	struct dev_pm_opp *opp = NULL;
+	unsigned long freq = *target_freq;
+	unsigned long old_freq = rknpu_dev->current_freq;
+	unsigned long volt, old_volt = rknpu_dev->current_volt;
+	int ret = -EINVAL;
+
+	opp = devfreq_recommended_opp(dev, &freq, flags);
+	if (IS_ERR(opp)) {
+		LOG_DEV_ERROR(dev, "failed to get opp (%ld)\n", PTR_ERR(opp));
+		return PTR_ERR(opp);
+	}
+	volt = dev_pm_opp_get_voltage(opp);
+	dev_pm_opp_put(opp);
+
+	/*
+	 * If the frequency is unchanged, at most the voltage needs updating.
+	 */
+	if (old_freq == freq) {
+		*target_freq = freq;
+		if (old_volt == volt || !rknpu_dev->vdd)
+			return 0;
+		ret = regulator_set_voltage(rknpu_dev->vdd, volt, INT_MAX);
+		if (ret) {
+			LOG_DEV_ERROR(dev, "failed to set volt %lu\n", volt);
+			return ret;
+		}
+		rknpu_dev->current_volt = volt;
+		return 0;
+	}
+
+	if (rknpu_dev->vdd && old_volt != volt && old_freq < freq) {
+		ret = regulator_set_voltage(rknpu_dev->vdd, volt, INT_MAX);
+		if (ret) {
+			LOG_DEV_ERROR(dev, "failed to increase volt %lu\n",
+				      volt);
+			return ret;
+		}
+	}
+	LOG_DEV_DEBUG(dev, "%luHz %luuV -> %luHz %luuV\n", old_freq, old_volt,
+		      freq, volt);
+	ret = clk_set_rate(rknpu_dev->clks[0].clk, freq);
+	if (ret) {
+		LOG_DEV_ERROR(dev, "failed to set clock %lu\n", freq);
+		return ret;
+	}
+	*target_freq = freq;
+	rknpu_dev->current_freq = freq;
+
+	if (rknpu_dev->devfreq)
+		rknpu_dev->devfreq->last_status.current_frequency = freq;
+
+	if (rknpu_dev->vdd && old_volt != volt && old_freq > freq) {
+		ret = regulator_set_voltage(rknpu_dev->vdd, volt, INT_MAX);
+		if (ret) {
+			LOG_DEV_ERROR(dev, "failed to decrease volt %lu\n",
+				      volt);
+			return ret;
+		}
+	}
+	rknpu_dev->current_volt = volt;
+
+	LOG_DEV_INFO(dev, "set rknpu freq: %lu, volt: %lu\n",
+		     rknpu_dev->current_freq, rknpu_dev->current_volt);
+
+	return ret;
+}
+
+static int npu_devfreq_get_dev_status(struct device *dev,
+				      struct devfreq_dev_status *stat)
+{
+	return 0;
+}
+
+static int npu_devfreq_get_cur_freq(struct device *dev, unsigned long *freq)
+{
+	struct rknpu_device *rknpu_dev = dev_get_drvdata(dev);
+
+	*freq = rknpu_dev->current_freq;
+
+	return 0;
+}
+
+static struct devfreq_dev_profile npu_devfreq_profile = {
+	.polling_ms = 50,
+	.target = npu_devfreq_target,
+	.get_dev_status = npu_devfreq_get_dev_status,
+	.get_cur_freq = npu_devfreq_get_cur_freq,
+};
+
+#ifdef CONFIG_PM_DEVFREQ
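+/*
+ * A minimal governor: the target frequency is simply whatever was last
+ * stored in rknpu_dev->ondemand_freq, so frequency selection is driven
+ * externally rather than by devfreq load statistics.
+ */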
+static int devfreq_rknpu_ondemand_func(struct devfreq *df, unsigned long *freq)
+{
+	struct rknpu_device *rknpu_dev = df->data;
+
+	if (rknpu_dev)
+		*freq = rknpu_dev->ondemand_freq;
+	else
+		*freq = df->previous_freq;
+
+	return 0;
+}
+
+static int devfreq_rknpu_ondemand_handler(struct devfreq *devfreq,
+					  unsigned int event, void *data)
+{
+	return 0;
+}
+
+static struct devfreq_governor devfreq_rknpu_ondemand = {
+	.name = "rknpu_ondemand",
+	.get_target_freq = devfreq_rknpu_ondemand_func,
+	.event_handler = devfreq_rknpu_ondemand_handler,
+};
+#endif
+
+static int npu_devfreq_adjust_current_freq_volt(struct device *dev,
+						struct rknpu_device *rknpu_dev)
+{
+	unsigned long volt, old_freq, freq;
+	struct dev_pm_opp *opp = NULL;
+	int ret = -EINVAL;
+
+	old_freq = clk_get_rate(rknpu_dev->clks[0].clk);
+	freq = old_freq;
+
+	opp = devfreq_recommended_opp(dev, &freq, 0);
+	if (IS_ERR(opp)) {
+		LOG_DEV_ERROR(dev, "failed to get opp (%ld)\n", PTR_ERR(opp));
+		return PTR_ERR(opp);
+	}
+	volt = dev_pm_opp_get_voltage(opp);
+	dev_pm_opp_put(opp);
+
+	if (freq >= old_freq && rknpu_dev->vdd) {
+		ret = regulator_set_voltage(rknpu_dev->vdd, volt, INT_MAX);
+		if (ret) {
+			LOG_DEV_ERROR(dev, "failed to set volt %lu\n", volt);
+			return ret;
+		}
+	}
+	LOG_DEV_DEBUG(dev, "adjust current freq=%luHz, volt=%luuV\n", freq,
+		      volt);
+	ret = clk_set_rate(rknpu_dev->clks[0].clk, freq);
+	if (ret) {
+		LOG_DEV_ERROR(dev, "failed to set clock %lu\n", freq);
+		return ret;
+	}
+	if (freq < old_freq && rknpu_dev->vdd) {
+		ret = regulator_set_voltage(rknpu_dev->vdd, volt, INT_MAX);
+		if (ret) {
+			LOG_DEV_ERROR(dev, "failed to set volt %lu\n", volt);
+			return ret;
+		}
+	}
+	rknpu_dev->current_freq = freq;
+	rknpu_dev->current_volt = volt;
+
+	return 0;
+}
+
+static int rknpu_devfreq_init(struct rknpu_device *rknpu_dev)
+{
+	struct device *dev = rknpu_dev->dev;
+	struct devfreq_dev_profile *dp = &npu_devfreq_profile;
+	int ret = -EINVAL;
+
+	ret = rockchip_init_opp_table(dev, NULL, "npu_leakage", "rknpu");
+	if (ret) {
+		LOG_DEV_ERROR(dev, "failed to init_opp_table\n");
+		return ret;
+	}
+
+	ret = npu_devfreq_adjust_current_freq_volt(dev, rknpu_dev);
+	if (ret) {
+		LOG_DEV_ERROR(dev, "failed to adjust current freq volt\n");
+		goto err_remove_table;
+	}
+	dp->initial_freq = rknpu_dev->current_freq;
+
+#ifdef CONFIG_PM_DEVFREQ
+	ret = devfreq_add_governor(&devfreq_rknpu_ondemand);
+	if (ret) {
+		LOG_DEV_ERROR(dev, "failed to add rknpu_ondemand governor\n");
+		goto err_remove_table;
+	}
+#endif
+
+	rknpu_dev->devfreq = devm_devfreq_add_device(dev, dp, "rknpu_ondemand",
+						     (void *)rknpu_dev);
+	if (IS_ERR(rknpu_dev->devfreq)) {
+		LOG_DEV_ERROR(dev, "failed to add devfreq\n");
+		ret = PTR_ERR(rknpu_dev->devfreq);
+		goto err_remove_governor;
+	}
+	devm_devfreq_register_opp_notifier(dev, rknpu_dev->devfreq);
+
+	rknpu_dev->devfreq->last_status.current_frequency = dp->initial_freq;
+	rknpu_dev->devfreq->last_status.total_time = 1;
+	rknpu_dev->devfreq->last_status.busy_time = 1;
+
+	rknpu_dev->current_freq = clk_get_rate(rknpu_dev->clks[0].clk);
+	if (rknpu_dev->vdd)
+		rknpu_dev->current_volt =
+			regulator_get_voltage(rknpu_dev->vdd);
+
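+	/*
+	 * Note: nothing in this driver assigns devfreq_cooling, so unless
+	 * platform code registers a cooling device (e.g. via
+	 * of_devfreq_cooling_register()) the check below always fires.
+	 */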
+	if (IS_ERR_OR_NULL(rknpu_dev->devfreq_cooling))
+		LOG_DEV_ERROR(dev, "failed to register cooling device\n");
+
+	return 0;
+
+err_remove_governor:
+#ifdef CONFIG_PM_DEVFREQ
+	devfreq_remove_governor(&devfreq_rknpu_ondemand);
+#endif
+err_remove_table:
+	dev_pm_opp_of_remove_table(dev);
+
+	rknpu_dev->devfreq = NULL;
+
+	return ret;
+}
+
+static int rknpu_devfreq_remove(struct rknpu_device *rknpu_dev)
+{
+	if (rknpu_dev->devfreq) {
+		devfreq_unregister_opp_notifier(rknpu_dev->dev,
+						rknpu_dev->devfreq);
+		dev_pm_opp_of_remove_table(rknpu_dev->dev);
+#ifdef CONFIG_PM_DEVFREQ
+		devfreq_remove_governor(&devfreq_rknpu_ondemand);
+#endif
+	}
+
+	return 0;
+}
+
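+/*
+ * Request the NPU interrupts: if the DT provides named interrupts, one
+ * handler is installed per subcore; otherwise a single unnamed interrupt
+ * is routed to the core 0 handler.
+ */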
+static int rknpu_register_irq(struct platform_device *pdev,
+			      struct rknpu_device *rknpu_dev)
+{
+	const struct rknpu_config *config = rknpu_dev->config;
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+	int i, ret, irq;
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_IRQ,
+					   config->irqs[0].name);
+	if (res) {
+		/* there are irq names in dts */
+		for (i = 0; i < config->num_irqs; i++) {
+			irq = platform_get_irq_byname(pdev,
+						      config->irqs[i].name);
+			if (irq < 0) {
+				LOG_DEV_ERROR(dev, "no npu %s in dts\n",
+					      config->irqs[i].name);
+				return irq;
+			}
+
+			ret = devm_request_irq(dev, irq,
+					       config->irqs[i].irq_hdl,
+					       IRQF_SHARED, dev_name(dev),
+					       rknpu_dev);
+			if (ret < 0) {
+				LOG_DEV_ERROR(dev, "request %s failed: %d\n",
+					      config->irqs[i].name, ret);
+				return ret;
+			}
+		}
+	} else {
+		/* no irq names in dts */
+		irq = platform_get_irq(pdev, 0);
+		if (irq < 0) {
+			LOG_DEV_ERROR(dev, "no npu irq in dts\n");
+			return irq;
+		}
+
+		ret = devm_request_irq(dev, irq, rknpu_core0_irq_handler,
+				       IRQF_SHARED, dev_name(dev), rknpu_dev);
+		if (ret < 0) {
+			LOG_DEV_ERROR(dev, "request irq failed: %d\n", ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int rknpu_find_sram_resource(struct rknpu_device *rknpu_dev)
+{
+	struct device *dev = rknpu_dev->dev;
+	struct device_node *sram_node = NULL;
+	struct resource sram_res;
+	uint32_t sram_size = 0;
+	int ret = -EINVAL;
+
+	/* get sram device node */
+	sram_node = of_parse_phandle(dev->of_node, "rockchip,sram", 0);
+	rknpu_dev->sram_size = 0;
+	if (!sram_node)
+		return -EINVAL;
+
+	/* get sram start and size */
+	ret = of_address_to_resource(sram_node, 0, &sram_res);
+	of_node_put(sram_node);
+	if (ret)
+		return ret;
+
+	/* round the SRAM start up and the end down to PAGE_SIZE alignment */
+	rknpu_dev->sram_start = round_up(sram_res.start, PAGE_SIZE);
+	rknpu_dev->sram_end = round_down(
+		sram_res.start + resource_size(&sram_res), PAGE_SIZE);
+	if (rknpu_dev->sram_end <= rknpu_dev->sram_start) {
+		LOG_DEV_WARN(
+			dev,
+			"invalid sram resource, sram start %pa, sram end %pa\n",
+			&rknpu_dev->sram_start, &rknpu_dev->sram_end);
+		return -EINVAL;
+	}
+
+	sram_size = rknpu_dev->sram_end - rknpu_dev->sram_start;
+
+	rknpu_dev->sram_base_io =
+		devm_ioremap(dev, rknpu_dev->sram_start, sram_size);
+	/* devm_ioremap() returns NULL on failure, not an ERR_PTR */
+	if (!rknpu_dev->sram_base_io) {
+		LOG_DEV_ERROR(dev, "failed to remap sram base io!\n");
+		return -ENOMEM;
+	}
+
+	rknpu_dev->sram_size = sram_size;
+
+	LOG_DEV_INFO(dev, "sram region: [%pa, %pa), sram size: %#x\n",
+		     &rknpu_dev->sram_start, &rknpu_dev->sram_end,
+		     rknpu_dev->sram_size);
+
+	return 0;
+}
+
+static int rknpu_probe(struct platform_device *pdev)
+{
+	struct resource *res = NULL;
+	struct rknpu_device *rknpu_dev = NULL;
+	struct device *dev = &pdev->dev;
+	struct device *virt_dev = NULL;
+	const struct of_device_id *match = NULL;
+	const struct rknpu_config *config = NULL;
+	int ret = -EINVAL, i = 0;
+
+	if (!pdev->dev.of_node) {
+		LOG_DEV_ERROR(dev, "rknpu device-tree data is missing!\n");
+		return -ENODEV;
+	}
+
+	match = of_match_device(rknpu_of_match, dev);
+	if (!match) {
+		LOG_DEV_ERROR(dev, "rknpu device-tree entry is missing!\n");
+		return -ENODEV;
+	}
+
+	rknpu_dev = devm_kzalloc(dev, sizeof(*rknpu_dev), GFP_KERNEL);
+	if (!rknpu_dev) {
+		LOG_DEV_ERROR(dev, "failed to allocate rknpu device!\n");
+		return -ENOMEM;
+	}
+
+	config = of_device_get_match_data(dev);
+	if (!config)
+		return -EINVAL;
+
+	rknpu_dev->config = config;
+	rknpu_dev->dev = dev;
+
+	rknpu_dev->iommu_en = rknpu_is_iommu_enable(dev);
+	if (!rknpu_dev->iommu_en) {
+		/* Initialize reserved memory resources */
+		ret = of_reserved_mem_device_init(dev);
+		if (!ret) {
+			LOG_DEV_INFO(
+				dev,
+				"initialize reserved memory for rknpu device!\n");
+		}
+	}
+
+	rknpu_dev->bypass_irq_handler = bypass_irq_handler;
+	rknpu_dev->bypass_soft_reset = bypass_soft_reset;
+
+	rknpu_reset_get(rknpu_dev);
+
+	rknpu_dev->num_clks = devm_clk_bulk_get_all(dev, &rknpu_dev->clks);
+	if (rknpu_dev->num_clks < 1) {
+		LOG_DEV_ERROR(dev, "failed to get clk source for rknpu\n");
+#ifndef FPGA_PLATFORM
+		return -ENODEV;
+#endif
+	}
+
+#ifndef FPGA_PLATFORM
+	rknpu_dev->vdd = devm_regulator_get_optional(dev, "rknpu");
+	if (IS_ERR(rknpu_dev->vdd)) {
+		if (PTR_ERR(rknpu_dev->vdd) != -ENODEV) {
+			ret = PTR_ERR(rknpu_dev->vdd);
+			LOG_DEV_ERROR(
+				dev,
+				"failed to get vdd regulator for rknpu: %d\n",
+				ret);
+			return ret;
+		}
+		rknpu_dev->vdd = NULL;
+	}
+
+	rknpu_dev->mem = devm_regulator_get_optional(dev, "mem");
+	if (IS_ERR(rknpu_dev->mem)) {
+		if (PTR_ERR(rknpu_dev->mem) != -ENODEV) {
+			ret = PTR_ERR(rknpu_dev->mem);
+			LOG_DEV_ERROR(
+				dev,
+				"failed to get mem regulator for rknpu: %d\n",
+				ret);
+			return ret;
+		}
+		rknpu_dev->mem = NULL;
+	}
+#endif
+
+	spin_lock_init(&rknpu_dev->lock);
+	spin_lock_init(&rknpu_dev->irq_lock);
+	mutex_init(&rknpu_dev->power_lock);
+	mutex_init(&rknpu_dev->reset_lock);
+	for (i = 0; i < config->num_irqs; i++) {
+		INIT_LIST_HEAD(&rknpu_dev->subcore_datas[i].todo_list);
+		init_waitqueue_head(&rknpu_dev->subcore_datas[i].job_done_wq);
+		rknpu_dev->subcore_datas[i].task_num = 0;
+		res = platform_get_resource(pdev, IORESOURCE_MEM, i);
+		if (!res) {
+			LOG_DEV_ERROR(
+				dev,
+				"failed to get memory resource for rknpu\n");
+			return -ENXIO;
+		}
+
+		rknpu_dev->base[i] = devm_ioremap_resource(dev, res);
+		if (PTR_ERR(rknpu_dev->base[i]) == -EBUSY) {
+			rknpu_dev->base[i] = devm_ioremap(dev, res->start,
+							  resource_size(res));
+		}
+
+		if (IS_ERR(rknpu_dev->base[i])) {
+			LOG_DEV_ERROR(dev,
+				      "failed to remap register for rknpu\n");
+			return PTR_ERR(rknpu_dev->base[i]);
+		}
+	}
+
+	if (config->bw_priority_length > 0) {
+		rknpu_dev->bw_priority_base =
+			devm_ioremap(dev, config->bw_priority_addr,
+				     config->bw_priority_length);
+		/* devm_ioremap() returns NULL on failure, not an ERR_PTR */
+		if (!rknpu_dev->bw_priority_base) {
+			LOG_DEV_ERROR(
+				rknpu_dev->dev,
+				"failed to remap bw priority register for rknpu\n");
+		}
+	}
+
+	if (!rknpu_dev->bypass_irq_handler) {
+		ret = rknpu_register_irq(pdev, rknpu_dev);
+		if (ret)
+			return ret;
+	} else {
+		LOG_DEV_WARN(dev, "bypass irq handler!\n");
+	}
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+	ret = rknpu_drm_probe(rknpu_dev);
+	if (ret) {
+		LOG_DEV_ERROR(dev, "failed to probe device for rknpu\n");
+		return ret;
+	}
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+	rknpu_dev->miscdev.minor = MISC_DYNAMIC_MINOR;
+	rknpu_dev->miscdev.name = "rknpu";
+	rknpu_dev->miscdev.fops = &rknpu_fops;
+
+	ret = misc_register(&rknpu_dev->miscdev);
+	if (ret) {
+		LOG_DEV_ERROR(dev, "cannot register miscdev (%d)\n", ret);
+		return ret;
+	}
+
+	rknpu_dev->heap = rk_dma_heap_find("rk-dma-heap-cma");
+	if (!rknpu_dev->heap) {
+		LOG_DEV_ERROR(dev, "failed to find cma heap\n");
+		ret = -ENOMEM;
+		goto err_remove_drv;
+	}
+	rk_dma_heap_set_dev(dev);
+	LOG_DEV_INFO(dev, "Initialized %s: v%d.%d.%d for %s\n", DRIVER_DESC,
+		     DRIVER_MAJOR, DRIVER_MINOR, DRIVER_PATCHLEVEL,
+		     DRIVER_DATE);
+#endif
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_FENCE
+	ret = rknpu_fence_context_alloc(rknpu_dev);
+	if (ret) {
+		LOG_DEV_ERROR(dev,
+			      "failed to allocate fence context for rknpu\n");
+		goto err_remove_drv;
+	}
+#endif
+
+	platform_set_drvdata(pdev, rknpu_dev);
+
+	pm_runtime_enable(dev);
+
+	if (of_count_phandle_with_args(dev->of_node, "power-domains",
+				       "#power-domain-cells") > 1) {
+		virt_dev = dev_pm_domain_attach_by_name(dev, "npu0");
+		if (!IS_ERR(virt_dev))
+			rknpu_dev->genpd_dev_npu0 = virt_dev;
+		virt_dev = dev_pm_domain_attach_by_name(dev, "npu1");
+		if (!IS_ERR(virt_dev))
+			rknpu_dev->genpd_dev_npu1 = virt_dev;
+		virt_dev = dev_pm_domain_attach_by_name(dev, "npu2");
+		if (!IS_ERR(virt_dev))
+			rknpu_dev->genpd_dev_npu2 = virt_dev;
+		rknpu_dev->multiple_domains = true;
+	}
+
+	ret = rknpu_power_on(rknpu_dev);
+	if (ret)
+		goto err_remove_drv;
+
+#ifndef FPGA_PLATFORM
+	rknpu_devfreq_init(rknpu_dev);
+#endif
+
+	/* set the default power-off delay to 3 seconds */
+	rknpu_dev->power_put_delay = 3000;
+	rknpu_dev->power_off_wq =
+		create_freezable_workqueue("rknpu_power_off_wq");
+	if (!rknpu_dev->power_off_wq) {
+		LOG_DEV_ERROR(dev, "rknpu couldn't create power_off workqueue");
+		ret = -ENOMEM;
+		goto err_devfreq_remove;
+	}
+	INIT_DEFERRABLE_WORK(&rknpu_dev->power_off_work,
+			     rknpu_power_off_delay_work);
+
+	if (IS_ENABLED(CONFIG_ROCKCHIP_RKNPU_SRAM) && rknpu_dev->iommu_en) {
+		if (!rknpu_find_sram_resource(rknpu_dev)) {
+			ret = rknpu_mm_create(rknpu_dev->sram_size, PAGE_SIZE,
+					      &rknpu_dev->sram_mm);
+			if (ret != 0)
+				goto err_remove_wq;
+		} else {
+			LOG_DEV_WARN(dev, "could not find sram resource!\n");
+		}
+	}
+
+	rknpu_power_off(rknpu_dev);
+	atomic_set(&rknpu_dev->power_refcount, 0);
+	atomic_set(&rknpu_dev->cmdline_power_refcount, 0);
+
+	rknpu_debugger_init(rknpu_dev);
+	rknpu_init_timer(rknpu_dev);
+
+	return 0;
+
+err_remove_wq:
+	destroy_workqueue(rknpu_dev->power_off_wq);
+
+err_devfreq_remove:
+#ifndef FPGA_PLATFORM
+	rknpu_devfreq_remove(rknpu_dev);
+#endif
+
+err_remove_drv:
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+	rknpu_drm_remove(rknpu_dev);
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+	misc_deregister(&(rknpu_dev->miscdev));
+#endif
+
+	return ret;
+}
+
+static int rknpu_remove(struct platform_device *pdev)
+{
+	struct rknpu_device *rknpu_dev = platform_get_drvdata(pdev);
+	int i = 0;
+
+	cancel_delayed_work_sync(&rknpu_dev->power_off_work);
+	destroy_workqueue(rknpu_dev->power_off_wq);
+
+	if (IS_ENABLED(CONFIG_ROCKCHIP_RKNPU_SRAM) && rknpu_dev->sram_mm)
+		rknpu_mm_destroy(rknpu_dev->sram_mm);
+
+	rknpu_debugger_remove(rknpu_dev);
+	rknpu_cancel_timer(rknpu_dev);
+
+	for (i = 0; i < rknpu_dev->config->num_irqs; i++) {
+		WARN_ON(rknpu_dev->subcore_datas[i].job);
+		WARN_ON(!list_empty(&rknpu_dev->subcore_datas[i].todo_list));
+	}
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+	rknpu_drm_remove(rknpu_dev);
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+	misc_deregister(&(rknpu_dev->miscdev));
+#endif
+
+#ifndef FPGA_PLATFORM
+	rknpu_devfreq_remove(rknpu_dev);
+#endif
+
+	mutex_lock(&rknpu_dev->power_lock);
+	if (atomic_read(&rknpu_dev->power_refcount) > 0)
+		rknpu_power_off(rknpu_dev);
+	mutex_unlock(&rknpu_dev->power_lock);
+
+	if (rknpu_dev->multiple_domains) {
+		if (rknpu_dev->genpd_dev_npu0)
+			dev_pm_domain_detach(rknpu_dev->genpd_dev_npu0, true);
+		if (rknpu_dev->genpd_dev_npu1)
+			dev_pm_domain_detach(rknpu_dev->genpd_dev_npu1, true);
+		if (rknpu_dev->genpd_dev_npu2)
+			dev_pm_domain_detach(rknpu_dev->genpd_dev_npu2, true);
+	}
+
+	pm_runtime_disable(&pdev->dev);
+
+	return 0;
+}
+
+static struct platform_driver rknpu_driver = {
+	.probe = rknpu_probe,
+	.remove = rknpu_remove,
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "RKNPU",
+		.of_match_table = of_match_ptr(rknpu_of_match),
+	},
+};
+
+static int __init rknpu_init(void)
+{
+	return platform_driver_register(&rknpu_driver);
+}
+
+static void __exit rknpu_exit(void)
+{
+	platform_driver_unregister(&rknpu_driver);
+}
+
+late_initcall(rknpu_init);
+module_exit(rknpu_exit);
+
+MODULE_DESCRIPTION("rknpu driver");
+MODULE_AUTHOR("Felix Zeng <felix.zeng@rock-chips.com>");
+MODULE_ALIAS("rockchip-rknpu");
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(RKNPU_GET_DRV_VERSION_STRING(DRIVER_MAJOR, DRIVER_MINOR,
+					    DRIVER_PATCHLEVEL));
+MODULE_IMPORT_NS(DMA_BUF);
diff --git a/drivers/rknpu/rknpu_fence.c b/drivers/rknpu/rknpu_fence.c
new file mode 100644
index 0000000000000000000000000000000000000000..dc22ea1c4e120abab4293bf02ca2a152a92e0bc8
--- /dev/null
+++ b/drivers/rknpu/rknpu_fence.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/dma-fence.h>
+#include <linux/sync_file.h>
+
+#include "rknpu_drv.h"
+#include "rknpu_job.h"
+
+#include "rknpu_fence.h"
+
+static const char *rknpu_fence_get_name(struct dma_fence *fence)
+{
+	return DRIVER_NAME;
+}
+
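+/*
+ * Minimal fence ops: only the name callbacks are provided; signaling is
+ * handled elsewhere by the core dma_fence machinery.
+ */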
+static const struct dma_fence_ops rknpu_fence_ops = {
+	.get_driver_name = rknpu_fence_get_name,
+	.get_timeline_name = rknpu_fence_get_name,
+};
+
+int rknpu_fence_context_alloc(struct rknpu_device *rknpu_dev)
+{
+	struct rknpu_fence_context *fence_ctx = NULL;
+
+	fence_ctx =
+		devm_kzalloc(rknpu_dev->dev, sizeof(*fence_ctx), GFP_KERNEL);
+	if (!fence_ctx)
+		return -ENOMEM;
+
+	fence_ctx->context = dma_fence_context_alloc(1);
+	spin_lock_init(&fence_ctx->spinlock);
+
+	rknpu_dev->fence_ctx = fence_ctx;
+
+	return 0;
+}
+
+int rknpu_fence_alloc(struct rknpu_job *job)
+{
+	struct rknpu_fence_context *fence_ctx = job->rknpu_dev->fence_ctx;
+	struct dma_fence *fence = NULL;
+
+	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+	if (!fence)
+		return -ENOMEM;
+
+	dma_fence_init(fence, &rknpu_fence_ops, &fence_ctx->spinlock,
+		       fence_ctx->context, ++fence_ctx->seqno);
+
+	job->fence = fence;
+
+	return 0;
+}
+
+int rknpu_fence_get_fd(struct rknpu_job *job)
+{
+	struct sync_file *sync_file = NULL;
+	int fence_fd = -1;
+
+	if (!job->fence)
+		return -EINVAL;
+
+	fence_fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fence_fd < 0)
+		return fence_fd;
+
+	sync_file = sync_file_create(job->fence);
+	if (!sync_file) {
+		/* release the fd reserved above so it does not leak */
+		put_unused_fd(fence_fd);
+		return -ENOMEM;
+	}
+
+	fd_install(fence_fd, sync_file->file);
+
+	return fence_fd;
+}
diff --git a/drivers/rknpu/rknpu_gem.c b/drivers/rknpu/rknpu_gem.c
new file mode 100644
index 0000000000000000000000000000000000000000..e0b151ec43c78c8cd2eeacdf62b9e22fbb9e09bc
--- /dev/null
+++ b/drivers/rknpu/rknpu_gem.c
@@ -0,0 +1,1295 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#include <drm/drm_device.h>
+#include <drm/drm_vma_manager.h>
+#include <drm/drm_prime.h>
+#include <drm/drm_file.h>
+#include <drm/drm_drv.h>
+
+#include <linux/shmem_fs.h>
+#include <linux/dma-buf.h>
+#include <linux/iommu.h>
+#include <linux/dma-iommu.h>
+#include <linux/pfn_t.h>
+#include <linux/version.h>
+#include <asm/cacheflush.h>
+
+#include "rknpu_drv.h"
+#include "rknpu_ioctl.h"
+#include "rknpu_gem.h"
+
+#define RKNPU_GEM_ALLOC_FROM_PAGES 1
+
+#if RKNPU_GEM_ALLOC_FROM_PAGES
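+/*
+ * IOMMU allocation path: back the object with shmem pages from
+ * drm_gem_get_pages(), map them through the IOMMU as a scatter-gather
+ * list, and optionally vmap() them when a kernel mapping is requested.
+ */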
+static int rknpu_gem_get_pages(struct rknpu_gem_object *rknpu_obj)
+{
+	struct drm_device *drm = rknpu_obj->base.dev;
+	struct scatterlist *s = NULL;
+	dma_addr_t dma_addr = 0;
+	dma_addr_t phys = 0;
+	int ret = -EINVAL, i = 0;
+
+	rknpu_obj->pages = drm_gem_get_pages(&rknpu_obj->base);
+	if (IS_ERR(rknpu_obj->pages)) {
+		ret = PTR_ERR(rknpu_obj->pages);
+		LOG_ERROR("failed to get pages: %d\n", ret);
+		return ret;
+	}
+
+	rknpu_obj->num_pages = rknpu_obj->size >> PAGE_SHIFT;
+
+	rknpu_obj->sgt = drm_prime_pages_to_sg(drm, rknpu_obj->pages,
+					       rknpu_obj->num_pages);
+	if (IS_ERR(rknpu_obj->sgt)) {
+		ret = PTR_ERR(rknpu_obj->sgt);
+		LOG_ERROR("failed to allocate sgt: %d\n", ret);
+		goto put_pages;
+	}
+
+	ret = dma_map_sg(drm->dev, rknpu_obj->sgt->sgl, rknpu_obj->sgt->nents,
+			 DMA_BIDIRECTIONAL);
+	if (ret == 0) {
+		ret = -EFAULT;
+		LOG_DEV_ERROR(drm->dev, "%s: dma map %zu fail\n", __func__,
+			      rknpu_obj->size);
+		goto free_sgt;
+	}
+
+	if (rknpu_obj->flags & RKNPU_MEM_KERNEL_MAPPING) {
+		rknpu_obj->cookie = vmap(rknpu_obj->pages, rknpu_obj->num_pages,
+					 VM_MAP, PAGE_KERNEL);
+		if (!rknpu_obj->cookie) {
+			ret = -ENOMEM;
+			LOG_ERROR("failed to vmap: %d\n", ret);
+			goto unmap_sg;
+		}
+		rknpu_obj->kv_addr = rknpu_obj->cookie;
+	}
+
+	dma_addr = sg_dma_address(rknpu_obj->sgt->sgl);
+	rknpu_obj->dma_addr = dma_addr;
+
+	for_each_sg(rknpu_obj->sgt->sgl, s, rknpu_obj->sgt->nents, i) {
+		dma_addr += s->length;
+		phys = sg_phys(s);
+		LOG_DEBUG(
+			"gem pages alloc sgt[%d], dma_address: %pad, length: %#x, phys: %pad, virt: %p\n",
+			i, &dma_addr, s->length, &phys, sg_virt(s));
+	}
+
+	return 0;
+
+unmap_sg:
+	dma_unmap_sg(drm->dev, rknpu_obj->sgt->sgl, rknpu_obj->sgt->nents,
+		     DMA_BIDIRECTIONAL);
+
+free_sgt:
+	sg_free_table(rknpu_obj->sgt);
+	kfree(rknpu_obj->sgt);
+
+put_pages:
+	drm_gem_put_pages(&rknpu_obj->base, rknpu_obj->pages, false, false);
+
+	return ret;
+}
+
+static void rknpu_gem_put_pages(struct rknpu_gem_object *rknpu_obj)
+{
+	struct drm_device *drm = rknpu_obj->base.dev;
+
+	if (rknpu_obj->flags & RKNPU_MEM_KERNEL_MAPPING) {
+		vunmap(rknpu_obj->kv_addr);
+		rknpu_obj->kv_addr = NULL;
+	}
+
+	if (rknpu_obj->sgt != NULL) {
+		dma_unmap_sg(drm->dev, rknpu_obj->sgt->sgl,
+			     rknpu_obj->sgt->nents, DMA_BIDIRECTIONAL);
+		sg_free_table(rknpu_obj->sgt);
+		kfree(rknpu_obj->sgt);
+	}
+
+	drm_gem_put_pages(&rknpu_obj->base, rknpu_obj->pages, true, true);
+}
+#endif
+
+static int rknpu_gem_alloc_buf(struct rknpu_gem_object *rknpu_obj)
+{
+	struct drm_device *drm = rknpu_obj->base.dev;
+	struct rknpu_device *rknpu_dev = drm->dev_private;
+	unsigned int nr_pages = 0;
+	struct sg_table *sgt = NULL;
+	struct scatterlist *s = NULL;
+	gfp_t gfp_mask = GFP_KERNEL;
+	int ret = -EINVAL, i = 0;
+
+	if (rknpu_obj->dma_addr) {
+		LOG_DEBUG("buffer already allocated.\n");
+		return 0;
+	}
+
+	rknpu_obj->dma_attrs = 0;
+
+	/*
+	 * If RKNPU_MEM_CONTIGUOUS is set, a fully physically contiguous
+	 * memory region is allocated; otherwise the allocation is made
+	 * as contiguous as possible.
+	 */
+	if (!(rknpu_obj->flags & RKNPU_MEM_NON_CONTIGUOUS))
+		rknpu_obj->dma_attrs |= DMA_ATTR_FORCE_CONTIGUOUS;
+
+	/* cacheable mapping or write-combine mapping */
+	if (rknpu_obj->flags & RKNPU_MEM_CACHEABLE) {
+#ifdef DMA_ATTR_NON_CONSISTENT
+		rknpu_obj->dma_attrs |= DMA_ATTR_NON_CONSISTENT;
+#endif
+#ifdef DMA_ATTR_SYS_CACHE_ONLY
+		rknpu_obj->dma_attrs |= DMA_ATTR_SYS_CACHE_ONLY;
+#endif
+	} else if (rknpu_obj->flags & RKNPU_MEM_WRITE_COMBINE) {
+		rknpu_obj->dma_attrs |= DMA_ATTR_WRITE_COMBINE;
+	}
+
+	if (!(rknpu_obj->flags & RKNPU_MEM_KERNEL_MAPPING))
+		rknpu_obj->dma_attrs |= DMA_ATTR_NO_KERNEL_MAPPING;
+
+#ifdef DMA_ATTR_SKIP_ZEROING
+	if (!(rknpu_obj->flags & RKNPU_MEM_ZEROING))
+		rknpu_obj->dma_attrs |= DMA_ATTR_SKIP_ZEROING;
+#endif
+
+#if RKNPU_GEM_ALLOC_FROM_PAGES
+	if ((rknpu_obj->flags & RKNPU_MEM_NON_CONTIGUOUS) &&
+	    rknpu_dev->iommu_en) {
+		return rknpu_gem_get_pages(rknpu_obj);
+	}
+#endif
+
+	if (rknpu_obj->flags & RKNPU_MEM_ZEROING)
+		gfp_mask |= __GFP_ZERO;
+
+	if (!(rknpu_obj->flags & RKNPU_MEM_NON_DMA32)) {
+		gfp_mask &= ~__GFP_HIGHMEM;
+		gfp_mask |= __GFP_DMA32;
+	}
+
+	nr_pages = rknpu_obj->size >> PAGE_SHIFT;
+
+	rknpu_obj->pages = rknpu_gem_alloc_page(nr_pages);
+	if (!rknpu_obj->pages) {
+		LOG_ERROR("failed to allocate pages.\n");
+		return -ENOMEM;
+	}
+
+	rknpu_obj->cookie =
+		dma_alloc_attrs(drm->dev, rknpu_obj->size, &rknpu_obj->dma_addr,
+				gfp_mask, rknpu_obj->dma_attrs);
+	if (!rknpu_obj->cookie) {
+		/*
+		 * when RKNPU_MEM_CONTIGUOUS was requested and an IOMMU is
+		 * available, try to fall back to a non-contiguous buffer
+		 */
+		if (!(rknpu_obj->flags & RKNPU_MEM_NON_CONTIGUOUS) &&
+		    rknpu_dev->iommu_en) {
+			LOG_DEV_WARN(
+				drm->dev,
+				"try to fallback to allocate non-contiguous %lu buffer.\n",
+				rknpu_obj->size);
+			rknpu_obj->dma_attrs &= ~DMA_ATTR_FORCE_CONTIGUOUS;
+			rknpu_obj->flags |= RKNPU_MEM_NON_CONTIGUOUS;
+			rknpu_obj->cookie =
+				dma_alloc_attrs(drm->dev, rknpu_obj->size,
+						&rknpu_obj->dma_addr, gfp_mask,
+						rknpu_obj->dma_attrs);
+			if (!rknpu_obj->cookie) {
+				LOG_DEV_ERROR(
+					drm->dev,
+					"failed to allocate non-contiguous %lu buffer.\n",
+					rknpu_obj->size);
+				goto err_free;
+			}
+		} else {
+			LOG_DEV_ERROR(drm->dev,
+				      "failed to allocate %lu buffer.\n",
+				      rknpu_obj->size);
+			goto err_free;
+		}
+	}
+
+	if (rknpu_obj->flags & RKNPU_MEM_KERNEL_MAPPING)
+		rknpu_obj->kv_addr = rknpu_obj->cookie;
+
+	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+	if (!sgt) {
+		ret = -ENOMEM;
+		goto err_free_dma;
+	}
+
+	ret = dma_get_sgtable_attrs(drm->dev, sgt, rknpu_obj->cookie,
+				    rknpu_obj->dma_addr, rknpu_obj->size,
+				    rknpu_obj->dma_attrs);
+	if (ret < 0) {
+		LOG_DEV_ERROR(drm->dev, "failed to get sgtable.\n");
+		goto err_free_sgt;
+	}
+
+	for_each_sg(sgt->sgl, s, sgt->nents, i) {
+		sg_dma_address(s) = sg_phys(s);
+		LOG_DEBUG("dma alloc sgt[%d], phys_address: %pad, length: %u\n",
+			  i, &s->dma_address, s->length);
+	}
+
+	if (drm_prime_sg_to_page_array(sgt, rknpu_obj->pages, nr_pages)) {
+		LOG_DEV_ERROR(drm->dev, "invalid sgtable.\n");
+		ret = -EINVAL;
+		goto err_free_sg_table;
+	}
+
+	rknpu_obj->sgt = sgt;
+
+	return ret;
+
+err_free_sg_table:
+	sg_free_table(sgt);
+err_free_sgt:
+	kfree(sgt);
+err_free_dma:
+	dma_free_attrs(drm->dev, rknpu_obj->size, rknpu_obj->cookie,
+		       rknpu_obj->dma_addr, rknpu_obj->dma_attrs);
+err_free:
+	rknpu_gem_free_page(rknpu_obj->pages);
+
+	return ret;
+}
+
+static void rknpu_gem_free_buf(struct rknpu_gem_object *rknpu_obj)
+{
+	struct drm_device *drm = rknpu_obj->base.dev;
+#if RKNPU_GEM_ALLOC_FROM_PAGES
+	struct rknpu_device *rknpu_dev = drm->dev_private;
+#endif
+
+	if (!rknpu_obj->dma_addr) {
+		LOG_DEBUG("dma handle is invalid.\n");
+		return;
+	}
+
+#if RKNPU_GEM_ALLOC_FROM_PAGES
+	if ((rknpu_obj->flags & RKNPU_MEM_NON_CONTIGUOUS) &&
+	    rknpu_dev->iommu_en) {
+		rknpu_gem_put_pages(rknpu_obj);
+		return;
+	}
+#endif
+
+	sg_free_table(rknpu_obj->sgt);
+	kfree(rknpu_obj->sgt);
+
+	dma_free_attrs(drm->dev, rknpu_obj->size, rknpu_obj->cookie,
+		       rknpu_obj->dma_addr, rknpu_obj->dma_attrs);
+
+	rknpu_gem_free_page(rknpu_obj->pages);
+
+	rknpu_obj->dma_addr = 0;
+}
+
+static int rknpu_gem_handle_create(struct drm_gem_object *obj,
+				   struct drm_file *file_priv,
+				   unsigned int *handle)
+{
+	int ret = -EINVAL;
+	/*
+	 * Allocate an id in the idr table where the object is registered;
+	 * the returned handle is the id that userspace sees.
+	 */
+	ret = drm_gem_handle_create(file_priv, obj, handle);
+	if (ret)
+		return ret;
+
+	LOG_DEBUG("gem handle: %#x\n", *handle);
+
+	/* drop reference from allocate - handle holds it now. */
+	rknpu_gem_object_put(obj);
+
+	return 0;
+}
+
+static int rknpu_gem_handle_destroy(struct drm_file *file_priv,
+				    unsigned int handle)
+{
+	return drm_gem_handle_delete(file_priv, handle);
+}
+
+static struct rknpu_gem_object *rknpu_gem_init(struct drm_device *drm,
+					       unsigned long size)
+{
+	struct rknpu_gem_object *rknpu_obj = NULL;
+	struct drm_gem_object *obj = NULL;
+	gfp_t gfp_mask;
+	int ret = -EINVAL;
+
+	rknpu_obj = kzalloc(sizeof(*rknpu_obj), GFP_KERNEL);
+	if (!rknpu_obj)
+		return ERR_PTR(-ENOMEM);
+
+	obj = &rknpu_obj->base;
+
+	ret = drm_gem_object_init(drm, obj, size);
+	if (ret < 0) {
+		LOG_DEV_ERROR(drm->dev, "failed to initialize gem object\n");
+		kfree(rknpu_obj);
+		return ERR_PTR(ret);
+	}
+
+	rknpu_obj->size = rknpu_obj->base.size;
+
+	gfp_mask = mapping_gfp_mask(obj->filp->f_mapping);
+
+	if (rknpu_obj->flags & RKNPU_MEM_ZEROING)
+		gfp_mask |= __GFP_ZERO;
+
+	if (!(rknpu_obj->flags & RKNPU_MEM_NON_DMA32)) {
+		gfp_mask &= ~__GFP_HIGHMEM;
+		gfp_mask |= __GFP_DMA32;
+	}
+
+	mapping_set_gfp_mask(obj->filp->f_mapping, gfp_mask);
+
+	return rknpu_obj;
+}
+
+static void rknpu_gem_release(struct rknpu_gem_object *rknpu_obj)
+{
+	/* release file pointer to gem object. */
+	drm_gem_object_release(&rknpu_obj->base);
+	kfree(rknpu_obj);
+}
+
+static int rknpu_gem_alloc_buf_with_sram(struct rknpu_gem_object *rknpu_obj)
+{
+	struct drm_device *drm = rknpu_obj->base.dev;
+	struct rknpu_device *rknpu_dev = drm->dev_private;
+	struct iommu_domain *domain = NULL;
+	struct rknpu_iommu_dma_cookie *cookie = NULL;
+	struct iova_domain *iovad = NULL;
+	struct scatterlist *s = NULL;
+	unsigned long length = 0;
+	unsigned long size = 0;
+	unsigned long offset = 0;
+	int i = 0;
+	int ret = -EINVAL;
+
+	/* iova map to sram */
+	domain = iommu_get_domain_for_dev(rknpu_dev->dev);
+	if (!domain) {
+		LOG_ERROR("failed to get iommu domain!");
+		return -EINVAL;
+	}
+
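+	/*
+	 * Note: rknpu_iommu_dma_cookie is assumed to mirror the layout of
+	 * the kernel's private iommu_dma_cookie so that the iova_domain
+	 * can be reached; this is fragile across kernel versions.
+	 */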
+	cookie = domain->iova_cookie;
+	iovad = &cookie->iovad;
+	rknpu_obj->iova_size =
+		iova_align(iovad, rknpu_obj->sram_size + rknpu_obj->size);
+	rknpu_obj->iova_start = rknpu_iommu_dma_alloc_iova(
+		domain, rknpu_obj->iova_size, dma_get_mask(drm->dev), drm->dev);
+	if (!rknpu_obj->iova_start) {
+		LOG_ERROR("iommu_dma_alloc_iova failed\n");
+		return -ENOMEM;
+	}
+
+	LOG_INFO("allocate iova start: %pad, size: %lu\n",
+		 &rknpu_obj->iova_start, rknpu_obj->iova_size);
+
+	/*
+	 * Overview SRAM + DDR map to IOVA
+	 * --------
+	 * sram_size: rknpu_obj->sram_size
+	 *   - allocate from SRAM, this size value has been page-aligned
+	 * size: rknpu_obj->size
+	 *   - allocate from DDR pages, this size value has been page-aligned
+	 * iova_size: rknpu_obj->iova_size
+	 *   - from iova_align(sram_size + size)
+	 *   - from iova_align(sram_size + size); it may be larger than
+	 *     (sram_size + size), and the excess is left unmapped
+	 * --------
+	 *
+	 * |<- sram_size ->|      |<- - - - size - - - ->|
+	 * +---------------+      +----------------------+
+	 * |     SRAM      |      |         DDR          |
+	 * +---------------+      +----------------------+
+	 *         |                    |
+	 * |       V       |            V          |
+	 * +---------------------------------------+
+	 * |             IOVA range                |
+	 * +---------------------------------------+
+	 * |<- - - - - - - iova_size - - - - - - ->|
+	 *
+	 */
+	offset = rknpu_obj->sram_obj->range_start *
+		 rknpu_dev->sram_mm->chunk_size;
+	ret = iommu_map(domain, rknpu_obj->iova_start,
+			rknpu_dev->sram_start + offset, rknpu_obj->sram_size,
+			IOMMU_READ | IOMMU_WRITE);
+	if (ret) {
+		LOG_ERROR("sram iommu_map error: %d\n", ret);
+		goto free_iova;
+	}
+
+	rknpu_obj->dma_addr = rknpu_obj->iova_start;
+
+	if (rknpu_obj->size == 0) {
+		LOG_INFO("allocate sram size: %lu\n", rknpu_obj->sram_size);
+		return 0;
+	}
+
+	rknpu_obj->pages = drm_gem_get_pages(&rknpu_obj->base);
+	if (IS_ERR(rknpu_obj->pages)) {
+		ret = PTR_ERR(rknpu_obj->pages);
+		LOG_ERROR("failed to get pages: %d\n", ret);
+		goto sram_unmap;
+	}
+
+	rknpu_obj->num_pages = rknpu_obj->size >> PAGE_SHIFT;
+
+	rknpu_obj->sgt = drm_prime_pages_to_sg(drm, rknpu_obj->pages,
+					       rknpu_obj->num_pages);
+	if (IS_ERR(rknpu_obj->sgt)) {
+		ret = PTR_ERR(rknpu_obj->sgt);
+		LOG_ERROR("failed to allocate sgt: %d\n", ret);
+		goto put_pages;
+	}
+
+	length = rknpu_obj->size;
+	offset = rknpu_obj->iova_start + rknpu_obj->sram_size;
+
+	for_each_sg(rknpu_obj->sgt->sgl, s, rknpu_obj->sgt->nents, i) {
+		size = (length < s->length) ? length : s->length;
+
+		ret = iommu_map(domain, offset, sg_phys(s), size,
+				IOMMU_READ | IOMMU_WRITE);
+		if (ret) {
+			LOG_ERROR("ddr iommu_map error: %d\n", ret);
+			goto sgl_unmap;
+		}
+
+		length -= size;
+		offset += size;
+
+		if (length == 0)
+			break;
+	}
+
+	LOG_INFO("allocate size: %lu with sram size: %lu\n", rknpu_obj->size,
+		 rknpu_obj->sram_size);
+
+	return 0;
+
+sgl_unmap:
+	iommu_unmap(domain, rknpu_obj->iova_start + rknpu_obj->sram_size,
+		    rknpu_obj->size - length);
+	sg_free_table(rknpu_obj->sgt);
+	kfree(rknpu_obj->sgt);
+
+put_pages:
+	drm_gem_put_pages(&rknpu_obj->base, rknpu_obj->pages, false, false);
+
+sram_unmap:
+	iommu_unmap(domain, rknpu_obj->iova_start, rknpu_obj->sram_size);
+
+free_iova:
+	rknpu_iommu_dma_free_iova(domain->iova_cookie, rknpu_obj->iova_start,
+				  rknpu_obj->iova_size);
+
+	return ret;
+}
+
+static void rknpu_gem_free_buf_with_sram(struct rknpu_gem_object *rknpu_obj)
+{
+	struct drm_device *drm = rknpu_obj->base.dev;
+	struct rknpu_device *rknpu_dev = drm->dev_private;
+	struct iommu_domain *domain = NULL;
+
+	domain = iommu_get_domain_for_dev(rknpu_dev->dev);
+	if (domain) {
+		iommu_unmap(domain, rknpu_obj->iova_start,
+			    rknpu_obj->sram_size);
+		if (rknpu_obj->size > 0)
+			iommu_unmap(domain,
+				    rknpu_obj->iova_start +
+					    rknpu_obj->sram_size,
+				    rknpu_obj->size);
+		rknpu_iommu_dma_free_iova(domain->iova_cookie,
+					  rknpu_obj->iova_start,
+					  rknpu_obj->iova_size);
+	}
+
+	if (rknpu_obj->pages)
+		drm_gem_put_pages(&rknpu_obj->base, rknpu_obj->pages, true,
+				  true);
+
+	if (rknpu_obj->sgt != NULL) {
+		sg_free_table(rknpu_obj->sgt);
+		kfree(rknpu_obj->sgt);
+	}
+}
+
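+/*
+ * Create a GEM object, optionally placing the head of the buffer in NPU
+ * SRAM: when RKNPU_MEM_TRY_ALLOC_SRAM is set, as much of the requested
+ * size as possible is served from free SRAM chunks and any remainder
+ * falls back to a regular DDR allocation.
+ */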
+struct rknpu_gem_object *rknpu_gem_object_create(struct drm_device *drm,
+						 unsigned int flags,
+						 unsigned long size,
+						 unsigned long sram_size)
+{
+	struct rknpu_device *rknpu_dev = drm->dev_private;
+	struct rknpu_gem_object *rknpu_obj = NULL;
+	size_t remain_ddr_size = 0;
+	int ret = -EINVAL;
+
+	if (!size) {
+		LOG_DEV_ERROR(drm->dev, "invalid buffer size: %lu\n", size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	remain_ddr_size = round_up(size, PAGE_SIZE);
+
+	if (!rknpu_dev->iommu_en && (flags & RKNPU_MEM_NON_CONTIGUOUS)) {
+		/*
+		 * when no IOMMU is available, all allocated buffers are
+		 * contiguous anyway, so drop RKNPU_MEM_NON_CONTIGUOUS flag
+		 */
+		flags &= ~RKNPU_MEM_NON_CONTIGUOUS;
+		LOG_WARN(
+			"non-contiguous allocation is not supported without IOMMU, falling back to contiguous buffer\n");
+	}
+
+	if (IS_ENABLED(CONFIG_ROCKCHIP_RKNPU_SRAM) &&
+	    (flags & RKNPU_MEM_TRY_ALLOC_SRAM) && rknpu_dev->sram_size > 0) {
+		size_t sram_free_size = 0;
+		size_t real_sram_size = 0;
+
+		if (sram_size != 0)
+			sram_size = round_up(sram_size, PAGE_SIZE);
+
+		rknpu_obj = rknpu_gem_init(drm, remain_ddr_size);
+		if (IS_ERR(rknpu_obj))
+			return rknpu_obj;
+
+		/* set memory type and cache attribute from user side. */
+		rknpu_obj->flags = flags;
+
+		sram_free_size = rknpu_dev->sram_mm->free_chunks *
+				 rknpu_dev->sram_mm->chunk_size;
+		if (sram_free_size > 0) {
+			real_sram_size = remain_ddr_size;
+			if (sram_size != 0 && remain_ddr_size > sram_size)
+				real_sram_size = sram_size;
+			if (real_sram_size > sram_free_size)
+				real_sram_size = sram_free_size;
+			ret = rknpu_mm_alloc(rknpu_dev->sram_mm, real_sram_size,
+					     &rknpu_obj->sram_obj);
+			if (ret != 0) {
+				sram_free_size =
+					rknpu_dev->sram_mm->free_chunks *
+					rknpu_dev->sram_mm->chunk_size;
+				LOG_WARN(
+					"mm allocate %zu failed, ret: %d, free size: %zu\n",
+					real_sram_size, ret, sram_free_size);
+				real_sram_size = 0;
+			}
+		}
+
+		if (real_sram_size > 0) {
+			rknpu_obj->sram_size = real_sram_size;
+
+			ret = rknpu_gem_alloc_buf_with_sram(rknpu_obj);
+			if (ret < 0)
+				goto mm_free;
+			remain_ddr_size = 0;
+		}
+	}
+
+	if (remain_ddr_size > 0) {
+		/* reuse the object already created by the SRAM path, if any */
+		if (!rknpu_obj) {
+			rknpu_obj = rknpu_gem_init(drm, remain_ddr_size);
+			if (IS_ERR(rknpu_obj))
+				return rknpu_obj;
+
+			/* set memory type and cache attribute from user side. */
+			rknpu_obj->flags = flags;
+		}
+
+		ret = rknpu_gem_alloc_buf(rknpu_obj);
+		if (ret < 0)
+			goto gem_release;
+	}
+
+	if (rknpu_obj)
+		LOG_DEBUG(
+			"created dma addr: %pad, cookie: %p, ddr size: %lu, sram size: %lu, attrs: %#lx, flags: %#x\n",
+			&rknpu_obj->dma_addr, rknpu_obj->cookie, rknpu_obj->size,
+			rknpu_obj->sram_size, rknpu_obj->dma_attrs, rknpu_obj->flags);
+
+	return rknpu_obj;
+
+mm_free:
+	if (IS_ENABLED(CONFIG_ROCKCHIP_RKNPU_SRAM) &&
+	    rknpu_obj->sram_obj != NULL)
+		rknpu_mm_free(rknpu_dev->sram_mm, rknpu_obj->sram_obj);
+
+gem_release:
+	rknpu_gem_release(rknpu_obj);
+
+	return ERR_PTR(ret);
+}
+
+void rknpu_gem_object_destroy(struct rknpu_gem_object *rknpu_obj)
+{
+	struct drm_gem_object *obj = &rknpu_obj->base;
+
+	LOG_DEBUG(
+		"destroy dma addr: %pad, cookie: %p, size: %lu, attrs: %#lx, flags: %#x, handle count: %d\n",
+		&rknpu_obj->dma_addr, rknpu_obj->cookie, rknpu_obj->size,
+		rknpu_obj->dma_attrs, rknpu_obj->flags, obj->handle_count);
+
+	/*
+	 * Do not release the memory region here if it came from an
+	 * exporter: the exporter releases the region once the dmabuf's
+	 * refcount drops to zero.
+	 */
+	if (obj->import_attach) {
+		drm_prime_gem_destroy(obj, rknpu_obj->sgt);
+		rknpu_gem_free_page(rknpu_obj->pages);
+	} else {
+		if (IS_ENABLED(CONFIG_ROCKCHIP_RKNPU_SRAM) &&
+		    rknpu_obj->sram_size > 0) {
+			struct rknpu_device *rknpu_dev = obj->dev->dev_private;
+
+			if (rknpu_obj->sram_obj != NULL)
+				rknpu_mm_free(rknpu_dev->sram_mm,
+					      rknpu_obj->sram_obj);
+			rknpu_gem_free_buf_with_sram(rknpu_obj);
+		} else {
+			rknpu_gem_free_buf(rknpu_obj);
+		}
+	}
+
+	rknpu_gem_release(rknpu_obj);
+}
+
+int rknpu_gem_create_ioctl(struct drm_device *dev, void *data,
+			   struct drm_file *file_priv)
+{
+	struct rknpu_mem_create *args = data;
+	struct rknpu_gem_object *rknpu_obj = NULL;
+	int ret = -EINVAL;
+
+	rknpu_obj = rknpu_gem_object_find(file_priv, args->handle);
+	if (!rknpu_obj) {
+		rknpu_obj = rknpu_gem_object_create(
+			dev, args->flags, args->size, args->sram_size);
+		if (IS_ERR(rknpu_obj))
+			return PTR_ERR(rknpu_obj);
+
+		ret = rknpu_gem_handle_create(&rknpu_obj->base, file_priv,
+					      &args->handle);
+		if (ret) {
+			rknpu_gem_object_destroy(rknpu_obj);
+			return ret;
+		}
+	}
+
+	args->size = rknpu_obj->size;
+	args->sram_size = rknpu_obj->sram_size;
+	args->obj_addr = (__u64)(uintptr_t)rknpu_obj;
+	args->dma_addr = rknpu_obj->dma_addr;
+
+	return 0;
+}
+
+int rknpu_gem_map_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	struct rknpu_mem_map *args = data;
+
+	return drm_gem_dumb_map_offset(file_priv, dev, args->handle,
+				       &args->offset);
+}
+
+int rknpu_gem_destroy_ioctl(struct drm_device *dev, void *data,
+			    struct drm_file *file_priv)
+{
+	struct rknpu_gem_object *rknpu_obj = NULL;
+	struct rknpu_mem_destroy *args = data;
+
+	rknpu_obj = rknpu_gem_object_find(file_priv, args->handle);
+	if (!rknpu_obj)
+		return -EINVAL;
+
+	return rknpu_gem_handle_destroy(file_priv, args->handle);
+}
+
+#if RKNPU_GEM_ALLOC_FROM_PAGES
+/*
+ * __vm_map_pages - maps range of kernel pages into user vma
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ * @offset: user's requested vm_pgoff
+ *
+ * This allows drivers to map range of kernel pages into a user vma.
+ *
+ * Return: 0 on success and error code otherwise.
+ */
+static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+			  unsigned long num, unsigned long offset)
+{
+	unsigned long count = vma_pages(vma);
+	unsigned long uaddr = vma->vm_start;
+	int ret = -EINVAL, i = 0;
+
+	/* Fail if the user requested offset is beyond the end of the object */
+	if (offset >= num)
+		return -ENXIO;
+
+	/* Fail if the user requested size exceeds available object size */
+	if (count > num - offset)
+		return -ENXIO;
+
+	for (i = 0; i < count; i++) {
+		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
+		if (ret < 0)
+			return ret;
+		uaddr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+static int rknpu_gem_mmap_pages(struct rknpu_gem_object *rknpu_obj,
+				struct vm_area_struct *vma)
+{
+	struct drm_device *drm = rknpu_obj->base.dev;
+	int ret = -EINVAL;
+
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	ret = __vm_map_pages(vma, rknpu_obj->pages, rknpu_obj->num_pages,
+			     vma->vm_pgoff);
+	if (ret < 0)
+		LOG_DEV_ERROR(drm->dev, "failed to map pages into vma: %d\n",
+			      ret);
+
+	return ret;
+}
+#endif
+
+static int rknpu_gem_mmap_buffer(struct rknpu_gem_object *rknpu_obj,
+				 struct vm_area_struct *vma)
+{
+	struct drm_device *drm = rknpu_obj->base.dev;
+	/* used by both the SRAM path and the page-array path below */
+	struct rknpu_device *rknpu_dev = drm->dev_private;
+	unsigned long vm_size = 0;
+	int ret = -EINVAL;
+
+	/*
+	 * clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the
+	 * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map
+	 * the whole buffer.
+	 */
+	vma->vm_flags &= ~VM_PFNMAP;
+	vma->vm_pgoff = 0;
+
+	vm_size = vma->vm_end - vma->vm_start;
+
+	/* check if user-requested size is valid. */
+	if (vm_size > rknpu_obj->size)
+		return -EINVAL;
+
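+	/*
+	 * Buffers with an SRAM part map the SRAM region first via
+	 * remap_pfn_range(), then insert the remaining DDR pages behind it,
+	 * so userspace sees one contiguous mapping.
+	 */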
+	if (rknpu_obj->sram_size > 0) {
+		unsigned long offset = 0;
+		unsigned long num_pages = 0;
+		int i = 0;
+
+		vma->vm_flags |= VM_MIXEDMAP;
+
+		offset = rknpu_obj->sram_obj->range_start *
+			 rknpu_dev->sram_mm->chunk_size;
+		vma->vm_pgoff = __phys_to_pfn(rknpu_dev->sram_start + offset);
+
+		ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+				      rknpu_obj->sram_size, vma->vm_page_prot);
+		if (ret)
+			return -EAGAIN;
+
+		if (rknpu_obj->size == 0)
+			return 0;
+
+		offset = rknpu_obj->sram_size;
+
+		num_pages = (vm_size - rknpu_obj->sram_size) / PAGE_SIZE;
+		for (i = 0; i < num_pages; ++i) {
+			ret = vm_insert_page(vma, vma->vm_start + offset,
+					     rknpu_obj->pages[i]);
+			if (ret < 0)
+				return ret;
+			offset += PAGE_SIZE;
+		}
+
+		return 0;
+	}
+
+#if RKNPU_GEM_ALLOC_FROM_PAGES
+	if ((rknpu_obj->flags & RKNPU_MEM_NON_CONTIGUOUS) &&
+	    rknpu_dev->iommu_en) {
+		return rknpu_gem_mmap_pages(rknpu_obj, vma);
+	}
+#endif
+
+	ret = dma_mmap_attrs(drm->dev, vma, rknpu_obj->cookie,
+			     rknpu_obj->dma_addr, rknpu_obj->size,
+			     rknpu_obj->dma_attrs);
+	if (ret < 0) {
+		LOG_DEV_ERROR(drm->dev, "failed to mmap, ret: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+void rknpu_gem_free_object(struct drm_gem_object *obj)
+{
+	rknpu_gem_object_destroy(to_rknpu_obj(obj));
+}
+
+int rknpu_gem_dumb_create(struct drm_file *file_priv, struct drm_device *drm,
+			  struct drm_mode_create_dumb *args)
+{
+	struct rknpu_device *rknpu_dev = drm->dev_private;
+	struct rknpu_gem_object *rknpu_obj = NULL;
+	unsigned int flags = 0;
+	int ret = -EINVAL;
+
+	/*
+	 * allocate memory to be used for framebuffer.
+	 * - this callback would be called by user application
+	 *	with DRM_IOCTL_MODE_CREATE_DUMB command.
+	 */
+	args->pitch = args->width * ((args->bpp + 7) / 8);
+	args->size = args->pitch * args->height;
+
+	if (rknpu_dev->iommu_en)
+		flags = RKNPU_MEM_NON_CONTIGUOUS | RKNPU_MEM_WRITE_COMBINE;
+	else
+		flags = RKNPU_MEM_CONTIGUOUS | RKNPU_MEM_WRITE_COMBINE;
+
+	rknpu_obj = rknpu_gem_object_create(drm, flags, args->size, 0);
+	if (IS_ERR(rknpu_obj)) {
+		LOG_DEV_ERROR(drm->dev, "gem object allocate failed.\n");
+		return PTR_ERR(rknpu_obj);
+	}
+
+	ret = rknpu_gem_handle_create(&rknpu_obj->base, file_priv,
+				      &args->handle);
+	if (ret) {
+		rknpu_gem_object_destroy(rknpu_obj);
+		return ret;
+	}
+
+	return 0;
+}
+
+vm_fault_t rknpu_gem_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct drm_gem_object *obj = vma->vm_private_data;
+	struct rknpu_gem_object *rknpu_obj = to_rknpu_obj(obj);
+	struct drm_device *drm = rknpu_obj->base.dev;
+	unsigned long pfn = 0;
+	pgoff_t page_offset = 0;
+
+	page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+
+	if (page_offset >= (rknpu_obj->size >> PAGE_SHIFT)) {
+		LOG_DEV_ERROR(drm->dev, "invalid page offset\n");
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = page_to_pfn(rknpu_obj->pages[page_offset]);
+	return vmf_insert_mixed(vma, vmf->address,
+				__pfn_to_pfn_t(pfn, PFN_DEV));
+}
+
+static int rknpu_gem_mmap_obj(struct drm_gem_object *obj,
+			      struct vm_area_struct *vma)
+{
+	struct rknpu_gem_object *rknpu_obj = to_rknpu_obj(obj);
+	int ret = -EINVAL;
+
+	LOG_DEBUG("flags: %#x\n", rknpu_obj->flags);
+
+	/* non-cacheable by default. */
+	if (rknpu_obj->flags & RKNPU_MEM_CACHEABLE) {
+		vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	} else if (rknpu_obj->flags & RKNPU_MEM_WRITE_COMBINE) {
+		vma->vm_page_prot =
+			pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+	} else {
+		vma->vm_page_prot =
+			pgprot_noncached(vm_get_page_prot(vma->vm_flags));
+	}
+
+	ret = rknpu_gem_mmap_buffer(rknpu_obj, vma);
+	if (ret)
+		goto err_close_vm;
+
+	return 0;
+
+err_close_vm:
+	drm_gem_vm_close(vma);
+
+	return ret;
+}
+
+int rknpu_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct drm_gem_object *obj = NULL;
+	int ret = -EINVAL;
+
+	/* set vm_area_struct. */
+	ret = drm_gem_mmap(filp, vma);
+	if (ret < 0) {
+		LOG_ERROR("failed to mmap, ret: %d\n", ret);
+		return ret;
+	}
+
+	obj = vma->vm_private_data;
+
+	if (obj->import_attach)
+		return dma_buf_mmap(obj->dma_buf, vma, 0);
+
+	return rknpu_gem_mmap_obj(obj, vma);
+}
+
+/* low-level interface prime helpers */
+
+struct sg_table *rknpu_gem_prime_get_sg_table(struct drm_gem_object *obj)
+{
+	struct rknpu_gem_object *rknpu_obj = to_rknpu_obj(obj);
+	int npages = 0;
+
+	npages = rknpu_obj->size >> PAGE_SHIFT;
+
+	return drm_prime_pages_to_sg(obj->dev, rknpu_obj->pages, npages);
+}
+
+struct drm_gem_object *
+rknpu_gem_prime_import_sg_table(struct drm_device *dev,
+				struct dma_buf_attachment *attach,
+				struct sg_table *sgt)
+{
+	struct rknpu_gem_object *rknpu_obj = NULL;
+	int npages = 0;
+	int ret = -EINVAL;
+
+	rknpu_obj = rknpu_gem_init(dev, attach->dmabuf->size);
+	if (IS_ERR(rknpu_obj)) {
+		ret = PTR_ERR(rknpu_obj);
+		return ERR_PTR(ret);
+	}
+
+	rknpu_obj->dma_addr = sg_dma_address(sgt->sgl);
+
+	npages = rknpu_obj->size >> PAGE_SHIFT;
+	rknpu_obj->pages = rknpu_gem_alloc_page(npages);
+	if (!rknpu_obj->pages) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = drm_prime_sg_to_page_array(sgt, rknpu_obj->pages, npages);
+	if (ret < 0)
+		goto err_free_large;
+
+	rknpu_obj->sgt = sgt;
+
+	if (sgt->nents == 1) {
+		/* always physically contiguous memory if sgt->nents is 1. */
+		rknpu_obj->flags |= RKNPU_MEM_CONTIGUOUS;
+	} else {
+		/*
+		 * This case could be either CONTIG or NONCONTIG, but for now
+		 * it is treated as NONCONTIG.
+		 * TODO: find a way for the exporter to notify the importer
+		 * of the type of its own buffer.
+		 */
+		rknpu_obj->flags |= RKNPU_MEM_NON_CONTIGUOUS;
+	}
+
+	return &rknpu_obj->base;
+
+err_free_large:
+	rknpu_gem_free_page(rknpu_obj->pages);
+err:
+	rknpu_gem_release(rknpu_obj);
+	return ERR_PTR(ret);
+}
+
+void *rknpu_gem_prime_vmap(struct drm_gem_object *obj)
+{
+	struct rknpu_gem_object *rknpu_obj = to_rknpu_obj(obj);
+
+	if (!rknpu_obj->pages)
+		return NULL;
+
+	return vmap(rknpu_obj->pages, rknpu_obj->num_pages, VM_MAP,
+		    PAGE_KERNEL);
+}
+
+void rknpu_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr)
+{
+	vunmap(vaddr);
+}
+
+int rknpu_gem_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
+{
+	int ret = -EINVAL;
+
+	ret = drm_gem_mmap_obj(obj, obj->size, vma);
+	if (ret < 0)
+		return ret;
+
+	return rknpu_gem_mmap_obj(obj, vma);
+}
+
+int rknpu_gem_sync_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv)
+{
+	struct rknpu_gem_object *rknpu_obj = NULL;
+	struct rknpu_mem_sync *args = data;
+	struct scatterlist *sg;
+	unsigned long length, offset = 0;
+	unsigned long sg_left, size = 0;
+	unsigned long len = 0;
+	int i;
+
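+	/*
+	 * args->obj_addr holds the kernel object pointer handed back by the
+	 * create ioctl; it is trusted as-is here.
+	 */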
+	rknpu_obj = (struct rknpu_gem_object *)(uintptr_t)args->obj_addr;
+	if (!rknpu_obj)
+		return -EINVAL;
+
+	if (!(rknpu_obj->flags & RKNPU_MEM_CACHEABLE))
+		return -EINVAL;
+
+	if (!(rknpu_obj->flags & RKNPU_MEM_NON_CONTIGUOUS)) {
+		if (args->flags & RKNPU_MEM_SYNC_TO_DEVICE) {
+			dma_sync_single_range_for_device(
+				dev->dev, rknpu_obj->dma_addr, args->offset,
+				args->size, DMA_TO_DEVICE);
+		}
+		if (args->flags & RKNPU_MEM_SYNC_FROM_DEVICE) {
+			dma_sync_single_range_for_cpu(dev->dev,
+						      rknpu_obj->dma_addr,
+						      args->offset, args->size,
+						      DMA_FROM_DEVICE);
+		}
+	} else {
+		length = args->size;
+		offset = args->offset;
+
+		if (IS_ENABLED(CONFIG_ROCKCHIP_RKNPU_SRAM) &&
+		    rknpu_obj->sram_size > 0) {
+			struct drm_gem_object *obj = &rknpu_obj->base;
+			struct rknpu_device *rknpu_dev = obj->dev->dev_private;
+			unsigned long sram_offset =
+				rknpu_obj->sram_obj->range_start *
+				rknpu_dev->sram_mm->chunk_size;
+			if ((offset + length) <= rknpu_obj->sram_size) {
+				__dma_map_area(rknpu_dev->sram_base_io +
+						       offset + sram_offset,
+					       length, DMA_TO_DEVICE);
+				__dma_unmap_area(rknpu_dev->sram_base_io +
+							 offset + sram_offset,
+						 length, DMA_FROM_DEVICE);
+				length = 0;
+				offset = 0;
+			} else if (offset >= rknpu_obj->sram_size) {
+				offset -= rknpu_obj->sram_size;
+			} else {
+				unsigned long sram_length =
+					rknpu_obj->sram_size - offset;
+				__dma_map_area(rknpu_dev->sram_base_io +
+						       offset + sram_offset,
+					       sram_length, DMA_TO_DEVICE);
+				__dma_unmap_area(rknpu_dev->sram_base_io +
+							 offset + sram_offset,
+						 sram_length, DMA_FROM_DEVICE);
+				length -= sram_length;
+				offset = 0;
+			}
+		}
+
+		for_each_sg(rknpu_obj->sgt->sgl, sg, rknpu_obj->sgt->nents,
+			     i) {
+			if (length == 0)
+				break;
+
+			len += sg->length;
+			if (len <= offset)
+				continue;
+
+			sg_left = len - offset;
+			size = (length < sg_left) ? length : sg_left;
+
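+			/*
+			 * sync the whole sg entry that covers this chunk of
+			 * the requested range
+			 */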
+			if (args->flags & RKNPU_MEM_SYNC_TO_DEVICE) {
+				dma_sync_sg_for_device(dev->dev, sg, 1,
+						       DMA_TO_DEVICE);
+			}
+
+			if (args->flags & RKNPU_MEM_SYNC_FROM_DEVICE) {
+				dma_sync_sg_for_cpu(dev->dev, sg, 1,
+						    DMA_FROM_DEVICE);
+			}
+
+			offset += size;
+			length -= size;
+		}
+	}
+
+	return 0;
+}
diff --git a/drivers/rknpu/rknpu_job.c b/drivers/rknpu/rknpu_job.c
new file mode 100644
index 0000000000000000000000000000000000000000..dbfffdfff7a5fa4d80f2198543eda539dab789f3
--- /dev/null
+++ b/drivers/rknpu/rknpu_job.c
@@ -0,0 +1,910 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/sync_file.h>
+#include <linux/io.h>
+
+#include "rknpu_ioctl.h"
+#include "rknpu_drv.h"
+#include "rknpu_reset.h"
+#include "rknpu_gem.h"
+#include "rknpu_fence.h"
+#include "rknpu_job.h"
+#include "rknpu_mem.h"
+
+#define _REG_READ(base, offset) readl(base + (offset))
+#define _REG_WRITE(base, value, offset) writel(value, base + (offset))
+
+#define REG_READ(offset) _REG_READ(rknpu_core_base, offset)
+#define REG_WRITE(value, offset) _REG_WRITE(rknpu_core_base, value, offset)
+
+static int rknpu_core_index(int core_mask)
+{
+	int index = 0;
+
+	if (core_mask & RKNPU_CORE0_MASK)
+		index = 0;
+	else if (core_mask & RKNPU_CORE1_MASK)
+		index = 1;
+	else if (core_mask & RKNPU_CORE2_MASK)
+		index = 2;
+
+	return index;
+}
+
+static int rknpu_core_mask(int core_index)
+{
+	int core_mask = RKNPU_CORE_AUTO_MASK;
+
+	switch (core_index) {
+	case 0:
+		core_mask = RKNPU_CORE0_MASK;
+		break;
+	case 1:
+		core_mask = RKNPU_CORE1_MASK;
+		break;
+	case 2:
+		core_mask = RKNPU_CORE2_MASK;
+		break;
+	default:
+		break;
+	}
+
+	return core_mask;
+}
+
+static int rknn_get_task_number(struct rknpu_job *job, int core_index)
+{
+	int task_num = job->args->task_number;
+
+	if (job->use_core_num == 2)
+		task_num = job->args->subcore_task[core_index].task_number;
+	else if (job->use_core_num == 3)
+		task_num = job->args->subcore_task[core_index + 2].task_number;
+
+	return task_num;
+}
+
+static void rknpu_job_free(struct rknpu_job *job)
+{
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+	struct rknpu_gem_object *task_obj = NULL;
+
+	task_obj =
+		(struct rknpu_gem_object *)(uintptr_t)job->args->task_obj_addr;
+	if (task_obj)
+		rknpu_gem_object_put(&task_obj->base);
+#endif
+
+	if (job->fence)
+		dma_fence_put(job->fence);
+
+	if (job->args_owner)
+		kfree(job->args);
+
+	kfree(job);
+}
+
+static int rknpu_job_cleanup(struct rknpu_job *job)
+{
+	rknpu_job_free(job);
+
+	return 0;
+}
+
+static void rknpu_job_cleanup_work(struct work_struct *work)
+{
+	struct rknpu_job *job =
+		container_of(work, struct rknpu_job, cleanup_work);
+
+	rknpu_job_cleanup(job);
+}
+
+static inline struct rknpu_job *rknpu_job_alloc(struct rknpu_device *rknpu_dev,
+						struct rknpu_submit *args)
+{
+	struct rknpu_job *job = NULL;
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+	struct rknpu_gem_object *task_obj = NULL;
+#endif
+	if (rknpu_dev->config->num_irqs == 1)
+		args->core_mask = RKNPU_CORE0_MASK;
+
+	job = kzalloc(sizeof(*job), GFP_KERNEL);
+	if (!job)
+		return NULL;
+
+	job->timestamp = ktime_get();
+	job->rknpu_dev = rknpu_dev;
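+	/* use_core_num = number of cores selected in core_mask (bits 0-2) */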
+	job->use_core_num = (args->core_mask & RKNPU_CORE0_MASK) +
+			    ((args->core_mask & RKNPU_CORE1_MASK) >> 1) +
+			    ((args->core_mask & RKNPU_CORE2_MASK) >> 2);
+	job->run_count = job->use_core_num;
+	job->interrupt_count = job->use_core_num;
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+	task_obj = (struct rknpu_gem_object *)(uintptr_t)args->task_obj_addr;
+	if (task_obj)
+		rknpu_gem_object_get(&task_obj->base);
+#endif
+
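+	/*
+	 * Blocking submits can use the caller's args directly; non-blocking
+	 * submits copy them so they outlive the ioctl.
+	 */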
+	if (!(args->flags & RKNPU_JOB_NONBLOCK)) {
+		job->args = args;
+		job->args_owner = false;
+		return job;
+	}
+
+	job->args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!job->args) {
+		kfree(job);
+		return NULL;
+	}
+	*job->args = *args;
+	job->args_owner = true;
+
+	INIT_WORK(&job->cleanup_work, rknpu_job_cleanup_work);
+
+	return job;
+}
+
+static inline int rknpu_job_wait(struct rknpu_job *job)
+{
+	struct rknpu_device *rknpu_dev = job->rknpu_dev;
+	struct rknpu_submit *args = job->args;
+	struct rknpu_task *last_task = NULL;
+	struct rknpu_subcore_data *subcore_data = NULL;
+	void __iomem *rknpu_core_base = NULL;
+	int core_index = rknpu_core_index(job->args->core_mask);
+	unsigned long flags;
+	int wait_count = 0;
+	int ret = -EINVAL;
+
+	subcore_data = &rknpu_dev->subcore_datas[core_index];
+
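+	/*
+	 * Wait for the job to complete; repeat while it is still queued,
+	 * each round bounded by the caller's timeout, for at most three
+	 * rounds. A soft reset also wakes the queue up.
+	 */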
+	do {
+		ret = wait_event_interruptible_timeout(
+			subcore_data->job_done_wq,
+			job->flags & RKNPU_JOB_DONE || rknpu_dev->soft_reseting,
+			msecs_to_jiffies(args->timeout));
+		if (++wait_count >= 3)
+			break;
+	} while (ret == 0 && job->in_queue[core_index]);
+
+	if (job->in_queue[core_index]) {
+		spin_lock_irqsave(&rknpu_dev->lock, flags);
+		list_del_init(&job->head[core_index]);
+		subcore_data->task_num -= rknn_get_task_number(job, core_index);
+		job->in_queue[core_index] = false;
+		spin_unlock_irqrestore(&rknpu_dev->lock, flags);
+		return ret < 0 ? ret : -EINVAL;
+	}
+
+	last_task = job->last_task;
+	if (!last_task)
+		return ret < 0 ? ret : -EINVAL;
+
+	last_task->int_status = job->int_status[core_index];
+
+	if (ret <= 0) {
+		args->task_counter = 0;
+		rknpu_core_base = rknpu_dev->base[core_index];
+		if (args->flags & RKNPU_JOB_PC) {
+			uint32_t task_status = REG_READ(
+				rknpu_dev->config->pc_task_status_offset);
+			args->task_counter =
+				(task_status &
+				 rknpu_dev->config->pc_task_number_mask);
+		}
+
+		LOG_ERROR(
+			"failed to wait job, task counter: %d, flags: %#x, ret = %d, elapsed time: %lldus\n",
+			args->task_counter, args->flags, ret,
+			ktime_to_us(ktime_sub(ktime_get(), job->timestamp)));
+
+		return ret < 0 ? ret : -ETIMEDOUT;
+	}
+
+	if (!(job->flags & RKNPU_JOB_DONE))
+		return -EINVAL;
+
+	args->task_counter = args->task_number;
+
+	return 0;
+}
+
+static inline int rknpu_job_commit_pc(struct rknpu_job *job, int core_index)
+{
+	struct rknpu_device *rknpu_dev = job->rknpu_dev;
+	struct rknpu_submit *args = job->args;
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+	struct rknpu_gem_object *task_obj =
+		(struct rknpu_gem_object *)(uintptr_t)args->task_obj_addr;
+#endif
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+	struct rknpu_mem_object *task_obj =
+		(struct rknpu_mem_object *)(uintptr_t)args->task_obj_addr;
+#endif
+	struct rknpu_task *task_base = NULL;
+	struct rknpu_task *first_task = NULL;
+	struct rknpu_task *last_task = NULL;
+	void __iomem *rknpu_core_base = rknpu_dev->base[core_index];
+	int task_start = args->task_start;
+	int task_end = args->task_start + args->task_number - 1;
+	int task_number = args->task_number;
+	int task_pp_en = args->flags & RKNPU_JOB_PINGPONG ? 1 : 0;
+	int pc_data_amount_scale = rknpu_dev->config->pc_data_amount_scale;
+	int pc_task_number_bits = rknpu_dev->config->pc_task_number_bits;
+	int i = 0;
+
+	if (!task_obj)
+		return -EINVAL;
+
+	if (rknpu_dev->config->num_irqs > 1) {
+		for (i = 0; i < rknpu_dev->config->num_irqs; i++) {
+			if (i == core_index) {
+				REG_WRITE((0xe + 0x10000000 * i), 0x1004);
+				REG_WRITE((0xe + 0x10000000 * i), 0x3004);
+			}
+		}
+
+		if (job->use_core_num == 1 || job->use_core_num == 2) {
+			task_start = args->subcore_task[core_index].task_start;
+			task_end = args->subcore_task[core_index].task_start +
+				   args->subcore_task[core_index].task_number -
+				   1;
+			task_number =
+				args->subcore_task[core_index].task_number;
+		} else if (job->use_core_num == 3) {
+			task_start =
+				args->subcore_task[core_index + 2].task_start;
+			task_end =
+				args->subcore_task[core_index + 2].task_start +
+				args->subcore_task[core_index + 2].task_number -
+				1;
+			task_number =
+				args->subcore_task[core_index + 2].task_number;
+		}
+	}
+
+	task_base = task_obj->kv_addr;
+
+	first_task = &task_base[task_start];
+	last_task = &task_base[task_end];
+
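+	/* Program the PC unit registers and the task list for this submit. */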
+	REG_WRITE(first_task->regcmd_addr, RKNPU_OFFSET_PC_DATA_ADDR);
+
+	REG_WRITE((first_task->regcfg_amount + RKNPU_PC_DATA_EXTRA_AMOUNT +
+		   pc_data_amount_scale - 1) /
+				  pc_data_amount_scale -
+			  1,
+		  RKNPU_OFFSET_PC_DATA_AMOUNT);
+
+	REG_WRITE(last_task->int_mask, RKNPU_OFFSET_INT_MASK);
+
+	REG_WRITE(first_task->int_mask, RKNPU_OFFSET_INT_CLEAR);
+
+	REG_WRITE(((0x6 | task_pp_en) << pc_task_number_bits) | task_number,
+		  RKNPU_OFFSET_PC_TASK_CONTROL);
+
+	REG_WRITE(args->task_base_addr, RKNPU_OFFSET_PC_DMA_BASE_ADDR);
+
+	job->first_task = first_task;
+	job->last_task = last_task;
+	job->int_mask[core_index] = last_task->int_mask;
+
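+	/* Pulse the op-enable bit to kick off execution. */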
+	REG_WRITE(0x1, RKNPU_OFFSET_PC_OP_EN);
+	REG_WRITE(0x0, RKNPU_OFFSET_PC_OP_EN);
+
+	return 0;
+}
+
+static int rknpu_job_commit(struct rknpu_job *job, int core_index)
+{
+	struct rknpu_device *rknpu_dev = job->rknpu_dev;
+	struct rknpu_submit *args = job->args;
+	void __iomem *rknpu_core_base = rknpu_dev->base[core_index];
+
+	/* switch to slave mode */
+	REG_WRITE(0x1, RKNPU_OFFSET_PC_DATA_ADDR);
+
+	if (!(args->flags & RKNPU_JOB_PC))
+		return -EINVAL;
+
+	return rknpu_job_commit_pc(job, core_index);
+}
+
+static void rknpu_job_next(struct rknpu_device *rknpu_dev, int core_index)
+{
+	struct rknpu_job *job = NULL;
+	struct rknpu_subcore_data *subcore_data = NULL;
+	unsigned long flags;
+
+	if (rknpu_dev->soft_reseting)
+		return;
+
+	subcore_data = &rknpu_dev->subcore_datas[core_index];
+
+	spin_lock_irqsave(&rknpu_dev->irq_lock, flags);
+
+	if (subcore_data->job || list_empty(&subcore_data->todo_list)) {
+		spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+		return;
+	}
+
+	job = list_first_entry(&subcore_data->todo_list, struct rknpu_job,
+			       head[core_index]);
+
+	list_del_init(&job->head[core_index]);
+	job->in_queue[core_index] = false;
+	subcore_data->job = job;
+	job->run_count--;
+	job->hw_recoder_time = ktime_get();
+	spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+
+	if (job->run_count == 0) {
+		if (job->args->core_mask & RKNPU_CORE0_MASK)
+			job->ret = rknpu_job_commit(job, 0);
+		if (job->args->core_mask & RKNPU_CORE1_MASK)
+			job->ret = rknpu_job_commit(job, 1);
+		if (job->args->core_mask & RKNPU_CORE2_MASK)
+			job->ret = rknpu_job_commit(job, 2);
+	}
+}
+
+static void rknpu_job_done(struct rknpu_job *job, int ret, int core_index)
+{
+	struct rknpu_device *rknpu_dev = job->rknpu_dev;
+	struct rknpu_subcore_data *subcore_data = NULL;
+	unsigned long flags;
+	ktime_t now = ktime_get();
+
+	subcore_data = &rknpu_dev->subcore_datas[core_index];
+
+	spin_lock_irqsave(&rknpu_dev->irq_lock, flags);
+	subcore_data->job = NULL;
+	subcore_data->task_num -= rknn_get_task_number(job, core_index);
+	job->interrupt_count--;
+	subcore_data->timer.busy_time +=
+		ktime_us_delta(now, job->hw_recoder_time);
+	spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+
+	if (job->interrupt_count == 0) {
+		int use_core_num = job->use_core_num;
+
+		job->flags |= RKNPU_JOB_DONE;
+		job->ret = ret;
+
+		if (job->fence)
+			dma_fence_signal(job->fence);
+
+		if (job->flags & RKNPU_JOB_ASYNC)
+			schedule_work(&job->cleanup_work);
+
+		if (use_core_num > 1)
+			wake_up(&rknpu_dev->subcore_datas[0].job_done_wq);
+		else
+			wake_up(&subcore_data->job_done_wq);
+	}
+
+	rknpu_job_next(rknpu_dev, core_index);
+}
+
+static void rknpu_job_schedule(struct rknpu_job *job)
+{
+	struct rknpu_device *rknpu_dev = job->rknpu_dev;
+	struct rknpu_subcore_data *subcore_data = NULL;
+	int i = 0, core_index = 0;
+	unsigned long flags;
+	int task_num_list[3] = { 0, 1, 2 };
+	int tmp = 0;
+
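+	/*
+	 * Auto core selection: order the cores by pending task count and
+	 * pick the first idle one, falling back to the least loaded.
+	 */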
+	if ((job->args->core_mask & 0x07) == RKNPU_CORE_AUTO_MASK) {
+		if (rknpu_dev->subcore_datas[0].task_num >
+		    rknpu_dev->subcore_datas[1].task_num) {
+			tmp = task_num_list[1];
+			task_num_list[1] = task_num_list[0];
+			task_num_list[0] = tmp;
+		}
+		if (rknpu_dev->subcore_datas[task_num_list[0]].task_num >
+		    rknpu_dev->subcore_datas[2].task_num) {
+			tmp = task_num_list[2];
+			task_num_list[2] = task_num_list[1];
+			task_num_list[1] = task_num_list[0];
+			task_num_list[0] = tmp;
+		} else if (rknpu_dev->subcore_datas[task_num_list[1]].task_num >
+			   rknpu_dev->subcore_datas[2].task_num) {
+			tmp = task_num_list[2];
+			task_num_list[2] = task_num_list[1];
+			task_num_list[1] = tmp;
+		}
+		if (!rknpu_dev->subcore_datas[task_num_list[0]].job)
+			core_index = task_num_list[0];
+		else if (!rknpu_dev->subcore_datas[task_num_list[1]].job)
+			core_index = task_num_list[1];
+		else if (!rknpu_dev->subcore_datas[task_num_list[2]].job)
+			core_index = task_num_list[2];
+		else
+			core_index = task_num_list[0];
+
+		job->args->core_mask = rknpu_core_mask(core_index);
+		job->use_core_num = 1;
+		job->interrupt_count = 1;
+		job->run_count = 1;
+	}
+
+	for (i = 0; i < rknpu_dev->config->num_irqs; i++) {
+		if (job->args->core_mask & rknpu_core_mask(i)) {
+			subcore_data = &rknpu_dev->subcore_datas[i];
+			spin_lock_irqsave(&rknpu_dev->irq_lock, flags);
+			list_add_tail(&job->head[i], &subcore_data->todo_list);
+			subcore_data->task_num += rknn_get_task_number(job, i);
+			job->in_queue[i] = true;
+			spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+		}
+	}
+
+	for (i = 0; i < rknpu_dev->config->num_irqs; i++) {
+		if (job->args->core_mask & rknpu_core_mask(i))
+			rknpu_job_next(rknpu_dev, i);
+	}
+}
+
+static void rknpu_job_abort(struct rknpu_job *job)
+{
+	struct rknpu_device *rknpu_dev = job->rknpu_dev;
+	struct rknpu_subcore_data *subcore_data = NULL;
+	int core_index = rknpu_core_index(job->args->core_mask);
+	void __iomem *rknpu_core_base = rknpu_dev->base[core_index];
+	unsigned long flags;
+	int i = 0;
+
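+	/* Give a pending interrupt a chance to be delivered first. */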
+	msleep(100);
+
+	for (i = 0; i < rknpu_dev->config->num_irqs; i++) {
+		if (job->args->core_mask & rknpu_core_mask(i)) {
+			subcore_data = &rknpu_dev->subcore_datas[i];
+			spin_lock_irqsave(&rknpu_dev->irq_lock, flags);
+			if (job == subcore_data->job && !job->irq_entry[i]) {
+				subcore_data->job = NULL;
+				subcore_data->task_num -=
+					rknn_get_task_number(job, i);
+			}
+			spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+		}
+	}
+
+	if (job->ret == -ETIMEDOUT) {
+		LOG_ERROR(
+			"job timeout, flags: %#x, irq status: %#x, raw status: %#x, require mask: %#x, task counter: %#x, elapsed time: %lldus\n",
+			job->flags, REG_READ(RKNPU_OFFSET_INT_STATUS),
+			REG_READ(RKNPU_OFFSET_INT_RAW_STATUS),
+			job->int_mask[core_index],
+			(REG_READ(rknpu_dev->config->pc_task_status_offset) &
+			 rknpu_dev->config->pc_task_number_mask),
+			ktime_to_us(ktime_sub(ktime_get(), job->timestamp)));
+		rknpu_soft_reset(rknpu_dev);
+	} else {
+		LOG_ERROR(
+			"job abort, flags: %#x, ret: %d, elapsed time: %lldus\n",
+			job->flags, job->ret,
+			ktime_to_us(ktime_sub(ktime_get(), job->timestamp)));
+	}
+
+	rknpu_job_cleanup(job);
+}
+
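+/*
+ * Collapse the interrupt status into two-bit groups: if either bit of a
+ * pair is set, report the whole pair as set. This tolerates either of the
+ * paired status bits when matching against the expected interrupt mask.
+ */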
+static inline uint32_t rknpu_fuzz_status(uint32_t status)
+{
+	uint32_t fuzz_status = 0;
+
+	if ((status & 0x3) != 0)
+		fuzz_status |= 0x3;
+
+	if ((status & 0xc) != 0)
+		fuzz_status |= 0xc;
+
+	if ((status & 0x30) != 0)
+		fuzz_status |= 0x30;
+
+	if ((status & 0xc0) != 0)
+		fuzz_status |= 0xc0;
+
+	if ((status & 0x300) != 0)
+		fuzz_status |= 0x300;
+
+	if ((status & 0xc00) != 0)
+		fuzz_status |= 0xc00;
+
+	return fuzz_status;
+}
+
+static inline irqreturn_t rknpu_irq_handler(int irq, void *data, int core_index)
+{
+	struct rknpu_device *rknpu_dev = data;
+	void __iomem *rknpu_core_base = rknpu_dev->base[core_index];
+	struct rknpu_subcore_data *subcore_data = NULL;
+	struct rknpu_job *job = NULL;
+	uint32_t status = 0;
+	unsigned long flags;
+
+	subcore_data = &rknpu_dev->subcore_datas[core_index];
+
+	spin_lock_irqsave(&rknpu_dev->irq_lock, flags);
+	job = subcore_data->job;
+	if (!job) {
+		spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+		REG_WRITE(RKNPU_INT_CLEAR, RKNPU_OFFSET_INT_CLEAR);
+		rknpu_job_next(rknpu_dev, core_index);
+		return IRQ_HANDLED;
+	}
+	job->irq_entry[core_index] = true;
+	spin_unlock_irqrestore(&rknpu_dev->irq_lock, flags);
+
+	status = REG_READ(RKNPU_OFFSET_INT_STATUS);
+
+	job->int_status[core_index] = status;
+
+	if (rknpu_fuzz_status(status) != job->int_mask[core_index]) {
+		LOG_ERROR(
+			"invalid irq status: %#x, raw status: %#x, require mask: %#x, task counter: %#x\n",
+			status, REG_READ(RKNPU_OFFSET_INT_RAW_STATUS),
+			job->int_mask[core_index],
+			(REG_READ(rknpu_dev->config->pc_task_status_offset) &
+			 rknpu_dev->config->pc_task_number_mask));
+		REG_WRITE(RKNPU_INT_CLEAR, RKNPU_OFFSET_INT_CLEAR);
+		return IRQ_HANDLED;
+	}
+
+	REG_WRITE(RKNPU_INT_CLEAR, RKNPU_OFFSET_INT_CLEAR);
+
+	rknpu_job_done(job, 0, core_index);
+
+	return IRQ_HANDLED;
+}
+
+irqreturn_t rknpu_core0_irq_handler(int irq, void *data)
+{
+	return rknpu_irq_handler(irq, data, 0);
+}
+
+irqreturn_t rknpu_core1_irq_handler(int irq, void *data)
+{
+	return rknpu_irq_handler(irq, data, 1);
+}
+
+irqreturn_t rknpu_core2_irq_handler(int irq, void *data)
+{
+	return rknpu_irq_handler(irq, data, 2);
+}
+
+static void rknpu_job_timeout_clean(struct rknpu_device *rknpu_dev,
+				    int core_mask)
+{
+	struct rknpu_job *job = NULL;
+	unsigned long flags;
+	ktime_t now = ktime_get();
+	struct rknpu_subcore_data *subcore_data = NULL;
+	int i = 0;
+
+	for (i = 0; i < rknpu_dev->config->num_irqs; i++) {
+		if (core_mask & rknpu_core_mask(i)) {
+			subcore_data = &rknpu_dev->subcore_datas[i];
+			job = subcore_data->job;
+			if (job &&
+			    ktime_to_ms(ktime_sub(now, job->timestamp)) >=
+				    job->args->timeout) {
+				rknpu_soft_reset(rknpu_dev);
+
+				spin_lock_irqsave(&rknpu_dev->irq_lock, flags);
+				subcore_data->job = NULL;
+				spin_unlock_irqrestore(&rknpu_dev->irq_lock,
+						       flags);
+
+				do {
+					schedule_work(&job->cleanup_work);
+
+					spin_lock_irqsave(&rknpu_dev->irq_lock,
+							  flags);
+
+					if (!list_empty(
+						    &subcore_data->todo_list)) {
+						job = list_first_entry(
+							&subcore_data->todo_list,
+							struct rknpu_job,
+							head[i]);
+						list_del_init(&job->head[i]);
+						job->in_queue[i] = false;
+					} else {
+						job = NULL;
+					}
+
+					spin_unlock_irqrestore(
+						&rknpu_dev->irq_lock, flags);
+				} while (job);
+			}
+		}
+	}
+}
+
+static int rknpu_submit(struct rknpu_device *rknpu_dev,
+			struct rknpu_submit *args)
+{
+	struct rknpu_job *job = NULL;
+	int ret = -EINVAL;
+
+	if (args->task_number == 0) {
+		LOG_ERROR("invalid rknpu task number!\n");
+		return -EINVAL;
+	}
+
+	job = rknpu_job_alloc(rknpu_dev, args);
+	if (!job) {
+		LOG_ERROR("failed to allocate rknpu job!\n");
+		return -ENOMEM;
+	}
+
+	if (args->flags & RKNPU_JOB_FENCE_IN) {
+#ifdef CONFIG_ROCKCHIP_RKNPU_FENCE
+		struct dma_fence *in_fence;
+
+		in_fence = sync_file_get_fence(args->fence_fd);
+
+		if (!in_fence) {
+			LOG_ERROR("invalid fence in fd, fd: %d\n",
+				  args->fence_fd);
+			rknpu_job_free(job);
+			return -EINVAL;
+		}
+		args->fence_fd = -1;
+
+		/*
+		 * Wait if the fence is from a foreign context, or if the fence
+		 * array contains any fence from a foreign context.
+		 */
+		ret = 0;
+		if (!dma_fence_match_context(in_fence,
+					     rknpu_dev->fence_ctx->context))
+			ret = dma_fence_wait_timeout(in_fence, true,
+						     args->timeout);
+		dma_fence_put(in_fence);
+		if (ret < 0) {
+			if (ret != -ERESTARTSYS)
+				LOG_ERROR("Error (%d) waiting for fence!\n",
+					  ret);
+
+			rknpu_job_free(job);
+			return ret;
+		}
+#else
+		LOG_ERROR(
+			"failed to use rknpu fence, please enable rknpu fence config!\n");
+		rknpu_job_free(job);
+		return -EINVAL;
+#endif
+	}
+
+	if (args->flags & RKNPU_JOB_FENCE_OUT) {
+#ifdef CONFIG_ROCKCHIP_RKNPU_FENCE
+		ret = rknpu_fence_alloc(job);
+		if (ret) {
+			rknpu_job_free(job);
+			return ret;
+		}
+		job->args->fence_fd = rknpu_fence_get_fd(job);
+		args->fence_fd = job->args->fence_fd;
+#else
+		LOG_ERROR(
+			"failed to use rknpu fence, please enable rknpu fence config!\n");
+		rknpu_job_free(job);
+		return -EINVAL;
+#endif
+	}
+
+	if (args->flags & RKNPU_JOB_NONBLOCK) {
+		job->flags |= RKNPU_JOB_ASYNC;
+		rknpu_job_timeout_clean(rknpu_dev, job->args->core_mask);
+		rknpu_job_schedule(job);
+		ret = job->ret;
+		if (ret) {
+			rknpu_job_abort(job);
+			return ret;
+		}
+	} else {
+		rknpu_job_schedule(job);
+		if (args->flags & RKNPU_JOB_PC)
+			job->ret = rknpu_job_wait(job);
+
+		args->task_counter = job->args->task_counter;
+		ret = job->ret;
+		if (!ret)
+			rknpu_job_cleanup(job);
+		else
+			rknpu_job_abort(job);
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DRM_GEM
+int rknpu_submit_ioctl(struct drm_device *dev, void *data,
+		       struct drm_file *file_priv)
+{
+	struct rknpu_device *rknpu_dev = dev_get_drvdata(dev->dev);
+	struct rknpu_submit *args = data;
+
+	return rknpu_submit(rknpu_dev, args);
+}
+#endif
+
+#ifdef CONFIG_ROCKCHIP_RKNPU_DMA_HEAP
+int rknpu_submit_ioctl(struct rknpu_device *rknpu_dev, unsigned long data)
+{
+	struct rknpu_submit args;
+	int ret = -EINVAL;
+
+	if (unlikely(copy_from_user(&args, (struct rknpu_submit __user *)data,
+				    sizeof(struct rknpu_submit)))) {
+		LOG_ERROR("%s: copy_from_user failed\n", __func__);
+		ret = -EFAULT;
+		return ret;
+	}
+
+	ret = rknpu_submit(rknpu_dev, &args);
+
+	if (unlikely(copy_to_user((struct rknpu_submit __user *)data, &args,
+				  sizeof(struct rknpu_submit)))) {
+		LOG_ERROR("%s: copy_to_user failed\n", __func__);
+		ret = -EFAULT;
+		return ret;
+	}
+
+	return ret;
+}
+#endif
+
+int rknpu_get_hw_version(struct rknpu_device *rknpu_dev, uint32_t *version)
+{
+	void __iomem *rknpu_core_base = rknpu_dev->base[0];
+
+	if (version == NULL)
+		return -EINVAL;
+
+	*version = REG_READ(RKNPU_OFFSET_VERSION) +
+		   (REG_READ(RKNPU_OFFSET_VERSION_NUM) & 0xffff);
+
+	return 0;
+}
+
+int rknpu_get_bw_priority(struct rknpu_device *rknpu_dev, uint32_t *priority,
+			  uint32_t *expect, uint32_t *tw)
+{
+	void __iomem *base = rknpu_dev->bw_priority_base;
+
+	if (!rknpu_dev->config->bw_enable) {
+		LOG_WARN("Get bw_priority is not supported on this device!\n");
+		return 0;
+	}
+
+	if (!base)
+		return -EINVAL;
+
+	spin_lock(&rknpu_dev->lock);
+
+	if (priority != NULL)
+		*priority = _REG_READ(base, 0x0);
+
+	if (expect != NULL)
+		*expect = _REG_READ(base, 0x8);
+
+	if (tw != NULL)
+		*tw = _REG_READ(base, 0xc);
+
+	spin_unlock(&rknpu_dev->lock);
+
+	return 0;
+}
+
+int rknpu_set_bw_priority(struct rknpu_device *rknpu_dev, uint32_t priority,
+			  uint32_t expect, uint32_t tw)
+{
+	void __iomem *base = rknpu_dev->bw_priority_base;
+
+	if (!rknpu_dev->config->bw_enable) {
+		LOG_WARN("Set bw_priority is not supported on this device!\n");
+		return 0;
+	}
+
+	if (!base)
+		return -EINVAL;
+
+	spin_lock(&rknpu_dev->lock);
+
+	if (priority != 0)
+		_REG_WRITE(base, priority, 0x0);
+
+	if (expect != 0)
+		_REG_WRITE(base, expect, 0x8);
+
+	if (tw != 0)
+		_REG_WRITE(base, tw, 0xc);
+
+	spin_unlock(&rknpu_dev->lock);
+
+	return 0;
+}
+
+int rknpu_clear_rw_amount(struct rknpu_device *rknpu_dev)
+{
+	void __iomem *rknpu_core_base = rknpu_dev->base[0];
+
+	if (!rknpu_dev->config->bw_enable) {
+		LOG_WARN("Clear rw_amount is not supported on this device!\n");
+		return 0;
+	}
+
+	spin_lock(&rknpu_dev->lock);
+
+	REG_WRITE(0x80000101, RKNPU_OFFSET_CLR_ALL_RW_AMOUNT);
+	REG_WRITE(0x00000101, RKNPU_OFFSET_CLR_ALL_RW_AMOUNT);
+
+	spin_unlock(&rknpu_dev->lock);
+
+	return 0;
+}
+
+int rknpu_get_rw_amount(struct rknpu_device *rknpu_dev, uint32_t *dt_wr,
+			uint32_t *dt_rd, uint32_t *wd_rd)
+{
+	void __iomem *rknpu_core_base = rknpu_dev->base[0];
+	int amount_scale = rknpu_dev->config->pc_data_amount_scale;
+
+	if (!rknpu_dev->config->bw_enable) {
+		LOG_WARN("Get rw_amount is not supported on this device!\n");
+		return 0;
+	}
+
+	spin_lock(&rknpu_dev->lock);
+
+	if (dt_wr != NULL)
+		*dt_wr = REG_READ(RKNPU_OFFSET_DT_WR_AMOUNT) * amount_scale;
+
+	if (dt_rd != NULL)
+		*dt_rd = REG_READ(RKNPU_OFFSET_DT_RD_AMOUNT) * amount_scale;
+
+	if (wd_rd != NULL)
+		*wd_rd = REG_READ(RKNPU_OFFSET_WT_RD_AMOUNT) * amount_scale;
+
+	spin_unlock(&rknpu_dev->lock);
+
+	return 0;
+}
+
+int rknpu_get_total_rw_amount(struct rknpu_device *rknpu_dev, uint32_t *amount)
+{
+	uint32_t dt_wr = 0;
+	uint32_t dt_rd = 0;
+	uint32_t wd_rd = 0;
+	int ret = -EINVAL;
+
+	if (!rknpu_dev->config->bw_enable) {
+		LOG_WARN(
+			"Get total_rw_amount is not supported on this device!\n");
+		return 0;
+	}
+
+	ret = rknpu_get_rw_amount(rknpu_dev, &dt_wr, &dt_rd, &wd_rd);
+
+	if (amount != NULL)
+		*amount = dt_wr + dt_rd + wd_rd;
+
+	return ret;
+}
diff --git a/drivers/rknpu/rknpu_mem.c b/drivers/rknpu/rknpu_mem.c
new file mode 100644
index 0000000000000000000000000000000000000000..5535598f2acabe9ca1ff811c3f0b90955296ecbc
--- /dev/null
+++ b/drivers/rknpu/rknpu_mem.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#include <linux/version.h>
+#include <linux/rk-dma-heap.h>
+
+#if KERNEL_VERSION(5, 10, 0) <= LINUX_VERSION_CODE
+#include <linux/dma-map-ops.h>
+#endif
+
+#include "rknpu_drv.h"
+#include "rknpu_ioctl.h"
+#include "rknpu_mem.h"
+
+int rknpu_mem_create_ioctl(struct rknpu_device *rknpu_dev, unsigned long data)
+{
+	struct rknpu_mem_create args;
+	int ret = -EINVAL;
+	struct dma_buf_attachment *attachment;
+	struct sg_table *table;
+	struct scatterlist *sgl;
+	dma_addr_t phys;
+	struct dma_buf *dmabuf;
+	struct page **pages;
+	struct page *page;
+	struct rknpu_mem_object *rknpu_obj = NULL;
+	int i, fd;
+	unsigned int length, page_count;
+
+	if (unlikely(copy_from_user(&args,
+				    (struct rknpu_mem_create __user *)data,
+				    sizeof(struct rknpu_mem_create)))) {
+		LOG_ERROR("%s: copy_from_user failed\n", __func__);
+		ret = -EFAULT;
+		return ret;
+	}
+
+	if (args.flags & RKNPU_MEM_NON_CONTIGUOUS) {
+		LOG_ERROR("%s: malloc iommu memory unsupported in current!\n",
+			  __func__);
+		ret = -EINVAL;
+		return ret;
+	}
+
+	rknpu_obj = kzalloc(sizeof(*rknpu_obj), GFP_KERNEL);
+	if (!rknpu_obj)
+		return -ENOMEM;
+
+	if (args.handle > 0) {
+		fd = args.handle;
+
+		dmabuf = dma_buf_get(fd);
+		if (IS_ERR(dmabuf)) {
+			ret = PTR_ERR(dmabuf);
+			goto err_free_obj;
+		}
+
+		rknpu_obj->dmabuf = dmabuf;
+		rknpu_obj->owner = 0;
+	} else {
+		/* Allocate a new buffer from the rk dma-heap */
+		dmabuf = rk_dma_heap_buffer_alloc(rknpu_dev->heap, args.size,
+						  O_CLOEXEC | O_RDWR, 0x0,
+						  dev_name(rknpu_dev->dev));
+		if (IS_ERR(dmabuf)) {
+			ret = PTR_ERR(dmabuf);
+			goto err_free_obj;
+		}
+
+		rknpu_obj->dmabuf = dmabuf;
+		rknpu_obj->owner = 1;
+
+		fd = dma_buf_fd(dmabuf, O_CLOEXEC | O_RDWR);
+		if (fd < 0) {
+			ret = -EFAULT;
+			goto err_free_dma_buf;
+		}
+	}
+
+	attachment = dma_buf_attach(dmabuf, rknpu_dev->dev);
+	if (IS_ERR(attachment)) {
+		ret = PTR_ERR(attachment);
+		goto err_free_dma_buf;
+	}
+
+	table = dma_buf_map_attachment(attachment, DMA_BIDIRECTIONAL);
+	if (IS_ERR(table)) {
+		dma_buf_detach(dmabuf, attachment);
+		ret = PTR_ERR(table);
+		goto err_free_dma_buf;
+	}
+
+	for_each_sgtable_sg(table, sgl, i) {
+		phys = sg_dma_address(sgl);
+		page = sg_page(sgl);
+		length = sg_dma_len(sgl);
+		LOG_DEBUG("%s, %d, phys: %pad, length: %u\n", __func__,
+			  __LINE__, &phys, length);
+	}
+
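+	/*
+	 * Only physically contiguous buffers are supported here, so the
+	 * values from the last (single) sg entry describe the whole buffer.
+	 */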
+	page_count = length >> PAGE_SHIFT;
+	pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto err_detach_dma_buf;
+	}
+
+	for (i = 0; i < page_count; i++)
+		pages[i] = &page[i];
+
+	rknpu_obj->kv_addr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
+	if (!rknpu_obj->kv_addr) {
+		ret = -ENOMEM;
+		goto err_free_pages;
+	}
+
+	rknpu_obj->size = PAGE_ALIGN(args.size);
+	rknpu_obj->dma_addr = phys;
+	rknpu_obj->sgt = table;
+
+	args.size = rknpu_obj->size;
+	args.obj_addr = (__u64)(uintptr_t)rknpu_obj;
+	args.dma_addr = rknpu_obj->dma_addr;
+	args.handle = fd;
+
+	LOG_DEBUG(
+		"args.handle: %d, args.size: %lld, rknpu_obj: %#llx, rknpu_obj->dma_addr: %#llx\n",
+		args.handle, args.size, (__u64)(uintptr_t)rknpu_obj,
+		(__u64)rknpu_obj->dma_addr);
+
+	if (unlikely(copy_to_user((struct rknpu_mem_create __user *)data,
+				  &args, sizeof(struct rknpu_mem_create)))) {
+		LOG_ERROR("%s: copy_to_user failed\n", __func__);
+		ret = -EFAULT;
+		goto err_unmap_kv_addr;
+	}
+
+	kfree(pages);
+	dma_buf_unmap_attachment(attachment, table, DMA_BIDIRECTIONAL);
+	dma_buf_detach(dmabuf, attachment);
+
+	return 0;
+
+err_unmap_kv_addr:
+	vunmap(rknpu_obj->kv_addr);
+	rknpu_obj->kv_addr = NULL;
+
+err_free_pages:
+	kfree(pages);
+
+err_detach_dma_buf:
+	dma_buf_unmap_attachment(attachment, table, DMA_BIDIRECTIONAL);
+	dma_buf_detach(dmabuf, attachment);
+
+err_free_dma_buf:
+	if (rknpu_obj->owner)
+		rk_dma_heap_buffer_free(dmabuf);
+	else
+		dma_buf_put(dmabuf);
+
+err_free_obj:
+	kfree(rknpu_obj);
+
+	return ret;
+}
+
+int rknpu_mem_destroy_ioctl(struct rknpu_device *rknpu_dev, unsigned long data)
+{
+	struct rknpu_mem_object *rknpu_obj = NULL;
+	struct rknpu_mem_destroy args;
+	struct dma_buf *dmabuf;
+	int ret = -EFAULT;
+
+	if (unlikely(copy_from_user(&args,
+				    (struct rknpu_mem_destroy __user *)data,
+				    sizeof(struct rknpu_mem_destroy)))) {
+		LOG_ERROR("%s: copy_from_user failed\n", __func__);
+		ret = -EFAULT;
+		return ret;
+	}
+
+	rknpu_obj = (struct rknpu_mem_object *)(uintptr_t)args.obj_addr;
+	dmabuf = rknpu_obj->dmabuf;
+	LOG_DEBUG(
+		"free args.handle: %d, rknpu_obj: %#llx, rknpu_obj->dma_addr: %#llx\n",
+		args.handle, (__u64)(uintptr_t)rknpu_obj,
+		(__u64)rknpu_obj->dma_addr);
+
+	vunmap(rknpu_obj->kv_addr);
+	rknpu_obj->kv_addr = NULL;
+
+	if (!rknpu_obj->owner)
+		dma_buf_put(dmabuf);
+
+	kfree(rknpu_obj);
+
+	return 0;
+}
+
+int rknpu_mem_sync_ioctl(struct rknpu_device *rknpu_dev, unsigned long data)
+{
+	struct rknpu_mem_object *rknpu_obj = NULL;
+	struct rknpu_mem_sync args;
+	struct dma_buf *dmabuf;
+	int ret = -EFAULT;
+
+	if (unlikely(copy_from_user(&args, (struct rknpu_mem_sync __user *)data,
+				    sizeof(struct rknpu_mem_sync)))) {
+		LOG_ERROR("%s: copy_from_user failed\n", __func__);
+		ret = -EFAULT;
+		return ret;
+	}
+
+	rknpu_obj = (struct rknpu_mem_object *)(uintptr_t)args.obj_addr;
+	dmabuf = rknpu_obj->dmabuf;
+
+	if (args.flags & RKNPU_MEM_SYNC_TO_DEVICE) {
+		dmabuf->ops->end_cpu_access_partial(dmabuf, DMA_TO_DEVICE,
+						    args.offset, args.size);
+	}
+	if (args.flags & RKNPU_MEM_SYNC_FROM_DEVICE) {
+		dmabuf->ops->begin_cpu_access_partial(dmabuf, DMA_FROM_DEVICE,
+						      args.offset, args.size);
+	}
+
+	return 0;
+}
diff --git a/drivers/rknpu/rknpu_mm.c b/drivers/rknpu/rknpu_mm.c
new file mode 100644
index 0000000000000000000000000000000000000000..9a13c3e256a4226f809f2465818c215611043e8d
--- /dev/null
+++ b/drivers/rknpu/rknpu_mm.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#include "rknpu_debugger.h"
+#include "rknpu_mm.h"
+
+int rknpu_mm_create(unsigned int mem_size, unsigned int chunk_size,
+		    struct rknpu_mm **mm)
+{
+	unsigned int num_of_longs;
+	int ret = -EINVAL;
+
+	if (WARN_ON(mem_size < chunk_size))
+		return -EINVAL;
+	if (WARN_ON(mem_size == 0))
+		return -EINVAL;
+	if (WARN_ON(chunk_size == 0))
+		return -EINVAL;
+
+	*mm = kzalloc(sizeof(struct rknpu_mm), GFP_KERNEL);
+	if (!(*mm))
+		return -ENOMEM;
+
+	(*mm)->chunk_size = chunk_size;
+	(*mm)->total_chunks = mem_size / chunk_size;
+	(*mm)->free_chunks = (*mm)->total_chunks;
+
+	num_of_longs =
+		((*mm)->total_chunks + BITS_PER_LONG - 1) / BITS_PER_LONG;
+
+	(*mm)->bitmap = kcalloc(num_of_longs, sizeof(long), GFP_KERNEL);
+	if (!(*mm)->bitmap) {
+		ret = -ENOMEM;
+		goto free_mm;
+	}
+
+	mutex_init(&(*mm)->lock);
+
+	LOG_DEBUG("total_chunks: %d, bitmap: %p\n", (*mm)->total_chunks,
+		  (*mm)->bitmap);
+
+	return 0;
+
+free_mm:
+	kfree(*mm);
+	*mm = NULL;
+	return ret;
+}
+
+void rknpu_mm_destroy(struct rknpu_mm *mm)
+{
+	if (mm != NULL) {
+		mutex_destroy(&mm->lock);
+		kfree(mm->bitmap);
+		kfree(mm);
+	}
+}
+
+int rknpu_mm_alloc(struct rknpu_mm *mm, unsigned int size,
+		   struct rknpu_mm_obj **mm_obj)
+{
+	unsigned int found, start_search, cur_size;
+
+	if (size == 0)
+		return -EINVAL;
+
+	if (size > mm->total_chunks * mm->chunk_size)
+		return -ENOMEM;
+
+	*mm_obj = kzalloc(sizeof(struct rknpu_mm_obj), GFP_KERNEL);
+	if (!(*mm_obj))
+		return -ENOMEM;
+
+	start_search = 0;
+
+	mutex_lock(&mm->lock);
+
+mm_restart_search:
+	/* Find the first chunk that is free */
+	found = find_next_zero_bit(mm->bitmap, mm->total_chunks, start_search);
+
+	/* If there wasn't any free chunk, bail out */
+	if (found == mm->total_chunks)
+		goto mm_no_free_chunk;
+
+	/* Update fields of mm_obj */
+	(*mm_obj)->range_start = found;
+	(*mm_obj)->range_end = found;
+
+	/* If we need only one chunk, mark it as allocated and get out */
+	if (size <= mm->chunk_size) {
+		set_bit(found, mm->bitmap);
+		goto mm_out;
+	}
+
+	/* Otherwise, try to see if we have enough contiguous chunks */
+	cur_size = size - mm->chunk_size;
+	do {
+		(*mm_obj)->range_end = find_next_zero_bit(
+			mm->bitmap, mm->total_chunks, ++found);
+		/*
+		 * If the next free chunk is not contiguous, restart the
+		 * search from the last free chunk we found (which wasn't
+		 * contiguous with the previous ones).
+		 */
+		if ((*mm_obj)->range_end != found) {
+			start_search = found;
+			goto mm_restart_search;
+		}
+
+		/*
+		 * If we reached end of buffer, bail out with error
+		 */
+		if (found == mm->total_chunks)
+			goto mm_no_free_chunk;
+
+		/* Check if we don't need another chunk */
+		if (cur_size <= mm->chunk_size)
+			cur_size = 0;
+		else
+			cur_size -= mm->chunk_size;
+
+	} while (cur_size > 0);
+
+	/* Mark the chunks as allocated */
+	for (found = (*mm_obj)->range_start; found <= (*mm_obj)->range_end;
+	     found++)
+		set_bit(found, mm->bitmap);
+
+mm_out:
+	mm->free_chunks -= ((*mm_obj)->range_end - (*mm_obj)->range_start + 1);
+	mutex_unlock(&mm->lock);
+
+	LOG_DEBUG("mm allocate, mm_obj: %p, range_start: %d, range_end: %d\n",
+		  *mm_obj, (*mm_obj)->range_start, (*mm_obj)->range_end);
+
+	return 0;
+
+mm_no_free_chunk:
+	mutex_unlock(&mm->lock);
+	kfree(*mm_obj);
+
+	return -ENOMEM;
+}
+
+int rknpu_mm_free(struct rknpu_mm *mm, struct rknpu_mm_obj *mm_obj)
+{
+	unsigned int bit;
+
+	/* Act like kfree when trying to free a NULL object */
+	if (!mm_obj)
+		return 0;
+
+	LOG_DEBUG("mm free, mem_obj: %p, range_start: %d, range_end: %d\n",
+		  mm_obj, mm_obj->range_start, mm_obj->range_end);
+
+	mutex_lock(&mm->lock);
+
+	/* Mark the chunks as free */
+	for (bit = mm_obj->range_start; bit <= mm_obj->range_end; bit++)
+		clear_bit(bit, mm->bitmap);
+
+	mm->free_chunks += (mm_obj->range_end - mm_obj->range_start + 1);
+
+	mutex_unlock(&mm->lock);
+
+	kfree(mm_obj);
+
+	return 0;
+}
+
+int rknpu_mm_dump(struct seq_file *m, void *data)
+{
+	struct rknpu_debugger_node *node = m->private;
+	struct rknpu_debugger *debugger = node->debugger;
+	struct rknpu_device *rknpu_dev =
+		container_of(debugger, struct rknpu_device, debugger);
+	struct rknpu_mm *mm = NULL;
+	int cur = 0, rbot = 0, rtop = 0;
+	size_t ret = 0;
+	char buf[64];
+	size_t size = sizeof(buf);
+	int seg_chunks = 32, seg_id = 0;
+	int free_size = 0;
+	int i = 0;
+
+	mm = rknpu_dev->sram_mm;
+	if (mm == NULL)
+		return 0;
+
+	seq_printf(m, "SRAM bitmap: \"*\" - used, \".\" - free (1bit = %dKB)\n",
+		   mm->chunk_size / 1024);
+
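+	/* Render the allocation bitmap in rows of 32 chunks per line. */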
+	rbot = cur = find_first_bit(mm->bitmap, mm->total_chunks);
+	for (i = 0; i < cur; ++i) {
+		ret += scnprintf(buf + ret, size - ret, ".");
+		if (ret >= seg_chunks) {
+			seq_printf(m, "[%03d] [%s]\n", seg_id++, buf);
+			ret = 0;
+		}
+	}
+	while (cur < mm->total_chunks) {
+		rtop = cur;
+		cur = find_next_bit(mm->bitmap, mm->total_chunks, cur + 1);
+		if (cur < mm->total_chunks && cur <= rtop + 1)
+			continue;
+
+		for (i = rbot; i <= rtop; ++i) {
+			ret += scnprintf(buf + ret, size - ret, "*");
+			if (ret >= seg_chunks) {
+				seq_printf(m, "[%03d] [%s]\n", seg_id++, buf);
+				ret = 0;
+			}
+		}
+
+		for (i = rtop + 1; i < cur; ++i) {
+			ret += scnprintf(buf + ret, size - ret, ".");
+			if (ret >= seg_chunks) {
+				seq_printf(m, "[%03d] [%s]\n", seg_id++, buf);
+				ret = 0;
+			}
+		}
+
+		rbot = cur;
+	}
+
+	if (ret > 0)
+		seq_printf(m, "[%03d] [%s]\n", seg_id++, buf);
+
+	free_size = mm->free_chunks * mm->chunk_size;
+	seq_printf(m, "SRAM total size: %d, used: %d, free: %d\n",
+		   rknpu_dev->sram_size, rknpu_dev->sram_size - free_size,
+		   free_size);
+
+	return 0;
+}
+
+dma_addr_t rknpu_iommu_dma_alloc_iova(struct iommu_domain *domain, size_t size,
+				      u64 dma_limit, struct device *dev)
+{
+	struct rknpu_iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iova_domain *iovad = &cookie->iovad;
+	unsigned long shift, iova_len, iova = 0;
+#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
+	dma_addr_t limit;
+#endif
+
+	shift = iova_shift(iovad);
+	iova_len = size >> shift;
+	/*
+	 * Freeing non-power-of-two-sized allocations back into the IOVA caches
+	 * will come back to bite us badly, so we have to waste a bit of space
+	 * rounding up anything cacheable to make sure that can't happen. The
+	 * order of the unadjusted size will still match upon freeing.
+	 */
+	if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
+		iova_len = roundup_pow_of_two(iova_len);
+
+#if (KERNEL_VERSION(5, 10, 0) <= LINUX_VERSION_CODE)
+	dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);
+#else
+	if (dev->bus_dma_mask)
+		dma_limit &= dev->bus_dma_mask;
+#endif
+
+	if (domain->geometry.force_aperture)
+		dma_limit =
+			min_t(u64, dma_limit, domain->geometry.aperture_end);
+
+#if (KERNEL_VERSION(5, 4, 0) <= LINUX_VERSION_CODE)
+	iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true);
+#else
+	limit = min_t(dma_addr_t, dma_limit >> shift, iovad->end_pfn);
+
+	iova = alloc_iova_fast(iovad, iova_len, limit, true);
+#endif
+
+	return (dma_addr_t)iova << shift;
+}
+
+void rknpu_iommu_dma_free_iova(struct rknpu_iommu_dma_cookie *cookie,
+			       dma_addr_t iova, size_t size)
+{
+	struct iova_domain *iovad = &cookie->iovad;
+
+	free_iova_fast(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad));
+}
diff --git a/drivers/rknpu/rknpu_reset.c b/drivers/rknpu/rknpu_reset.c
new file mode 100644
index 0000000000000000000000000000000000000000..91c9b75d68e77eea1745cd4327a4d27048b427a8
--- /dev/null
+++ b/drivers/rknpu/rknpu_reset.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/iommu.h>
+
+#include "rknpu_reset.h"
+
+#ifndef FPGA_PLATFORM
+static inline struct reset_control *rknpu_reset_control_get(struct device *dev,
+							    const char *name)
+{
+	struct reset_control *rst = NULL;
+
+	rst = devm_reset_control_get(dev, name);
+	if (IS_ERR(rst))
+		LOG_DEV_ERROR(dev,
+			      "failed to get rknpu reset control: %s, %ld\n",
+			      name, PTR_ERR(rst));
+
+	return rst;
+}
+#endif
+
+int rknpu_reset_get(struct rknpu_device *rknpu_dev)
+{
+#ifndef FPGA_PLATFORM
+	struct reset_control *srst_a = NULL;
+	struct reset_control *srst_h = NULL;
+	int i = 0;
+
+	for (i = 0; i < rknpu_dev->config->num_resets; i++) {
+		srst_a = rknpu_reset_control_get(
+			rknpu_dev->dev,
+			rknpu_dev->config->resets[i].srst_a_name);
+		if (IS_ERR(srst_a))
+			return PTR_ERR(srst_a);
+
+		rknpu_dev->srst_a[i] = srst_a;
+
+		srst_h = rknpu_reset_control_get(
+			rknpu_dev->dev,
+			rknpu_dev->config->resets[i].srst_h_name);
+		if (IS_ERR(srst_h))
+			return PTR_ERR(srst_h);
+
+		rknpu_dev->srst_h[i] = srst_h;
+	}
+#endif
+
+	return 0;
+}
+
+#ifndef FPGA_PLATFORM
+static int rknpu_reset_assert(struct reset_control *rst)
+{
+	int ret = -EINVAL;
+
+	if (!rst)
+		return -EINVAL;
+
+	ret = reset_control_assert(rst);
+	if (ret < 0) {
+		LOG_ERROR("failed to assert rknpu reset: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int rknpu_reset_deassert(struct reset_control *rst)
+{
+	int ret = -EINVAL;
+
+	if (!rst)
+		return -EINVAL;
+
+	ret = reset_control_deassert(rst);
+	if (ret < 0) {
+		LOG_ERROR("failed to deassert rknpu reset: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+#endif
+
+int rknpu_soft_reset(struct rknpu_device *rknpu_dev)
+{
+#ifndef FPGA_PLATFORM
+	struct iommu_domain *domain = NULL;
+	struct rknpu_subcore_data *subcore_data = NULL;
+	int ret = -EINVAL, i = 0;
+
+	if (rknpu_dev->bypass_soft_reset) {
+		LOG_WARN("bypass soft reset\n");
+		return 0;
+	}
+
+	if (!mutex_trylock(&rknpu_dev->reset_lock))
+		return 0;
+
+	rknpu_dev->soft_reseting = true;
+
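+	/* Give in-flight jobs a moment to observe the reset flag. */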
+	msleep(100);
+
+	for (i = 0; i < rknpu_dev->config->num_irqs; ++i) {
+		subcore_data = &rknpu_dev->subcore_datas[i];
+		wake_up(&subcore_data->job_done_wq);
+	}
+
+	LOG_INFO("soft reset\n");
+
+	for (i = 0; i < rknpu_dev->config->num_resets; i++) {
+		ret = rknpu_reset_assert(rknpu_dev->srst_a[i]);
+		ret |= rknpu_reset_assert(rknpu_dev->srst_h[i]);
+
+		udelay(10);
+
+		ret |= rknpu_reset_deassert(rknpu_dev->srst_a[i]);
+		ret |= rknpu_reset_deassert(rknpu_dev->srst_h[i]);
+	}
+
+	if (ret) {
+		LOG_DEV_ERROR(rknpu_dev->dev,
+			      "failed to soft reset for rknpu: %d\n", ret);
+		mutex_unlock(&rknpu_dev->reset_lock);
+		return ret;
+	}
+
+	if (rknpu_dev->iommu_en)
+		domain = iommu_get_domain_for_dev(rknpu_dev->dev);
+
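+	/* Re-attach the IOMMU domain to restore a clean translation state. */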
+	if (domain) {
+		iommu_detach_device(domain, rknpu_dev->dev);
+		iommu_attach_device(domain, rknpu_dev->dev);
+	}
+
+	rknpu_dev->soft_reseting = false;
+
+	mutex_unlock(&rknpu_dev->reset_lock);
+#endif
+
+	return 0;
+}