From 562efd41f7ad80693547001c3678e3a83568d8c4 Mon Sep 17 00:00:00 2001 From: Bitao Hu Date: Fri, 28 Nov 2025 16:46:40 +0800 Subject: [PATCH 1/2] anolis: criu: io_uring: support basic checkpoint/restore for io_uring ANBZ: #30028 To support basic checkpoint/restore functionality for io_uring instances, the kernel has been modified as follows: 1. The information exposed in /proc/*/fdinfo for io_uring fd is insufficient for checkpointing and restoring io_uring instances. It lacks the original io_uring setup parameters and does not provide a mapping between VMAs and the corresponding fd. To address this, additional fields are added to fdinfo to expose the necessary details required for accurate restore. 2. The kernel currently does not support transmitting io_uring fd via SCM rights. However, CRIU's checkpoint/restore of io_uring relies on this mechanism to obtain the file options associated with the io_uring fd. To enable this, the kernel is modified to allow SCM-based transmission of io_uring fds. 3. To prevent misuse of the above mechanism, a prctl-based restriction has been introduced for explicit control. Signed-off-by: Bitao Hu Reviewed-by: Liu Song Reviewed-by: hr567 --- fs/Kconfig | 13 +++++ fs/exec.c | 3 + include/linux/sched.h | 4 ++ include/uapi/linux/prctl.h | 5 ++ init/init_task.c | 3 + io_uring/io_uring.c | 115 +++++++++++++++++++++++++++++++++++++ kernel/fork.c | 4 ++ kernel/sys.c | 8 +++ net/core/scm.c | 11 ++++ 9 files changed, 166 insertions(+) diff --git a/fs/Kconfig b/fs/Kconfig index 5e27f498e52a..739ed6eecc19 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -222,6 +222,19 @@ config TMPFS_INODE64 If unsure, say N. +config CR_IO_URING + bool "allow io_uring support checkpoint/restore" + default n + help + This option enables checkpoint/restore support for io_uring. + Note that only basic io_uring functionality is supported. 
When + enabled, additional information will be exposed in fdinfo, and + io_uring file descriptors may be transferred via SCM rights—both + behaviors can be controlled via prctl. Users must carefully + assess the associated risks before enabling this feature. + + If unsure say N. + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ diff --git a/fs/exec.c b/fs/exec.c index 700534200f7f..4d729f5c4833 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1852,6 +1852,9 @@ static int bprm_execve(struct linux_binprm *bprm, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; +#ifdef CONFIG_CR_IO_URING + current->cr_io_uring_enabled = false; +#endif rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); diff --git a/include/linux/sched.h b/include/linux/sched.h index 59f3dbe59395..c43d3acf5ecb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1522,7 +1522,11 @@ struct task_struct { #endif bool proxy_exec; +#ifdef CONFIG_CR_IO_URING + CK_KABI_USE(1, bool cr_io_uring_enabled) +#else CK_KABI_RESERVE(1) +#endif CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) CK_KABI_RESERVE(4) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index efeba34bc0c0..8af1e20f57d5 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -263,4 +263,9 @@ struct prctl_mm_map { #define PR_GET_IDENTITY 1000 #define PR_SET_IDENTITY 1001 +#define PR_RESERVED0 1100 +#define PR_RESERVED1 1101 +#define PR_ENABLE_CR_IO_URING 1102 +#define PR_DISABLE_CR_IO_URING 1103 + #endif /* _LINUX_PRCTL_H */ diff --git a/init/init_task.c b/init/init_task.c index 7ed93be4682d..5fbfa0cc7dcd 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -216,6 +216,9 @@ struct task_struct init_task #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_CR_IO_URING + .cr_io_uring_enabled = false, +#endif }; EXPORT_SYMBOL(init_task); diff --git 
a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5481878aa55a..5e8963b53a8b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -86,6 +86,9 @@
 #include
 
 #include "../fs/internal.h"
+#ifdef CONFIG_CR_IO_URING
+#include "../fs/proc/internal.h"
+#endif
 #include "io-wq.h"
 
 #define IORING_MAX_ENTRIES	32768
@@ -10311,6 +10314,33 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 }
 
 #ifdef CONFIG_PROC_FS
+#ifdef CONFIG_CR_IO_URING
+#define IO_URING_FDINFO_MAPS_SIZE 128
+/*
+ * Append "start-end," for @vma to @buf at *@offset and advance *@offset by
+ * the number of characters stored. An entry that does not fit in the
+ * remaining @size bytes, NUL terminator included, is skipped entirely.
+ */
+static void format_io_uring_mapping(char *buf, struct vm_area_struct *vma,
+				    int *offset, int size)
+{
+	int room = size - *offset;
+	int len;
+
+	/*
+	 * Measure with the real format string: "%08lx" zero-pads short
+	 * addresses, so counting significant hex digits under-reports the
+	 * length, and the formatter also stores a trailing NUL.
+	 */
+	len = snprintf(NULL, 0, "%08lx-%08lx,", vma->vm_start, vma->vm_end);
+	if (len >= room)
+		return;
+
+	*offset += scnprintf(buf + *offset, room, "%08lx-%08lx,",
+			     vma->vm_start, vma->vm_end);
+}
+#endif
+
 static int io_uring_show_cred(struct seq_file *m, unsigned int id,
 		const struct cred *cred)
 {
@@ -10345,6 +10375,17 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
 
 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 {
+#ifdef CONFIG_CR_IO_URING
+	struct io_overflow_cqe *ocqe;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	char sq_mapping[IO_URING_FDINFO_MAPS_SIZE];
+	char cq_mapping[IO_URING_FDINFO_MAPS_SIZE];
+	char sqes_mapping[IO_URING_FDINFO_MAPS_SIZE];
+	int sq_mapping_offset = 0, cq_mapping_offset = 0, sqes_mapping_offset = 0;
+
+#endif
 	struct io_rings *r = ctx->rings;
 	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
 	unsigned int sq_head = READ_ONCE(r->sq.head);
@@ -10481,6 +10522,80 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 					req->task->task_works != NULL);
 	}
spin_unlock(&ctx->completion_lock);
+#ifdef CONFIG_CR_IO_URING
+	if (!current->cr_io_uring_enabled)
+		goto out;
+
+	seq_puts(m, "CqOverflowList:\n");
+	spin_lock(&ctx->completion_lock);
+	list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+		struct io_uring_cqe *cqe = &ocqe->cqe;
+
+		seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
+			   cqe->user_data, cqe->res, cqe->flags);
+	}
+	spin_unlock(&ctx->completion_lock);
+
+	seq_printf(m, "Locked: %d\n", has_lock ? 1 : 0);
+	seq_printf(m, "SqThreadIdle: %u\n", ctx->sq_thread_idle);
+	seq_printf(m, "SetupFlags: 0x%x\n", ctx->flags);
+	seq_printf(m, "SqEntries: %u\n", ctx->sq_entries);
+	seq_printf(m, "CqEntries: %u\n", ctx->cq_entries);
+	seq_printf(m, "SqOffArray: %u\n", (u32)((char *)ctx->sq_array - (char *)ctx->rings));
+
+	task = get_proc_task(m->private);
+	if (!task)
+		goto out;
+	/* Pin the mm: the target task may exit while its VMAs are walked. */
+	mm = get_task_mm(task);
+	if (!mm)
+		goto err;
+	mmap_read_lock(mm);
+
+	memset(sq_mapping, 0, sizeof(sq_mapping));
+	memset(cq_mapping, 0, sizeof(cq_mapping));
+	memset(sqes_mapping, 0, sizeof(sqes_mapping));
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_file && vma->vm_file->private_data == ctx) {
+			unsigned long long pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
+
+			switch (pgoff) {
+			case IORING_OFF_SQ_RING:
+				format_io_uring_mapping(sq_mapping, vma, &sq_mapping_offset,
+							IO_URING_FDINFO_MAPS_SIZE);
+				break;
+			case IORING_OFF_CQ_RING:
+				format_io_uring_mapping(cq_mapping, vma, &cq_mapping_offset,
+							IO_URING_FDINFO_MAPS_SIZE);
+				break;
+			case IORING_OFF_SQES:
+				format_io_uring_mapping(sqes_mapping, vma, &sqes_mapping_offset,
+							IO_URING_FDINFO_MAPS_SIZE);
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	if (sq_mapping[0] || cq_mapping[0] || sqes_mapping[0]) {
+		if (sq_mapping_offset)
+			sq_mapping[sq_mapping_offset - 1] = '\0';
+		if (cq_mapping_offset)
+			cq_mapping[cq_mapping_offset - 1] = '\0';
+		if (sqes_mapping_offset)
+			sqes_mapping[sqes_mapping_offset - 1] = '\0';
+		seq_puts(m, "Mappings:\n");
+		seq_printf(m, " SqRingMapping: %s\n", sq_mapping);
+		seq_printf(m, " CqRingMapping: %s\n", cq_mapping);
+		seq_printf(m, " SQEsMapping: %s\n", sqes_mapping);
+	}
+	mmap_read_unlock(mm);
+	mmput(mm);
+err:
+	put_task_struct(task);
+out:
+#endif
 	if (has_lock)
 		mutex_unlock(&ctx->uring_lock);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b1f2b6a7428..c5274eaa1a1e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1009,6 +1009,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->reported_split_lock = 0;
 #endif
 
+#ifdef CONFIG_CR_IO_URING
+	tsk->cr_io_uring_enabled = false;
+#endif
+
 	return tsk;
 
 free_stack:
diff --git a/kernel/sys.c b/kernel/sys.c
index 93e02a2e1c1d..3bce5c46f35a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2562,6 +2562,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = sched_identity_set_pid(arg2, arg3);
 		break;
+#endif
+#ifdef CONFIG_CR_IO_URING
+	case PR_ENABLE_CR_IO_URING:
+		current->cr_io_uring_enabled = true;
+		break;
+	case PR_DISABLE_CR_IO_URING:
+		current->cr_io_uring_enabled = false;
+		break;
 #endif
 	default:
 		error = -EINVAL;
diff --git a/net/core/scm.c b/net/core/scm.c
index d09849cb60f0..66b4c8a8050f 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -106,8 +106,19 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 			return -EBADF;
 		/* don't allow io_uring files */
 		if (io_is_uring_fops(file)) {
+#ifdef CONFIG_CR_IO_URING
+			if (current->cr_io_uring_enabled) {
+				printk_ratelimited(KERN_WARNING
+					"Allow %s(%d) to send io_uring FD via SCM\n",
+					current->comm, task_pid_nr(current));
+			} else {
+				fput(file);
+				return -EINVAL;
+			}
+#else
 			fput(file);
 			return -EINVAL;
+#endif
 		}
 		*fpp++ = file;
 		fpl->count++;
-- 
Gitee

From b21f6221b60a19051075774c6b9657335d2dfb47 Mon Sep 17 00:00:00 2001
From: Bitao Hu
Date: Thu, 22 Jan 2026 15:30:01 +0800
Subject: [PATCH 2/2] anolis: configs: enable checkpoint/restore for io_uring

ANBZ: #30028

Enable CONFIG_CR_IO_URING to
support checkpoint and restore functionality for io_uring instances. Signed-off-by: Bitao Hu Reviewed-by: Liu Song --- anolis/configs/L2-OPTIONAL/default/CONFIG_CR_IO_URING | 1 + 1 file changed, 1 insertion(+) create mode 100644 anolis/configs/L2-OPTIONAL/default/CONFIG_CR_IO_URING diff --git a/anolis/configs/L2-OPTIONAL/default/CONFIG_CR_IO_URING b/anolis/configs/L2-OPTIONAL/default/CONFIG_CR_IO_URING new file mode 100644 index 000000000000..24eb5d1cbb66 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/default/CONFIG_CR_IO_URING @@ -0,0 +1 @@ +CONFIG_CR_IO_URING=y -- Gitee