diff --git a/anolis/configs/L2-OPTIONAL/default/CONFIG_CR_IO_URING b/anolis/configs/L2-OPTIONAL/default/CONFIG_CR_IO_URING new file mode 100644 index 0000000000000000000000000000000000000000..24eb5d1cbb6693c5577dab5bc04460ab88dc9673 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/default/CONFIG_CR_IO_URING @@ -0,0 +1 @@ +CONFIG_CR_IO_URING=y diff --git a/fs/Kconfig b/fs/Kconfig index 5e27f498e52a0e7814d9392308c3bd2ab21888e9..739ed6eecc198abcb761f055eaa85f9c2b6546a6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -222,6 +222,19 @@ config TMPFS_INODE64 If unsure, say N. +config CR_IO_URING + bool "io_uring checkpoint/restore support" + default n + help + This option enables checkpoint/restore support for io_uring. + Note that only basic io_uring functionality is supported. When + enabled, additional information is exposed in fdinfo, and + io_uring file descriptors may be passed via SCM_RIGHTS; both + behaviors are off by default and are toggled per task via prctl. + Users must carefully assess the associated risks before enabling it. + + If unsure, say N. 
+ config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ diff --git a/fs/exec.c b/fs/exec.c index 700534200f7ffde2f982313584ab08174dedf1dd..4d729f5c4833ecc14767c183ca9ee87397032e73 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1852,6 +1852,9 @@ static int bprm_execve(struct linux_binprm *bprm, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; +#ifdef CONFIG_CR_IO_URING + current->cr_io_uring_enabled = false; +#endif rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); diff --git a/include/linux/sched.h b/include/linux/sched.h index 59f3dbe59395d02dbc1f2fd04efe14bb10e3b5fc..c43d3acf5ecb620a0ec65b7bc68a79daa79c04e8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1522,7 +1522,11 @@ struct task_struct { #endif bool proxy_exec; +#ifdef CONFIG_CR_IO_URING + CK_KABI_USE(1, bool cr_io_uring_enabled) +#else CK_KABI_RESERVE(1) +#endif CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) CK_KABI_RESERVE(4) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index efeba34bc0c0f85eba7339b6a28cf38c0ef74f1c..8af1e20f57d51ce81c3e26d17e3a4edd8baa6165 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -263,4 +263,9 @@ struct prctl_mm_map { #define PR_GET_IDENTITY 1000 #define PR_SET_IDENTITY 1001 +#define PR_RESERVED0 1100 +#define PR_RESERVED1 1101 +#define PR_ENABLE_CR_IO_URING 1102 +#define PR_DISABLE_CR_IO_URING 1103 + #endif /* _LINUX_PRCTL_H */ diff --git a/init/init_task.c b/init/init_task.c index 7ed93be4682d98422c5974967e62c7e39512283e..5fbfa0cc7dcd127d7f06bf0de5c868821b21dead 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -216,6 +216,9 @@ struct task_struct init_task #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_CR_IO_URING + .cr_io_uring_enabled = false, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 
5481878aa55a2ede3d73ed7329e5f762f6c6d377..5e8963b53a8b0bc89706e2b7b152c8e8d11faeab 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -86,6 +86,9 @@ #include #include "../fs/internal.h" +#ifdef CONFIG_CR_IO_URING +#include "../fs/proc/internal.h" +#endif #include "io-wq.h" #define IORING_MAX_ENTRIES 32768 @@ -10311,6 +10314,33 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } #ifdef CONFIG_PROC_FS +#ifdef CONFIG_CR_IO_URING +#define IO_URING_FDINFO_MAPS_SIZE 128 +/* + * Append "START-END," for @vma to @buf at *@offset and advance *@offset + * past it. An entry that does not fit in the remaining space (including the + * terminating NUL) is dropped so that @buf is never overrun. + */ +static void format_io_uring_mapping(char *buf, struct vm_area_struct *vma, int *offset, int size) +{ + int room = size - *offset; + int len; + + if (room <= 0) + return; + + /* Let snprintf() measure the entry; "%08lx" pads short addresses. */ + len = snprintf(buf + *offset, room, "%08lx-%08lx,", vma->vm_start, vma->vm_end); + if (len >= room) { + /* Truncated: cut the partial entry off again. */ + buf[*offset] = '\0'; + return; + } + + *offset += len; +} +#endif + static int io_uring_show_cred(struct seq_file *m, unsigned int id, const struct cred *cred) { @@ -10345,6 +10375,17 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id, static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { +#ifdef CONFIG_CR_IO_URING + struct io_overflow_cqe *ocqe; + struct task_struct *task; + struct vm_area_struct *vma; + struct mm_struct *mm; + char sq_mapping[IO_URING_FDINFO_MAPS_SIZE]; + char cq_mapping[IO_URING_FDINFO_MAPS_SIZE]; + char sqes_mapping[IO_URING_FDINFO_MAPS_SIZE]; + int sq_mapping_offset = 0, cq_mapping_offset = 0, sqes_mapping_offset = 0; + +#endif struct io_rings *r = ctx->rings; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; unsigned int sq_head = READ_ONCE(r->sq.head); @@ -10481,6 +10522,80 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) req->task->task_works != NULL); } 
spin_unlock(&ctx->completion_lock); + +#ifdef CONFIG_CR_IO_URING + if (!current->cr_io_uring_enabled) + goto out; + + seq_puts(m, "CqOverflowList:\n"); + spin_lock(&ctx->completion_lock); + list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { + struct io_uring_cqe *cqe = &ocqe->cqe; + + seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", + cqe->user_data, cqe->res, cqe->flags); + } + spin_unlock(&ctx->completion_lock); + + seq_printf(m, "Locked: %d\n", has_lock ? 1 : 0); + seq_printf(m, "SqThreadIdle: %u\n", ctx->sq_thread_idle); + seq_printf(m, "SetupFlags: 0x%x\n", ctx->flags); + seq_printf(m, "SqEntries: %u\n", ctx->sq_entries); + seq_printf(m, "CqEntries: %u\n", ctx->cq_entries); + seq_printf(m, "SqOffArray: %u\n", (u32)((char *)ctx->sq_array - (char *)ctx->rings)); + + task = get_proc_task(m->private); + if (!task) + goto out; + /* Pin the mm; a bare task->mm can be freed once the task exits. */ + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + mmap_read_lock(mm); + memset(sq_mapping, 0, sizeof(sq_mapping)); + memset(cq_mapping, 0, sizeof(cq_mapping)); + memset(sqes_mapping, 0, sizeof(sqes_mapping)); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file && vma->vm_file->private_data == ctx) { + unsigned long long pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; + + switch (pgoff) { + case IORING_OFF_SQ_RING: + format_io_uring_mapping(sq_mapping, vma, &sq_mapping_offset, + IO_URING_FDINFO_MAPS_SIZE); + break; + case IORING_OFF_CQ_RING: + format_io_uring_mapping(cq_mapping, vma, &cq_mapping_offset, + IO_URING_FDINFO_MAPS_SIZE); + break; + case IORING_OFF_SQES: + format_io_uring_mapping(sqes_mapping, vma, &sqes_mapping_offset, + IO_URING_FDINFO_MAPS_SIZE); + break; + default: + break; + } + } + } + + if (sq_mapping[0] || cq_mapping[0] || sqes_mapping[0]) { + if (sq_mapping_offset) + sq_mapping[sq_mapping_offset - 1] = '\0'; + if (cq_mapping_offset) + cq_mapping[cq_mapping_offset - 1] = '\0'; + if (sqes_mapping_offset) + sqes_mapping[sqes_mapping_offset - 1] = '\0'; + seq_puts(m, "Mappings:\n"); + seq_printf(m, " 
SqRingMapping: %s\n", sq_mapping); + seq_printf(m, " CqRingMapping: %s\n", cq_mapping); + seq_printf(m, " SQEsMapping: %s\n", sqes_mapping); + } + + mmap_read_unlock(mm); + mmput(mm); +out: +#endif if (has_lock) mutex_unlock(&ctx->uring_lock); } diff --git a/kernel/fork.c b/kernel/fork.c index 3b1f2b6a74285fc468dfdf7a847b075157247b5c..c5274eaa1a1e19f3a08c131c4133bc63677deaf4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1009,6 +1009,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->reported_split_lock = 0; #endif +#ifdef CONFIG_CR_IO_URING + tsk->cr_io_uring_enabled = false; +#endif + return tsk; free_stack: diff --git a/kernel/sys.c b/kernel/sys.c index 93e02a2e1c1de48d5e0946aceda8791a2ed3f745..3bce5c46f35adc476404da27e8599f5c362e286f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2562,6 +2562,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = sched_identity_set_pid(arg2, arg3); break; +#endif +#ifdef CONFIG_CR_IO_URING + case PR_ENABLE_CR_IO_URING: + current->cr_io_uring_enabled = true; + break; + case PR_DISABLE_CR_IO_URING: + current->cr_io_uring_enabled = false; + break; #endif default: error = -EINVAL; diff --git a/net/core/scm.c b/net/core/scm.c index d09849cb60f08b96ba518fea99f4b5d1a1fb9dde..66b4c8a8050ffc577a4b1892c2aca2f2ddfd6ce9 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -106,8 +106,19 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -EBADF; /* don't allow io_uring files */ if (io_is_uring_fops(file)) { +#ifdef CONFIG_CR_IO_URING + if (current->cr_io_uring_enabled) { + printk_ratelimited(KERN_WARNING + "Allow %s(%d) to send io_uring FD via SCM\n", + current->comm, task_pid_nr(current)); + } else { + fput(file); + return -EINVAL; + } +#else fput(file); return -EINVAL; +#endif } *fpp++ = file; fpl->count++;