/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
 */

#include <linux/hashtable.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>

enum userfaultfd_state {
        UFFD_STATE_WAIT_API,
        UFFD_STATE_RUNNING,
};

struct userfaultfd_ctx {
        /* pseudo fd refcounting */
        atomic_t refcount;
        /* waitqueue head for the userfaultfd page faults */
        wait_queue_head_t fault_wqh;
        /* waitqueue head for the pseudo fd to wakeup poll/read */
        wait_queue_head_t fd_wqh;
        /* userfaultfd syscall flags */
        unsigned int flags;
        /* state machine */
        enum userfaultfd_state state;
        /* released */
        bool released;
        /* mm with one or more vmas attached to this userfaultfd_ctx */
        struct mm_struct *mm;
};

struct userfaultfd_wait_queue {
        struct uffd_msg msg;
        wait_queue_t wq;
        bool pending;
        struct userfaultfd_ctx *ctx;
};

struct userfaultfd_wake_range {
        unsigned long start;
        unsigned long len;
};

static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
                                     int wake_flags, void *key)
{
        struct userfaultfd_wake_range *range = key;
        int ret;
        struct userfaultfd_wait_queue *uwq;
        unsigned long start, len;

        uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
        ret = 0;
        /* don't wake the pending ones, to avoid making reads block */
        if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released))
                goto out;
        /* len == 0 means wake all */
        start = range->start;
        len = range->len;
        if (len && (start > uwq->msg.arg.pagefault.address ||
                    start + len <= uwq->msg.arg.pagefault.address))
                goto out;
        ret = wake_up_state(wq->private, mode);
        if (ret)
                /*
                 * Wake only once, autoremove behavior.
                 *
                 * After the effect of list_del_init is visible to the
                 * other CPUs, the waitqueue may disappear from under
                 * us, see the !list_empty_careful() in
                 * handle_userfault(). try_to_wake_up() has an
                 * implicit smp_mb__before_spinlock, and the
                 * wq->private is read before calling the extern
                 * function "wake_up_state" (which in turn calls
                 * try_to_wake_up). While a plain spin_lock/spin_unlock
                 * wouldn't be enough, the smp_mb__before_spinlock is
                 * enough to avoid an explicit smp_mb() here.
                 */
                list_del_init(&wq->task_list);
out:
        return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 *
 * The caller must already hold a reference, i.e. the refcount must not
 * be zero, otherwise this BUG()s.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
        if (!atomic_inc_not_zero(&ctx->refcount))
                BUG();
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired
 * with userfaultfd_ctx_get().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
        if (atomic_dec_and_test(&ctx->refcount)) {
                VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
                VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
                VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
                VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
                mmput(ctx->mm);
                kfree(ctx);
        }
}

static inline void msg_init(struct uffd_msg *msg)
{
        BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
        /*
         * Must use memset to zero out the padding or kernel data is
         * leaked to userland.
         */
        memset(msg, 0, sizeof(struct uffd_msg));
}

static inline struct uffd_msg userfault_msg(unsigned long address,
                                            unsigned int flags,
                                            unsigned long reason)
{
        struct uffd_msg msg;
        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;
        msg.arg.pagefault.address = address;
        if (flags & FAULT_FLAG_WRITE)
                /*
                 * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
                 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
                 * is not set in a UFFD_EVENT_PAGEFAULT, it means it
                 * was a read fault; if it is set, it means it was a
                 * write fault.
                 */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
        if (reason & VM_UFFD_WP)
                /*
                 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
                 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP is
                 * not set in a UFFD_EVENT_PAGEFAULT, it means it was
                 * a missing fault; if it is set, it means it was a
                 * write protect fault.
                 */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
        return msg;
}

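/*
 * Example (userland-side sketch, not part of this file): decoding the
 * message built above, assuming the uapi layout of struct uffd_msg in
 * include/uapi/linux/userfaultfd.h; "uffd" is the descriptor returned
 * by the userfaultfd syscall:
 *
 *      struct uffd_msg msg;
 *
 *      if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
 *          msg.event == UFFD_EVENT_PAGEFAULT) {
 *              unsigned long addr = msg.arg.pagefault.address;
 *              int write = !!(msg.arg.pagefault.flags &
 *                             UFFD_PAGEFAULT_FLAG_WRITE);
 *              ... resolve the fault at addr, then UFFDIO_WAKE its page ...
 *      }
 */
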
/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there is a
 * fatal signal pending, and the mmap_sem must be released before
 * returning it.
 */
int handle_userfault(struct vm_area_struct *vma, unsigned long address,
                     unsigned int flags, unsigned long reason)
{
        struct mm_struct *mm = vma->vm_mm;
        struct userfaultfd_ctx *ctx;
        struct userfaultfd_wait_queue uwq;

        BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx)
                return VM_FAULT_SIGBUS;

        BUG_ON(ctx->mm != mm);

        VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
        VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));

        /*
         * If it's already released don't get it. This avoids looping
         * in __get_user_pages if userfaultfd_release waits on the
         * caller of handle_userfault to release the mmap_sem.
         */
        if (unlikely(ACCESS_ONCE(ctx->released)))
                return VM_FAULT_SIGBUS;

        /*
         * Check that we can return VM_FAULT_RETRY.
         *
         * NOTE: it should become possible to return VM_FAULT_RETRY
         * even if FAULT_FLAG_TRIED is set without leading to gup()
         * -EBUSY failures, if the userfaultfd is to be extended for
         * VM_UFFD_WP tracking and we intend to arm the userfault
         * without first stopping userland access to the memory. For
         * VM_UFFD_MISSING userfaults this is enough for now.
         */
        if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
                /*
                 * Validate the invariant that nowait must allow retry
                 * to be sure not to return SIGBUS erroneously on
                 * nowait invocations.
                 */
                BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
                if (printk_ratelimit()) {
                        printk(KERN_WARNING
                               "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
                        dump_stack();
                }
#endif
                return VM_FAULT_SIGBUS;
        }

        /*
         * Handle nowait, not much to do other than tell it to retry
         * and wait.
         */
        if (flags & FAULT_FLAG_RETRY_NOWAIT)
                return VM_FAULT_RETRY;

        /* take the reference before dropping the mmap_sem */
        userfaultfd_ctx_get(ctx);

        /* be gentle and immediately relinquish the mmap_sem */
        up_read(&mm->mmap_sem);

        init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
        uwq.wq.private = current;
        uwq.msg = userfault_msg(address, flags, reason);
        uwq.pending = true;
        uwq.ctx = ctx;

        spin_lock(&ctx->fault_wqh.lock);
        /*
         * After the __add_wait_queue the uwq is visible to userland
         * through poll/read().
         */
        __add_wait_queue(&ctx->fault_wqh, &uwq.wq);
        for (;;) {
                set_current_state(TASK_KILLABLE);
                if (!uwq.pending || ACCESS_ONCE(ctx->released) ||
                    fatal_signal_pending(current))
                        break;
                spin_unlock(&ctx->fault_wqh.lock);

                wake_up_poll(&ctx->fd_wqh, POLLIN);
                schedule();

                spin_lock(&ctx->fault_wqh.lock);
        }
        __remove_wait_queue(&ctx->fault_wqh, &uwq.wq);
        __set_current_state(TASK_RUNNING);
        spin_unlock(&ctx->fault_wqh.lock);

        /*
         * ctx may go away after this if the userfault pseudo fd is
         * already released.
         */
        userfaultfd_ctx_put(ctx);

        return VM_FAULT_RETRY;
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
        struct userfaultfd_ctx *ctx = file->private_data;
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev;
        /* len == 0 means wake all */
        struct userfaultfd_wake_range range = { .len = 0, };
        unsigned long new_flags;

        ACCESS_ONCE(ctx->released) = true;

        /*
         * Flush page faults out of all CPUs. NOTE: all page faults
         * must be retried without returning VM_FAULT_SIGBUS if
         * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
         * changes while handle_userfault released the mmap_sem. So
         * it's critical that released is set to true (above), before
         * taking the mmap_sem for writing.
         */
        down_write(&mm->mmap_sem);
        prev = NULL;
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                cond_resched();
                BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
                       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
                if (vma->vm_userfaultfd_ctx.ctx != ctx) {
                        prev = vma;
                        continue;
                }
                new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
                prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
                                 new_flags, vma->anon_vma,
                                 vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
                                 NULL_VM_UFFD_CTX);
                if (prev)
                        vma = prev;
                else
                        prev = vma;
                vma->vm_flags = new_flags;
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
        }
        up_write(&mm->mmap_sem);

        /*
         * After no new page faults can wait on this fault_wqh, flush
         * the last page faults that may have been already waiting on
         * the fault_wqh.
         */
        spin_lock(&ctx->fault_wqh.lock);
        __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
        spin_unlock(&ctx->fault_wqh.lock);

        wake_up_poll(&ctx->fd_wqh, POLLHUP);
        userfaultfd_ctx_put(ctx);
        return 0;
}

/* fault_wqh.lock must be held by the caller */
static inline unsigned int find_userfault(struct userfaultfd_ctx *ctx,
                                          struct userfaultfd_wait_queue **uwq)
{
        wait_queue_t *wq;
        struct userfaultfd_wait_queue *_uwq;
        unsigned int ret = 0;

        VM_BUG_ON(!spin_is_locked(&ctx->fault_wqh.lock));

        list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
                _uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
                if (_uwq->pending) {
                        ret = POLLIN;
                        if (!uwq)
                                /*
                                 * If there's at least one pending and
                                 * we don't care which one it is,
                                 * break immediately and leverage the
                                 * efficiency of the LIFO walk.
                                 */
                                break;
                        /*
                         * If we need to find which one is pending we
                         * keep walking until we find the first not
                         * pending one, so we read() them in FIFO order.
                         */
                        *uwq = _uwq;
                } else
                        /*
                         * Break the loop at the first not pending
                         * one: there cannot be pending userfaults
                         * after the first not pending one, because
                         * all new pending ones are inserted at the
                         * head and we walk the list in LIFO order.
                         */
                        break;
        }

        return ret;
}

static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
{
        struct userfaultfd_ctx *ctx = file->private_data;
        unsigned int ret;

        poll_wait(file, &ctx->fd_wqh, wait);

        switch (ctx->state) {
        case UFFD_STATE_WAIT_API:
                return POLLERR;
        case UFFD_STATE_RUNNING:
                spin_lock(&ctx->fault_wqh.lock);
                ret = find_userfault(ctx, NULL);
                spin_unlock(&ctx->fault_wqh.lock);
                return ret;
        default:
                BUG();
        }
}

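/*
 * Example (userland-side sketch, not part of this file): waiting for
 * the POLLIN reported by userfaultfd_poll() above before reading a
 * message; "uffd" is the userfaultfd descriptor:
 *
 *      struct pollfd pollfd = { .fd = uffd, .events = POLLIN };
 *      struct uffd_msg msg;
 *
 *      if (poll(&pollfd, 1, -1) > 0 && (pollfd.revents & POLLIN))
 *              read(uffd, &msg, sizeof(msg));
 *
 * Note that POLLERR is reported until the UFFDIO_API handshake has
 * moved the context to UFFD_STATE_RUNNING.
 */
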
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                                    struct uffd_msg *msg)
{
        ssize_t ret;
        DECLARE_WAITQUEUE(wait, current);
        struct userfaultfd_wait_queue *uwq = NULL;

        /* always take the fd_wqh lock before the fault_wqh lock */
        spin_lock(&ctx->fd_wqh.lock);
        __add_wait_queue(&ctx->fd_wqh, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                spin_lock(&ctx->fault_wqh.lock);
                if (find_userfault(ctx, &uwq)) {
                        /*
                         * The fault_wqh.lock prevents the uwq from
                         * disappearing from under us.
                         */
                        uwq->pending = false;
                        /* careful to always initialize msg if ret == 0 */
                        *msg = uwq->msg;
                        spin_unlock(&ctx->fault_wqh.lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&ctx->fault_wqh.lock);
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                if (no_wait) {
                        ret = -EAGAIN;
                        break;
                }
                spin_unlock(&ctx->fd_wqh.lock);
                schedule();
                spin_lock(&ctx->fd_wqh.lock);
        }
        __remove_wait_queue(&ctx->fd_wqh, &wait);
        __set_current_state(TASK_RUNNING);
        spin_unlock(&ctx->fd_wqh.lock);

        return ret;
}

static ssize_t userfaultfd_read(struct file *file, char __user *buf,
                                size_t count, loff_t *ppos)
{
        struct userfaultfd_ctx *ctx = file->private_data;
        ssize_t _ret, ret = 0;
        struct uffd_msg msg;
        int no_wait = file->f_flags & O_NONBLOCK;

        if (ctx->state == UFFD_STATE_WAIT_API)
                return -EINVAL;
        BUG_ON(ctx->state != UFFD_STATE_RUNNING);

        for (;;) {
                if (count < sizeof(msg))
                        return ret ? ret : -EINVAL;
                _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
                if (_ret < 0)
                        return ret ? ret : _ret;
                if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
                        return ret ? ret : -EFAULT;
                ret += sizeof(msg);
                buf += sizeof(msg);
                count -= sizeof(msg);
                /*
                 * Allow reading more than one fault at a time but only
                 * block if waiting for the very first one.
                 */
                no_wait = O_NONBLOCK;
        }
}

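/*
 * Example (userland-side sketch, not part of this file): draining
 * several pending faults with a single read(), exploiting the loop
 * above. Only whole struct uffd_msg slots are ever filled, so a
 * buffer that is not a multiple of the message size simply leaves its
 * tail unused; handle_msg() is a hypothetical helper:
 *
 *      struct uffd_msg msgs[16];
 *      size_t i;
 *      ssize_t bytes = read(uffd, msgs, sizeof(msgs));
 *
 *      if (bytes > 0)
 *              for (i = 0; i < bytes / sizeof(msgs[0]); i++)
 *                      handle_msg(&msgs[i]);
 *
 * The read blocks only for the first message (unless O_NONBLOCK was
 * used) and returns whatever is immediately available after that.
 */
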
static void __wake_userfault(struct userfaultfd_ctx *ctx,
                             struct userfaultfd_wake_range *range)
{
        unsigned long start, end;

        start = range->start;
        end = range->start + range->len;

        spin_lock(&ctx->fault_wqh.lock);
        /* wake all in the range and autoremove */
        __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
        spin_unlock(&ctx->fault_wqh.lock);
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
                                           struct userfaultfd_wake_range *range)
{
        /*
         * To be sure waitqueue_active() is not reordered by the CPU
         * before the pagetable update, use an explicit SMP memory
         * barrier here. PT lock release or up_read(mmap_sem) only
         * have release semantics, which can still allow the
         * waitqueue_active() to be reordered before the pte update.
         */
        smp_mb();

        /*
         * Use waitqueue_active because it's very frequent to
         * change the address space atomically even if there are no
         * userfaults yet. So we take the spinlock only when we're
         * sure we have userfaults to wake.
         */
        if (waitqueue_active(&ctx->fault_wqh))
                __wake_userfault(ctx, range);
}

static __always_inline int validate_range(struct mm_struct *mm,
                                          __u64 start, __u64 len)
{
        __u64 task_size = mm->task_size;

        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (len & ~PAGE_MASK)
                return -EINVAL;
        if (!len)
                return -EINVAL;
        if (start < mmap_min_addr)
                return -EINVAL;
        if (start >= task_size)
                return -EINVAL;
        if (len > task_size - start)
                return -EINVAL;
        return 0;
}

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                                unsigned long arg)
{
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev, *cur;
        int ret;
        struct uffdio_register uffdio_register;
        struct uffdio_register __user *user_uffdio_register;
        unsigned long vm_flags, new_flags;
        bool found;
        unsigned long start, end, vma_end;

        user_uffdio_register = (struct uffdio_register __user *) arg;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_register, user_uffdio_register,
                           sizeof(uffdio_register)-sizeof(__u64)))
                goto out;

        ret = -EINVAL;
        if (!uffdio_register.mode)
                goto out;
        if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
                                     UFFDIO_REGISTER_MODE_WP))
                goto out;
        vm_flags = 0;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
                vm_flags |= VM_UFFD_MISSING;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
                vm_flags |= VM_UFFD_WP;
                /*
                 * FIXME: remove the below error constraint by
                 * implementing the wprotect tracking mode.
                 */
                ret = -EINVAL;
                goto out;
        }

        ret = validate_range(mm, uffdio_register.range.start,
                             uffdio_register.range.len);
        if (ret)
                goto out;

        start = uffdio_register.range.start;
        end = start + uffdio_register.range.len;

        down_write(&mm->mmap_sem);
        vma = find_vma_prev(mm, start, &prev);

        ret = -ENOMEM;
        if (!vma)
                goto out_unlock;

        /* check that there's at least one vma in the range */
        ret = -EINVAL;
        if (vma->vm_start >= end)
                goto out_unlock;

        /*
         * Search for not compatible vmas.
         *
         * FIXME: this shall be relaxed later so that it doesn't fail
         * on tmpfs backed vmas (in addition to the current allowance
         * on anonymous vmas).
         */
        found = false;
        for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
                cond_resched();

                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
                       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

                /* check not compatible vmas */
                ret = -EINVAL;
                if (cur->vm_ops)
                        goto out_unlock;

                /*
                 * Check that this vma isn't already owned by a
                 * different userfaultfd. We can't allow more than one
                 * userfaultfd to own a single vma simultaneously or we
                 * wouldn't know which one to deliver the userfaults to.
                 */
                ret = -EBUSY;
                if (cur->vm_userfaultfd_ctx.ctx &&
                    cur->vm_userfaultfd_ctx.ctx != ctx)
                        goto out_unlock;

                found = true;
        }
        BUG_ON(!found);

        if (vma->vm_start < start)
                prev = vma;

        ret = 0;
        do {
                cond_resched();

                BUG_ON(vma->vm_ops);
                BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                       vma->vm_userfaultfd_ctx.ctx != ctx);

                /*
                 * Nothing to do: this vma is already registered into this
                 * userfaultfd and with the right tracking mode too.
                 */
                if (vma->vm_userfaultfd_ctx.ctx == ctx &&
                    (vma->vm_flags & vm_flags) == vm_flags)
                        goto skip;

                if (vma->vm_start > start)
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);

                new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
                                 ((struct vm_userfaultfd_ctx){ ctx }));
                if (prev) {
                        vma = prev;
                        goto next;
                }
                if (vma->vm_start < start) {
                        ret = split_vma(mm, vma, start, 1);
                        if (ret)
                                break;
                }
                if (vma->vm_end > end) {
                        ret = split_vma(mm, vma, end, 0);
                        if (ret)
                                break;
                }
        next:
                /*
                 * In the vma_merge() successful mprotect-like case 8:
                 * the next vma was merged into the current one and
                 * the current one has not been updated yet.
                 */
                vma->vm_flags = new_flags;
                vma->vm_userfaultfd_ctx.ctx = ctx;

        skip:
                prev = vma;
                start = vma->vm_end;
                vma = vma->vm_next;
        } while (vma && vma->vm_start < end);
out_unlock:
        up_write(&mm->mmap_sem);
        if (!ret) {
                /*
                 * Now that we scanned all vmas we can already tell
                 * userland which ioctl methods are guaranteed to
                 * succeed on this range.
                 */
                if (put_user(UFFD_API_RANGE_IOCTLS,
                             &user_uffdio_register->ioctls))
                        ret = -EFAULT;
        }
out:
        return ret;
}

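/*
 * Example (userland-side sketch, not part of this file): registering an
 * anonymous mapping for missing-page tracking, assuming the uapi layout
 * of struct uffdio_register in include/uapi/linux/userfaultfd.h; "area"
 * and "len" are hypothetical and must be page aligned or
 * validate_range() above rejects the call with -EINVAL:
 *
 *      struct uffdio_register reg = {
 *              .range = { .start = (unsigned long) area, .len = len },
 *              .mode  = UFFDIO_REGISTER_MODE_MISSING,
 *      };
 *
 *      if (ioctl(uffd, UFFDIO_REGISTER, &reg) == 0)
 *              ... reg.ioctls now reports UFFD_API_RANGE_IOCTLS ...
 *
 * Only anonymous vmas (no vm_ops) are accepted by this version, and
 * UFFDIO_REGISTER_MODE_WP is still rejected with -EINVAL.
 */
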
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                                  unsigned long arg)
{
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev, *cur;
        int ret;
        struct uffdio_range uffdio_unregister;
        unsigned long new_flags;
        bool found;
        unsigned long start, end, vma_end;
        const void __user *buf = (void __user *)arg;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
                goto out;

        ret = validate_range(mm, uffdio_unregister.start,
                             uffdio_unregister.len);
        if (ret)
                goto out;

        start = uffdio_unregister.start;
        end = start + uffdio_unregister.len;

        down_write(&mm->mmap_sem);
        vma = find_vma_prev(mm, start, &prev);

        ret = -ENOMEM;
        if (!vma)
                goto out_unlock;

        /* check that there's at least one vma in the range */
        ret = -EINVAL;
        if (vma->vm_start >= end)
                goto out_unlock;

        /*
         * Search for not compatible vmas.
         *
         * FIXME: this shall be relaxed later so that it doesn't fail
         * on tmpfs backed vmas (in addition to the current allowance
         * on anonymous vmas).
         */
        found = false;
        ret = -EINVAL;
        for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
                cond_resched();

                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
                       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

                /*
                 * Check not compatible vmas, not strictly required
                 * here as not compatible vmas cannot have a
                 * userfaultfd_ctx registered on them, but this
                 * provides for more strict behavior to notice
                 * unregistration errors.
                 */
                if (cur->vm_ops)
                        goto out_unlock;

                found = true;
        }
        BUG_ON(!found);

        if (vma->vm_start < start)
                prev = vma;

        ret = 0;
        do {
                cond_resched();

                BUG_ON(vma->vm_ops);

                /*
                 * Nothing to do: this vma is not registered with any
                 * userfaultfd, so there is nothing to unregister on it.
                 */
                if (!vma->vm_userfaultfd_ctx.ctx)
                        goto skip;

                if (vma->vm_start > start)
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);

                new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
                                 NULL_VM_UFFD_CTX);
                if (prev) {
                        vma = prev;
                        goto next;
                }
                if (vma->vm_start < start) {
                        ret = split_vma(mm, vma, start, 1);
                        if (ret)
                                break;
                }
                if (vma->vm_end > end) {
                        ret = split_vma(mm, vma, end, 0);
                        if (ret)
                                break;
                }
        next:
                /*
                 * In the vma_merge() successful mprotect-like case 8:
                 * the next vma was merged into the current one and
                 * the current one has not been updated yet.
                 */
                vma->vm_flags = new_flags;
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

        skip:
                prev = vma;
                start = vma->vm_end;
                vma = vma->vm_next;
        } while (vma && vma->vm_start < end);
out_unlock:
        up_write(&mm->mmap_sem);
out:
        return ret;
}

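/*
 * Example (userland-side sketch, not part of this file): undoing the
 * registration above on the same page aligned range, using the plain
 * struct uffdio_range from include/uapi/linux/userfaultfd.h:
 *
 *      struct uffdio_range range = {
 *              .start = (unsigned long) area,
 *              .len   = len,
 *      };
 *
 *      ioctl(uffd, UFFDIO_UNREGISTER, &range);
 */
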
/*
 * This is mostly needed to re-wake those userfaults that were still
 * pending when userland woke them up the first time. We deliberately
 * don't wake the pending ones from the wake function, to avoid making
 * blocking reads block, or non blocking reads return -EAGAIN, when
 * used with POLLIN, which would leave userland in doubt about why
 * POLLIN wasn't reliable.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        int ret;
        struct uffdio_range uffdio_wake;
        struct userfaultfd_wake_range range;
        const void __user *buf = (void __user *)arg;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
                goto out;

        ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
        if (ret)
                goto out;

        range.start = uffdio_wake.start;
        range.len = uffdio_wake.len;

        /*
         * len == 0 means wake all and we don't want to wake all here,
         * so check it again to be sure.
         */
        VM_BUG_ON(!range.len);

        wake_userfault(ctx, &range);
        ret = 0;

out:
        return ret;
}

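/*
 * Example (userland-side sketch, not part of this file): after the
 * monitor has resolved a fault at "addr", it re-wakes any thread still
 * blocked in handle_userfault() on that page. "page_size" is assumed
 * to come from sysconf(_SC_PAGESIZE):
 *
 *      struct uffdio_range range = {
 *              .start = addr & ~(page_size - 1),
 *              .len   = page_size,
 *      };
 *
 *      ioctl(uffd, UFFDIO_WAKE, &range);
 *
 * A zero length is rejected by validate_range(), so "wake all" is not
 * reachable through this ioctl.
 */
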
/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for that API
 * version, or -EINVAL if unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
                           unsigned long arg)
{
        struct uffdio_api uffdio_api;
        void __user *buf = (void __user *)arg;
        int ret;

        ret = -EINVAL;
        if (ctx->state != UFFD_STATE_WAIT_API)
                goto out;
        ret = -EFAULT;
        if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
                goto out;
        if (uffdio_api.api != UFFD_API || uffdio_api.features) {
                memset(&uffdio_api, 0, sizeof(uffdio_api));
                if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                        goto out;
                ret = -EINVAL;
                goto out;
        }
        uffdio_api.features = UFFD_API_FEATURES;
        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                goto out;
        ctx->state = UFFD_STATE_RUNNING;
        ret = 0;
out:
        return ret;
}

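/*
 * Example (userland-side sketch, not part of this file): the handshake
 * that moves the context from UFFD_STATE_WAIT_API to
 * UFFD_STATE_RUNNING. Requesting any feature bit is rejected by this
 * version, so features is left zero:
 *
 *      struct uffdio_api api = { .api = UFFD_API, .features = 0 };
 *
 *      if (ioctl(uffd, UFFDIO_API, &api) == 0)
 *              ... api.features and api.ioctls now report what the
 *                  kernel supports ...
 */
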
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
                              unsigned long arg)
{
        int ret = -EINVAL;
        struct userfaultfd_ctx *ctx = file->private_data;

        switch (cmd) {
        case UFFDIO_API:
                ret = userfaultfd_api(ctx, arg);
                break;
        case UFFDIO_REGISTER:
                ret = userfaultfd_register(ctx, arg);
                break;
        case UFFDIO_UNREGISTER:
                ret = userfaultfd_unregister(ctx, arg);
                break;
        case UFFDIO_WAKE:
                ret = userfaultfd_wake(ctx, arg);
                break;
        }
        return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct userfaultfd_ctx *ctx = f->private_data;
        wait_queue_t *wq;
        struct userfaultfd_wait_queue *uwq;
        unsigned long pending = 0, total = 0;

        spin_lock(&ctx->fault_wqh.lock);
        list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
                uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
                if (uwq->pending)
                        pending++;
                total++;
        }
        spin_unlock(&ctx->fault_wqh.lock);

        /*
         * If more protocols are added, they will all be shown here,
         * separated by a space. Like this:
         *      protocols: aa:... bb:...
         */
        seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
                   pending, total, UFFD_API, UFFD_API_FEATURES,
                   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = userfaultfd_show_fdinfo,
#endif
        .release        = userfaultfd_release,
        .poll           = userfaultfd_poll,
        .read           = userfaultfd_read,
        .unlocked_ioctl = userfaultfd_ioctl,
        .compat_ioctl   = userfaultfd_ioctl,
        .llseek         = noop_llseek,
};

/**
 * userfaultfd_file_create - Creates a userfaultfd file pointer.
 * @flags: Flags for the userfaultfd file.
 *
 * This function creates a userfaultfd file pointer, without installing
 * it into the fd table. This is useful when the userfaultfd file is
 * used during the initialization of data structures that require
 * extra setup after the userfaultfd creation. So the userfaultfd
 * creation is split into the file pointer creation phase, and the
 * file descriptor installation phase. In this way races with
 * userspace closing the newly installed file descriptor can be
 * avoided. Returns a userfaultfd file pointer, or a proper error
 * pointer.
 */
static struct file *userfaultfd_file_create(int flags)
{
        struct file *file;
        struct userfaultfd_ctx *ctx;

        BUG_ON(!current->mm);

        /* Check the UFFD_* constants for consistency. */
        BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

        file = ERR_PTR(-EINVAL);
        if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
                goto out;

        file = ERR_PTR(-ENOMEM);
        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                goto out;

        atomic_set(&ctx->refcount, 1);
        init_waitqueue_head(&ctx->fault_wqh);
        init_waitqueue_head(&ctx->fd_wqh);
        ctx->flags = flags;
        ctx->state = UFFD_STATE_WAIT_API;
        ctx->released = false;
        ctx->mm = current->mm;
        /* prevent the mm struct from being freed */
        atomic_inc(&ctx->mm->mm_users);

        file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
                                  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
        if (IS_ERR(file))
                kfree(ctx);
out:
        return file;
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
        int fd, error;
        struct file *file;

        error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
        if (error < 0)
                return error;
        fd = error;

        file = userfaultfd_file_create(flags);
        if (IS_ERR(file)) {
                error = PTR_ERR(file);
                goto err_put_unused_fd;
        }
        fd_install(fd, file);

        return fd;

err_put_unused_fd:
        put_unused_fd(fd);

        return error;
}
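
/*
 * Example (userland-side sketch, not part of this file): the minimal
 * lifecycle gluing the pieces above together. Error handling is
 * omitted, "area"/"len" are hypothetical and page aligned, and
 * __NR_userfaultfd is assumed to be wired up for the architecture:
 *
 *      int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *      struct uffdio_api api = { .api = UFFD_API };
 *      struct uffdio_register reg = {
 *              .range = { .start = (unsigned long) area, .len = len },
 *              .mode  = UFFDIO_REGISTER_MODE_MISSING,
 *      };
 *
 *      ioctl(uffd, UFFDIO_API, &api);
 *      ioctl(uffd, UFFDIO_REGISTER, &reg);
 *
 * A monitor thread then poll()s for POLLIN, read()s struct uffd_msg
 * events, resolves the missing pages by some external means and
 * finally issues UFFDIO_WAKE on the faulting range, at which point
 * handle_userfault() returns VM_FAULT_RETRY and the fault is retried.
 */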