/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

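/*
 * Per-umem MMU notifier accounting.
 *
 * notifiers_count counts the invalidation ranges currently active on this
 * umem, and notifiers_seq is bumped every time an invalidation finishes.
 * A driver page-fault handler samples notifiers_seq, calls
 * ib_umem_odp_map_dma_pages(), and then, while holding umem_mutex, uses
 * ib_umem_mmu_notifier_retry() to detect a racing invalidation before it
 * installs the mapping in the device page tables.  Roughly (illustrative
 * sketch only; the real code lives in the low-level driver):
 *
 *	current_seq = umem->odp_data->notifiers_seq;
 *	npages = ib_umem_odp_map_dma_pages(umem, va, len, access, current_seq);
 *	mutex_lock(&umem->odp_data->umem_mutex);
 *	if (!ib_umem_mmu_notifier_retry(umem, current_seq))
 *		... update the device page tables ...
 *	mutex_unlock(&umem->odp_data->umem_mutex);
 */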
static void ib_umem_notifier_start_account(struct ib_umem *item)
{
	mutex_lock(&item->odp_data->umem_mutex);

	/* Only update private counters for this umem if it has them.
	 * Otherwise skip it. All page faults will be delayed for this umem. */
	if (item->odp_data->mn_counters_active) {
		int notifiers_count = item->odp_data->notifiers_count++;

		if (notifiers_count == 0)
			/* Initialize the completion object for waiting on
			 * notifiers. Since notifier_count is zero, no one
			 * should be waiting right now. */
			reinit_completion(&item->odp_data->notifier_completion);
	}
	mutex_unlock(&item->odp_data->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem *item)
{
	mutex_lock(&item->odp_data->umem_mutex);

	/* Only update private counters for this umem if it has them.
	 * Otherwise skip it. All page faults will be delayed for this umem. */
	if (item->odp_data->mn_counters_active) {
		/*
		 * This sequence increase notifies the QP page-fault handler
		 * that the page it is about to map into the device page
		 * tables could already have been freed.
		 */
		++item->odp_data->notifiers_seq;
		if (--item->odp_data->notifiers_count == 0)
			complete_all(&item->odp_data->notifier_completion);
	}
	mutex_unlock(&item->odp_data->umem_mutex);
}

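/*
 * Context-wide notifier accounting: a umem that is registered while an
 * invalidation is already in flight cannot safely start using its private
 * counters, so it is parked on the context's no_private_counters list.
 * Once the context-wide notifier_count drops back to zero, the pending
 * umems are switched to private accounting and their waiters are released.
 */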
/* Account for a new mmu notifier in an ib_ucontext. */
static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
{
	atomic_inc(&context->notifier_count);
}

/* Account for a terminating mmu notifier in an ib_ucontext.
 *
 * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
 * the function takes the semaphore itself. */
static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
{
	int zero_notifiers = atomic_dec_and_test(&context->notifier_count);

	if (zero_notifiers &&
	    !list_empty(&context->no_private_counters)) {
		/* No currently running mmu notifiers. Now is the chance to
		 * add private accounting to all previously added umems. */
		struct ib_umem_odp *odp_data, *next;

		/* Prevent concurrent mmu notifiers from working on the
		 * no_private_counters list. */
		down_write(&context->umem_rwsem);

		/* Read the notifier_count again, with the umem_rwsem
		 * semaphore taken for write. */
		if (!atomic_read(&context->notifier_count)) {
			list_for_each_entry_safe(odp_data, next,
						 &context->no_private_counters,
						 no_private_counters) {
				mutex_lock(&odp_data->umem_mutex);
				odp_data->mn_counters_active = true;
				list_del(&odp_data->no_private_counters);
				complete_all(&odp_data->notifier_completion);
				mutex_unlock(&odp_data->umem_mutex);
			}
		}

		up_write(&context->umem_rwsem);
	}
}

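/*
 * The ->release() notifier runs when the owning process's address space is
 * being torn down.  Every umem in the context is marked as dying, any page
 * faults still waiting on the notifier completion are woken, and the driver
 * is asked to invalidate the whole mapping.  No matching end_account()
 * follows, so further faults on these umems stay blocked.
 */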
static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
					       u64 end, void *cookie) {
	/*
	 * Increase the number of notifiers running, to
	 * prevent any further fault handling on this MR.
	 */
	ib_umem_notifier_start_account(item);
	item->odp_data->dying = 1;
	/* Make sure the dying flag is visible to other CPUs before we release
	 * all pending page faults. */
	smp_wmb();
	complete_all(&item->odp_data->notifier_completion);
	item->context->invalidate_range(item, ib_umem_start(item),
					ib_umem_end(item));
	return 0;
}

static void ib_umem_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
				      ULLONG_MAX,
				      ib_umem_notifier_release_trampoline,
				      NULL);
	up_read(&context->umem_rwsem);
}

static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
				      u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->context->invalidate_range(item, start, start + PAGE_SIZE);
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
				      address + PAGE_SIZE,
				      invalidate_page_trampoline, NULL);
	up_read(&context->umem_rwsem);
	ib_ucontext_notifier_end_account(context);
}

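/*
 * invalidate_range_start/end come in pairs: the start callback bumps the
 * per-umem notifiers_count and tells the driver to drop its mappings for
 * the range, while the matching end callback (below) decrements the count
 * and advances notifiers_seq so that racing page faults notice the
 * invalidation and retry.
 */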
static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
					     u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->context->invalidate_range(item, start, end);
	return 0;
}

static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
				      end,
				      invalidate_range_start_trampoline, NULL);
	up_read(&context->umem_rwsem);
}

static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
					   u64 end, void *cookie)
{
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
				      end,
				      invalidate_range_end_trampoline, NULL);
	up_read(&context->umem_rwsem);
	ib_ucontext_notifier_end_account(context);
}

static const struct mmu_notifier_ops ib_umem_notifiers = {
	.release                    = ib_umem_notifier_release,
	.invalidate_page            = ib_umem_notifier_invalidate_page,
	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
};

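/*
 * ib_umem_odp_get - Initialize the ODP state of a umem.
 *
 * Allocates the page and DMA lists, inserts the umem into the context's
 * interval tree, and registers an MMU notifier for the owning mm when this
 * is the first ODP MR in the context.  May only be called by the process
 * that created the context; ODP MRs cannot be created in child processes.
 */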
int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
{
	int ret_val;
	struct pid *our_pid;
	struct mm_struct *mm = get_task_mm(current);

	if (!mm)
		return -EINVAL;

	/* Prevent creating ODP MRs in child processes */
	rcu_read_lock();
	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
	rcu_read_unlock();
	put_pid(our_pid);
	if (context->tgid != our_pid) {
		ret_val = -EINVAL;
		goto out_mm;
	}

	umem->hugetlb = 0;
	umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
	if (!umem->odp_data) {
		ret_val = -ENOMEM;
		goto out_mm;
	}
	umem->odp_data->umem = umem;

	mutex_init(&umem->odp_data->umem_mutex);

	init_completion(&umem->odp_data->notifier_completion);

	umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
					    sizeof(*umem->odp_data->page_list));
	if (!umem->odp_data->page_list) {
		ret_val = -ENOMEM;
		goto out_odp_data;
	}

	umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
					   sizeof(*umem->odp_data->dma_list));
	if (!umem->odp_data->dma_list) {
		ret_val = -ENOMEM;
		goto out_page_list;
	}

	/*
	 * When using MMU notifiers, we will get a
	 * notification before the "current" task (and MM) is
	 * destroyed. We use the umem_rwsem semaphore to synchronize.
	 */
	down_write(&context->umem_rwsem);
	context->odp_mrs_count++;
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_insert(&umem->odp_data->interval_tree,
				   &context->umem_tree);
	if (likely(!atomic_read(&context->notifier_count)) ||
	    context->odp_mrs_count == 1)
		umem->odp_data->mn_counters_active = true;
	else
		list_add(&umem->odp_data->no_private_counters,
			 &context->no_private_counters);
	downgrade_write(&context->umem_rwsem);

	if (context->odp_mrs_count == 1) {
		/*
		 * Note that at this point, no MMU notifier is running
		 * for this context!
		 */
		atomic_set(&context->notifier_count, 0);
		INIT_HLIST_NODE(&context->mn.hlist);
		context->mn.ops = &ib_umem_notifiers;
		/*
		 * Lockdep reports a false positive for mmap_sem vs.
		 * umem_rwsem here because it does not understand the
		 * downgrade_write above.
		 */
		lockdep_off();
		ret_val = mmu_notifier_register(&context->mn, mm);
		lockdep_on();
		if (ret_val) {
			pr_err("Failed to register mmu_notifier %d\n", ret_val);
			ret_val = -EBUSY;
			goto out_mutex;
		}
	}

	up_read(&context->umem_rwsem);

	/*
	 * Note that mmput() can trigger a notifier callback for the relevant
	 * mm. If such a notifier ran while we held umem_rwsem, it would
	 * deadlock, so we drop the mm reference only after releasing the
	 * semaphore.
	 */
	mmput(mm);
	return 0;

out_mutex:
	up_read(&context->umem_rwsem);
	vfree(umem->odp_data->dma_list);
out_page_list:
	vfree(umem->odp_data->page_list);
out_odp_data:
	kfree(umem->odp_data);
out_mm:
	mmput(mm);
	return ret_val;
}

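/*
 * ib_umem_odp_release - Tear down the ODP state of a umem.
 *
 * Unmaps any remaining pages, removes the umem from the context's interval
 * tree, and unregisters the MMU notifier once the last ODP MR in the
 * context goes away.
 */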
void ib_umem_odp_release(struct ib_umem *umem)
{
	struct ib_ucontext *context = umem->context;

	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
				    ib_umem_end(umem));

	down_write(&context->umem_rwsem);
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_remove(&umem->odp_data->interval_tree,
				   &context->umem_tree);
	context->odp_mrs_count--;
	if (!umem->odp_data->mn_counters_active) {
		list_del(&umem->odp_data->no_private_counters);
		complete_all(&umem->odp_data->notifier_completion);
	}

	/*
	 * Downgrade the lock to a read lock. This ensures that the notifiers
	 * (which take the lock for reading) will be able to finish, and we
	 * will eventually be able to obtain the mmu notifiers SRCU. Note
	 * that since we are doing it atomically, no other user could register
	 * and unregister while we do the check.
	 */
	downgrade_write(&context->umem_rwsem);
	if (!context->odp_mrs_count) {
		struct task_struct *owning_process = NULL;
		struct mm_struct *owning_mm = NULL;

		owning_process = get_pid_task(context->tgid,
					      PIDTYPE_PID);
		if (owning_process == NULL)
			/*
			 * The process is already dead, and the notifiers were
			 * already removed.
			 */
			goto out;

		owning_mm = get_task_mm(owning_process);
		if (owning_mm == NULL)
			/*
			 * The process' mm is already dead, and the notifiers
			 * were already removed.
			 */
			goto out_put_task;
		mmu_notifier_unregister(&context->mn, owning_mm);

		mmput(owning_mm);

out_put_task:
		put_task_struct(owning_process);
	}
out:
	up_read(&context->umem_rwsem);

	vfree(umem->odp_data->dma_list);
	vfree(umem->odp_data->page_list);
	kfree(umem->odp_data);
	kfree(umem);
}

/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem: the umem to insert the page to.
 * @page_index: index in the umem to add the page to.
 * @base_virt_addr: page-aligned base virtual address that, together with
 *                  @page_index, is used to compute the range to invalidate
 *                  if a conflicting mapping has to be removed.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               The sequence number is taken from
 *               umem->odp_data->notifiers_seq.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem *umem,
		int page_index,
		u64 base_virt_addr,
		struct page *page,
		u64 access_mask,
		unsigned long current_seq)
{
	struct ib_device *dev = umem->context->device;
	dma_addr_t dma_addr;
	int stored_page = 0;
	int remove_existing_mapping = 0;
	int ret = 0;

	/*
	 * Note: we avoid writing if seq is different from the initial seq, to
	 * handle case of a racing notifier. This check also allows us to bail
	 * early if we have a notifier running in parallel with us.
	 */
	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
		ret = -EAGAIN;
		goto out;
	}
	if (!(umem->odp_data->dma_list[page_index])) {
		dma_addr = ib_dma_map_page(dev,
					   page,
					   0, PAGE_SIZE,
					   DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(dev, dma_addr)) {
			ret = -EFAULT;
			goto out;
		}
		umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
		umem->odp_data->page_list[page_index] = page;
		stored_page = 1;
	} else if (umem->odp_data->page_list[page_index] == page) {
		umem->odp_data->dma_list[page_index] |= access_mask;
	} else {
		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
		       umem->odp_data->page_list[page_index], page);
		/* Better remove the mapping now, to prevent any further
		 * damage. */
		remove_existing_mapping = 1;
	}

out:
	/* On Demand Paging - avoid pinning the page */
	if (umem->context->invalidate_range || !stored_page)
		put_page(page);

	if (remove_existing_mapping && umem->context->invalidate_range) {
		invalidate_page_trampoline(
			umem,
			base_virt_addr + (page_index * PAGE_SIZE),
			base_virt_addr + ((page_index + 1) * PAGE_SIZE),
			NULL);
		ret = -EAGAIN;
	}

	return ret;
}

/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem->odp_data->dma_list.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 *
 * @umem: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is read from
 *               umem->odp_data->notifiers_seq before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
			      u64 access_mask, unsigned long current_seq)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = NULL;
	struct page **local_page_list = NULL;
	u64 off;
	int j, k, ret = 0, start_idx, npages = 0;
	u64 base_virt_addr;
	unsigned int flags = 0;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem) ||
	    user_virt + bcnt > ib_umem_end(umem))
		return -EFAULT;

	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!local_page_list)
		return -ENOMEM;

	off = user_virt & (~PAGE_MASK);
	user_virt = user_virt & PAGE_MASK;
	base_virt_addr = user_virt;
	bcnt += off; /* Charge for the first page offset as well. */

	owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
	if (owning_process == NULL) {
		ret = -EINVAL;
		goto out_no_task;
	}

	owning_mm = get_task_mm(owning_process);
	if (owning_mm == NULL) {
		ret = -EINVAL;
		goto out_put_task;
	}

	if (access_mask & ODP_WRITE_ALLOWED_BIT)
		flags |= FOLL_WRITE;

	start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
	k = start_idx;

	while (bcnt > 0) {
		const size_t gup_num_pages =
			min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
			      PAGE_SIZE / sizeof(struct page *));

		down_read(&owning_mm->mmap_sem);
		/*
		 * Note: this might result in redundant page getting. We can
		 * avoid this by checking dma_list for 0 before calling
		 * get_user_pages. However, this makes the code much more
		 * complex (and doesn't gain us much performance in most use
		 * cases).
		 */
		npages = get_user_pages_remote(owning_process, owning_mm,
					       user_virt, gup_num_pages,
					       flags, local_page_list, NULL);
		up_read(&owning_mm->mmap_sem);

		if (npages < 0)
			break;

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		user_virt += npages << PAGE_SHIFT;
		mutex_lock(&umem->odp_data->umem_mutex);
		for (j = 0; j < npages; ++j) {
			ret = ib_umem_odp_map_dma_single_page(
				umem, k, base_virt_addr, local_page_list[j],
				access_mask, current_seq);
			if (ret < 0)
				break;
			k++;
		}
		mutex_unlock(&umem->odp_data->umem_mutex);

		if (ret < 0) {
			/* Release leftover pages when handling errors. */
			for (++j; j < npages; ++j)
				put_page(local_page_list[j]);
			break;
		}
	}

	if (ret >= 0) {
		if (npages < 0 && k == start_idx)
			ret = npages;
		else
			ret = k - start_idx;
	}

	mmput(owning_mm);
out_put_task:
	put_task_struct(owning_process);
out_no_task:
	free_page((unsigned long)local_page_list);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);

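/*
 * ib_umem_odp_unmap_dma_pages - Unmap and release a range of an ODP umem.
 *
 * Walks the [virt, bound) range (clamped to the umem), DMA-unmaps each
 * mapped page, marks write-enabled pages dirty, and drops the page
 * reference when the driver does not use invalidation-based (unpinned) ODP.
 * The caller must ensure that no racing page faults can complete for this
 * range, e.g. because an invalidation is in progress (notifiers_count > 0)
 * or the umem is being released.
 */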
void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
				 u64 bound)
{
	int idx;
	u64 addr;
	struct ib_device *dev = umem->context->device;

	virt = max_t(u64, virt, ib_umem_start(umem));
	bound = min_t(u64, bound, ib_umem_end(umem));
	/* Note that during the run of this function, the
	 * notifiers_count of the MR is > 0, preventing any racing
	 * faults from completing. We might be racing with other
	 * invalidations, so we must make sure we free each page only
	 * once. */
	mutex_lock(&umem->odp_data->umem_mutex);
	for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
		if (umem->odp_data->page_list[idx]) {
			struct page *page = umem->odp_data->page_list[idx];
			dma_addr_t dma = umem->odp_data->dma_list[idx];
			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

			WARN_ON(!dma_addr);

			ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);
				/*
				 * set_page_dirty prefers being called with
				 * the page lock. However, MMU notifiers are
				 * called sometimes with and sometimes without
				 * the lock. We rely on the umem_mutex instead
				 * to prevent other mmu notifiers from
				 * continuing and allowing the page mapping to
				 * be removed.
				 */
				set_page_dirty(head_page);
			}
			/* on demand pinning support */
			if (!umem->context->invalidate_range)
				put_page(page);
			umem->odp_data->page_list[idx] = NULL;
			umem->odp_data->dma_list[idx] = 0;
		}
	}
	mutex_unlock(&umem->odp_data->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);