1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
|
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2005 Mellanox Technologies. All rights reserved.
* Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
* Copyright 2019 Marvell. All rights reserved.
*/
#include <linux/xarray.h>
#include "uverbs.h"
#include "core_priv.h"
/**
 * rdma_umap_priv_init() - Set up the tracking data attached to a vma
 *
 * @priv: Caller-allocated private data to initialize
 * @vma: The vm area struct the private data is attached to
 * @entry: mmap_xa entry to associate with this vma, or NULL when the
 *         mapping is not backed by an entry
 *
 * Every mapping of IO memory into user space is recorded so that it can be
 * found again later. On device hot-unplug the recorded mmaps are 'zapped' to
 * point at the zero page, which lets the hot unplug proceed.
 *
 * This matters for cases such as PCI physical hot unplug: the BAR memory may
 * vanish afterwards and a userspace access to it could MCE.
 *
 * User space of RDMA drivers that support disassociation has to be designed
 * to cope in some way with its IO pages turning into the zero page.
 *
 * When @entry is not NULL a reference is taken on it; @priv->entry is left
 * untouched otherwise. @priv is stored in vma->vm_private_data and linked on
 * the umaps list of the ib_uverbs_file found through vma->vm_file, under
 * umap_lock.
 */
void rdma_umap_priv_init(struct rdma_umap_priv *priv,
			 struct vm_area_struct *vma,
			 struct rdma_user_mmap_entry *entry)
{
	struct ib_uverbs_file *file = vma->vm_file->private_data;

	/* The vma keeps the entry alive for as long as it is linked to it */
	if (entry) {
		priv->entry = entry;
		kref_get(&entry->ref);
	}

	/* Cross-link the vma and its private data */
	vma->vm_private_data = priv;
	priv->vma = vma;

	/* vm_ops is setup in ib_uverbs_mmap() to avoid module dependencies */
	mutex_lock(&file->umap_lock);
	list_add(&priv->list, &file->umaps);
	mutex_unlock(&file->umap_lock);
}
EXPORT_SYMBOL(rdma_umap_priv_init);
/**
 * rdma_user_mmap_io() - Map IO memory into a process
 *
 * @ucontext: associated user context
 * @vma: the vma related to the current mmap call
 * @pfn: pfn to map
 * @size: size to map, must equal the length of @vma
 * @prot: pgprot to use in remap call
 * @entry: mmap_entry retrieved from rdma_user_mmap_entry_get(), or NULL
 *         if mmap_entry is not used by the driver
 *
 * Drivers call this from their mmap() functions when they wish to expose
 * something like PCI-E BAR memory to userspace. On success the mapping is
 * registered through rdma_umap_priv_init().
 *
 * Return: 0 on success, -EINVAL on wrong flags or size (or when @vma does not
 * belong to the uverbs file of @ucontext), -ENOMEM if the tracking data
 * cannot be allocated, -EAGAIN on failure to map.
 */
int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
		      unsigned long pfn, unsigned long size, pgprot_t prot,
		      struct rdma_user_mmap_entry *entry)
{
	struct ib_uverbs_file *ufile = ucontext->ufile;
	struct rdma_umap_priv *priv;
	int rc;

	/* Only shared mappings covering exactly @size bytes are accepted */
	if (!(vma->vm_flags & VM_SHARED) ||
	    vma->vm_end - vma->vm_start != size)
		return -EINVAL;

	/* Driver is using this wrong, must be called by ib_uverbs_mmap */
	if (WARN_ON(!vma->vm_file || vma->vm_file->private_data != ufile))
		return -EINVAL;
	lockdep_assert_held(&ufile->device->disassociate_srcu);

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	vma->vm_page_prot = prot;
	rc = io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot);
	if (rc) {
		/* Any remap failure is reported to the caller as -EAGAIN */
		kfree(priv);
		return -EAGAIN;
	}

	rdma_umap_priv_init(priv, vma, entry);
	return 0;
}
EXPORT_SYMBOL(rdma_user_mmap_io);
/**
 * rdma_user_mmap_entry_get_pgoff() - Look up an entry in the mmap_xa
 *
 * @ucontext: associated user context
 * @pgoff: The mmap offset >> PAGE_SHIFT
 *
 * Called when a user tries to mmap with an offset (returned by
 * rdma_user_mmap_get_offset()) that it initially received from the driver.
 * The rdma_user_mmap_entry was created by rdma_user_mmap_entry_insert(). A
 * reference is taken on the entry so that it won't be deleted from the
 * xarray in the meantime.
 *
 * Only the first page offset of an entry matches; an offset above U32_MAX,
 * an offset inside an entry, or an entry that the driver already removed
 * yields NULL.
 *
 * Return: a reference to the entry if it exists, or NULL if there is no
 * match. rdma_user_mmap_entry_put() must be called to put the reference.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
			       unsigned long pgoff)
{
	struct rdma_user_mmap_entry *entry;

	if (pgoff > U32_MAX)
		return NULL;

	xa_lock(&ucontext->mmap_xa);
	entry = xa_load(&ucontext->mmap_xa, pgoff);

	/*
	 * A refcount of zero means the entry is already being deleted.
	 * driver_removed means that no further mmaps are possible and that
	 * we are only waiting for the active VMAs to be closed.
	 */
	if (entry && entry->start_pgoff == pgoff && !entry->driver_removed &&
	    kref_get_unless_zero(&entry->ref)) {
		xa_unlock(&ucontext->mmap_xa);

		ibdev_dbg(ucontext->device,
			  "mmap: pgoff[%#lx] npages[%#zx] returned\n",
			  pgoff, entry->npages);
		return entry;
	}

	xa_unlock(&ucontext->mmap_xa);
	return NULL;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);
/**
 * rdma_user_mmap_entry_get() - Look up the mmap_xa entry matching a vma
 *
 * @ucontext: associated user context
 * @vma: the vma being mmap'd into
 *
 * Works like rdma_user_mmap_entry_get_pgoff(), using @vma->vm_pgoff as the
 * offset, and additionally checks that the VMA is correct: it must be a
 * shared mapping whose length equals the size of the entry.
 *
 * Return: a reference to the entry, or NULL when there is no match. The
 * reference taken by the lookup is dropped again on a size mismatch.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
			 struct vm_area_struct *vma)
{
	struct rdma_user_mmap_entry *entry;

	if (!(vma->vm_flags & VM_SHARED))
		return NULL;

	entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);

	/* The vma has to span the entry exactly, otherwise give it back */
	if (entry &&
	    entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) {
		rdma_user_mmap_entry_put(entry);
		entry = NULL;
	}

	return entry;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_get);
/*
 * kref release callback of an rdma_user_mmap_entry, passed to kref_put() by
 * the put/remove helpers. Erases every page offset the entry occupies in the
 * mmap_xa and then hands the entry to the driver's mmap_free op, if set.
 */
static void rdma_user_mmap_entry_free(struct kref *kref)
{
	struct rdma_user_mmap_entry *entry =
		container_of(kref, struct rdma_user_mmap_entry, ref);
	struct ib_ucontext *ucontext = entry->ucontext;
	unsigned long page = 0;

	/*
	 * One xarray slot per page was stored for this entry; drop them all.
	 * This is deferred until all VMA are closed so that the mmap offsets
	 * remain unique.
	 */
	xa_lock(&ucontext->mmap_xa);
	while (page < entry->npages) {
		__xa_erase(&ucontext->mmap_xa, entry->start_pgoff + page);
		page++;
	}
	xa_unlock(&ucontext->mmap_xa);

	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
		  entry->start_pgoff, entry->npages);

	/* The op is optional; entry is not touched after this call */
	if (ucontext->device->ops.mmap_free)
		ucontext->device->ops.mmap_free(entry);
}
/**
 * rdma_user_mmap_entry_put() - Drop a reference to the mmap entry
 *
 * @entry: an entry in the mmap_xa
 *
 * Called when the mapping is closed, if it was an io mapping, or when the
 * driver is done with the entry for some other reason. It pairs with
 * rdma_user_mmap_entry_get() and should be called once the entry is no
 * longer needed.
 *
 * When the refcnt reaches zero the entry is erased from the mmap_xa and
 * freed.
 */
void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
{
	/* The release callback runs only when this was the last reference */
	kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_put);
/**
* rdma_user_mmap_entry_remove() - Drop reference to entry and
* mark it as unmmapable
*
* @entry: the entry to insert into the mmap_xa
*
* Drivers can call this to prevent userspace from creating more mappings for
* entry, however existing mmaps continue to exist and ops->mmap_free() will
* not be called until all user mmaps are destroyed.
*/
void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
{
if (!entry)
return;
xa_lock(&entry->ucontext->mmap_xa);
entry->driver_removed = true;
xa_unlock(&entry->ucontext->mmap_xa);
kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
/**
* rdma_user_mmap_entry_insert_range() - Insert an entry to the mmap_xa
* in a given range.
*
* @ucontext: associated user context.
* @entry: the entry to insert into the mmap_xa
* @length: length of the address that will be mmapped
* @min_pgoff: minimum pgoff to be returned
* @max_pgoff: maximum pgoff to be returned
*
* This function should be called by drivers that use the rdma_user_mmap
* interface for implementing their mmap syscall. A database of mmap offsets
* is handled in the core and helper functions are provided to insert entries
* into the database and extract entries when the user calls mmap with the
* given offset. The function allocates a unique page offset in a given range
* that should be provided to user, the user will use the offset to retrieve
* information such as address to be mapped and how.
*
* @length is rounded up to whole pages and the same @entry pointer is stored
* at every page offset of the chosen range. On success @entry->ref has been
* initialized, and @entry->ucontext, @entry->npages and @entry->start_pgoff
* are set.
*
* Return: 0 on success, -EINVAL if @entry is NULL, and -ENOMEM if no free
* range could be found or storing into the mmap_xa failed.
*/
int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
struct rdma_user_mmap_entry *entry,
size_t length, u32 min_pgoff,
u32 max_pgoff)
{
struct ib_uverbs_file *ufile = ucontext->ufile;
/* Search cursor over the mmap_xa, starting at min_pgoff */
XA_STATE(xas, &ucontext->mmap_xa, min_pgoff);
u32 xa_first, xa_last, npages;
int err;
u32 i;
if (!entry)
return -EINVAL;
/* The entry starts out with a single reference */
kref_init(&entry->ref);
entry->ucontext = ucontext;
/*
* We want the whole allocation to be done without interruption from a
* different thread. The allocation requires finding a free range and
* storing. During the xa_insert the lock could be released, possibly
* allowing another thread to choose the same range.
*/
mutex_lock(&ufile->umap_lock);
xa_lock(&ucontext->mmap_xa);
/* We want to find an empty range */
/*
* NOTE(review): the page count is narrowed to u32 by this cast, so a
* length of more than U32_MAX pages would be silently truncated - confirm
* that callers never pass one.
*/
npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
entry->npages = npages;
while (true) {
/* First find an empty index */
xas_find_marked(&xas, max_pgoff, XA_FREE_MARK);
/* No free index was found up to max_pgoff: give up */
if (xas.xa_node == XAS_RESTART)
goto err_unlock;
xa_first = xas.xa_index;
/* Is there enough room to have the range? */
/* xa_last is one past the final index of the candidate range */
if (check_add_overflow(xa_first, npages, &xa_last))
goto err_unlock;
/*
* Now look for the next present entry. If an entry doesn't
* exist, we found an empty range and can proceed.
*
* NOTE(review): only the first index is bounded by max_pgoff here;
* nothing visible in this function checks xa_last - 1 against
* max_pgoff - confirm whether a range may extend past it.
*/
xas_next_entry(&xas, xa_last - 1);
if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
break;
/* Otherwise retry the search from the cursor's current position */
}
/* Store the same entry pointer at every index in [xa_first, xa_last) */
for (i = xa_first; i < xa_last; i++) {
err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
if (err)
goto err_undo;
}
/*
* Internally the kernel uses a page offset, in libc this is a byte
* offset. Drivers should not return pgoff to userspace.
*/
entry->start_pgoff = xa_first;
xa_unlock(&ucontext->mmap_xa);
mutex_unlock(&ufile->umap_lock);
ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n",
entry->start_pgoff, npages);
return 0;
err_undo:
/* i is the index that failed: erase [xa_first, i) in reverse order */
for (; i > xa_first; i--)
__xa_erase(&ucontext->mmap_xa, i - 1);
err_unlock:
xa_unlock(&ucontext->mmap_xa);
mutex_unlock(&ufile->umap_lock);
/* Every failure past the NULL check is reported as -ENOMEM */
return -ENOMEM;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_insert_range);
/**
 * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa.
 *
 * @ucontext: associated user context.
 * @entry: the entry to insert into the mmap_xa
 * @length: length of the address that will be mmapped
 *
 * To be called by drivers that use the rdma_user_mmap interface for handling
 * user mmapped addresses. The database is handled in the core, which also
 * provides helpers to insert entries into it and to extract them again when
 * the user calls mmap with the given offset. A unique page offset is
 * allocated for the entry; it should be provided to the user, who will use
 * it to retrieve information such as the address to be mapped and how.
 *
 * This is rdma_user_mmap_entry_insert_range() over the page offset range
 * [0, U32_MAX].
 *
 * Return: 0 on success, or the negative errno reported by
 * rdma_user_mmap_entry_insert_range() on failure.
 */
int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
				struct rdma_user_mmap_entry *entry,
				size_t length)
{
	/* No restriction on where in the 32 bit pgoff space the entry lands */
	const u32 lowest_pgoff = 0;
	const u32 highest_pgoff = U32_MAX;

	return rdma_user_mmap_entry_insert_range(ucontext, entry, length,
						 lowest_pgoff, highest_pgoff);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
|