blob: 92867f3e945c2b46eeedf147c43d0ea993f2c46b [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
Alex Elderf0f8cef2012-01-29 13:57:44 -060053#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070055
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57
Alex Elder21079782012-01-24 10:08:36 -060058#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
Yehuda Sadeh602adf42010-08-12 16:11:25 -070059#define RBD_MAX_SNAP_NAME_LEN 32
60#define RBD_MAX_OPT_LEN 1024
61
62#define RBD_SNAP_HEAD_NAME "-"
63
Alex Elder81a89792012-02-02 08:13:30 -060064/*
65 * An RBD device name will be "rbd#", where the "rbd" comes from
66 * RBD_DRV_NAME above, and # is a unique integer identifier.
67 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
68 * enough to hold all possible device names.
69 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070070#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060071#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070072
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070073#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
74
Yehuda Sadeh602adf42010-08-12 16:11:25 -070075/*
76 * block device image metadata (in-memory version)
77 */
78struct rbd_image_header {
79 u64 image_size;
Alex Elder849b4262012-07-09 21:04:24 -050080 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070081 __u8 obj_order;
82 __u8 crypt_type;
83 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070084 struct ceph_snap_context *snapc;
85 size_t snap_names_len;
86 u64 snap_seq;
87 u32 total_snaps;
88
89 char *snap_names;
90 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070091
92 u64 obj_version;
93};
94
95struct rbd_options {
96 int notify_timeout;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097};
98
99/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600100 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700101 */
102struct rbd_client {
103 struct ceph_client *client;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700104 struct rbd_options *rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700105 struct kref kref;
106 struct list_head node;
107};
108
109/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600110 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700111 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700112struct rbd_req_status {
113 int done;
114 int rc;
115 u64 bytes;
116};
117
118/*
119 * a collection of requests
120 */
121struct rbd_req_coll {
122 int total;
123 int num_done;
124 struct kref kref;
125 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700126};
127
Alex Elderf0f8cef2012-01-29 13:57:44 -0600128/*
129 * a single io request
130 */
131struct rbd_request {
132 struct request *rq; /* blk layer request */
133 struct bio *bio; /* cloned bio */
134 struct page **pages; /* list of used pages */
135 u64 len;
136 int coll_index;
137 struct rbd_req_coll *coll;
138};
139
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800140struct rbd_snap {
141 struct device dev;
142 const char *name;
Josh Durgin3591538f2011-12-05 18:25:13 -0800143 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800144 struct list_head node;
145 u64 id;
146};
147
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700148/*
149 * a single device
150 */
151struct rbd_device {
152 int id; /* blkdev unique id */
153
154 int major; /* blkdev assigned major */
155 struct gendisk *disk; /* blkdev's gendisk and rq */
156 struct request_queue *q;
157
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700158 struct rbd_client *rbd_client;
159
160 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
161
162 spinlock_t lock; /* queue lock */
163
164 struct rbd_image_header header;
165 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
166 int obj_len;
167 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
Alex Elderd22f76e2012-07-12 10:46:35 -0500168 char *pool_name;
Alex Elder9bb2f332012-07-12 10:46:35 -0500169 int pool_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700170
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700171 struct ceph_osd_event *watch_event;
172 struct ceph_osd_request *watch_request;
173
Josh Durginc6666012011-11-21 17:11:12 -0800174 /* protects updating the header */
175 struct rw_semaphore header_rwsem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700176 char snap_name[RBD_MAX_SNAP_NAME_LEN];
Josh Durgin77dfe992011-11-21 13:04:42 -0800177 u64 snap_id; /* current snapshot id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700178 int read_only;
179
180 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800181
182 /* list of snapshots */
183 struct list_head snaps;
184
185 /* sysfs related */
186 struct device dev;
187};
188
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700189static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600190
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700191static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600192static DEFINE_SPINLOCK(rbd_dev_list_lock);
193
Alex Elder432b8582012-01-29 13:57:44 -0600194static LIST_HEAD(rbd_client_list); /* clients */
195static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700196
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800197static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
198static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800199static ssize_t rbd_snap_add(struct device *dev,
200 struct device_attribute *attr,
201 const char *buf,
202 size_t count);
203static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700204 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800205
Alex Elderf0f8cef2012-01-29 13:57:44 -0600206static ssize_t rbd_add(struct bus_type *bus, const char *buf,
207 size_t count);
208static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
209 size_t count);
210
211static struct bus_attribute rbd_bus_attrs[] = {
212 __ATTR(add, S_IWUSR, NULL, rbd_add),
213 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
214 __ATTR_NULL
215};
216
217static struct bus_type rbd_bus_type = {
218 .name = "rbd",
219 .bus_attrs = rbd_bus_attrs,
220};
221
222static void rbd_root_dev_release(struct device *dev)
223{
224}
225
226static struct device rbd_root_dev = {
227 .init_name = "rbd",
228 .release = rbd_root_dev_release,
229};
230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
233{
234 return get_device(&rbd_dev->dev);
235}
236
237static void rbd_put_dev(struct rbd_device *rbd_dev)
238{
239 put_device(&rbd_dev->dev);
240}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700241
Josh Durgin263c6ca2011-12-05 10:43:42 -0800242static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700243
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700244static int rbd_open(struct block_device *bdev, fmode_t mode)
245{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600246 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700247
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800248 rbd_get_dev(rbd_dev);
249
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700250 set_device_ro(bdev, rbd_dev->read_only);
251
252 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
253 return -EROFS;
254
255 return 0;
256}
257
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800258static int rbd_release(struct gendisk *disk, fmode_t mode)
259{
260 struct rbd_device *rbd_dev = disk->private_data;
261
262 rbd_put_dev(rbd_dev);
263
264 return 0;
265}
266
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267static const struct block_device_operations rbd_bd_ops = {
268 .owner = THIS_MODULE,
269 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800270 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700271};
272
273/*
274 * Initialize an rbd client instance.
275 * We own *opt.
276 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700277static struct rbd_client *rbd_client_create(struct ceph_options *opt,
278 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
Alex Elderbc534d82012-01-29 13:57:44 -0600291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
Sage Weil6ab00d42011-08-09 09:41:59 -0700293 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600295 goto out_mutex;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400296 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700302 rbdc->rbd_opts = rbd_opts;
303
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307
Alex Elderbc534d82012-01-29 13:57:44 -0600308 mutex_unlock(&ctl_mutex);
309
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600315out_mutex:
316 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 kfree(rbdc);
318out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400319 if (opt)
320 ceph_destroy_options(opt);
321 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322}
323
324/*
325 * Find a ceph client with specific addr and configuration.
326 */
327static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
328{
329 struct rbd_client *client_node;
330
331 if (opt->flags & CEPH_OPT_NOSHARE)
332 return NULL;
333
334 list_for_each_entry(client_node, &rbd_client_list, node)
335 if (ceph_compare_options(opt, client_node->client) == 0)
336 return client_node;
337 return NULL;
338}
339
340/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700341 * mount options
342 */
343enum {
344 Opt_notify_timeout,
345 Opt_last_int,
346 /* int args above */
347 Opt_last_string,
348 /* string args above */
349};
350
351static match_table_t rbdopt_tokens = {
352 {Opt_notify_timeout, "notify_timeout=%d"},
353 /* int args above */
354 /* string args above */
355 {-1, NULL}
356};
357
358static int parse_rbd_opts_token(char *c, void *private)
359{
360 struct rbd_options *rbdopt = private;
361 substring_t argstr[MAX_OPT_ARGS];
362 int token, intval, ret;
363
Alex Elder21079782012-01-24 10:08:36 -0600364 token = match_token(c, rbdopt_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700365 if (token < 0)
366 return -EINVAL;
367
368 if (token < Opt_last_int) {
369 ret = match_int(&argstr[0], &intval);
370 if (ret < 0) {
371 pr_err("bad mount option arg (not int) "
372 "at '%s'\n", c);
373 return ret;
374 }
375 dout("got int token %d val %d\n", token, intval);
376 } else if (token > Opt_last_int && token < Opt_last_string) {
377 dout("got string token %d val %s\n", token,
378 argstr[0].from);
379 } else {
380 dout("got token %d\n", token);
381 }
382
383 switch (token) {
384 case Opt_notify_timeout:
385 rbdopt->notify_timeout = intval;
386 break;
387 default:
388 BUG_ON(token);
389 }
390 return 0;
391}
392
393/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700394 * Get a ceph client with specific addr and configuration, if one does
395 * not exist create it.
396 */
Alex Elder5214ecc2012-02-02 08:13:30 -0600397static struct rbd_client *rbd_get_client(const char *mon_addr,
398 size_t mon_addr_len,
399 char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700400{
401 struct rbd_client *rbdc;
402 struct ceph_options *opt;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700403 struct rbd_options *rbd_opts;
404
405 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
406 if (!rbd_opts)
Alex Elderd720bcb2012-02-02 08:13:30 -0600407 return ERR_PTR(-ENOMEM);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700408
409 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700410
Alex Elderee577412012-01-24 10:08:36 -0600411 opt = ceph_parse_options(options, mon_addr,
Alex Elder5214ecc2012-02-02 08:13:30 -0600412 mon_addr + mon_addr_len,
Alex Elder21079782012-01-24 10:08:36 -0600413 parse_rbd_opts_token, rbd_opts);
Alex Elderee577412012-01-24 10:08:36 -0600414 if (IS_ERR(opt)) {
Alex Elderd720bcb2012-02-02 08:13:30 -0600415 kfree(rbd_opts);
416 return ERR_CAST(opt);
Alex Elderee577412012-01-24 10:08:36 -0600417 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700418
Alex Elder432b8582012-01-29 13:57:44 -0600419 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700420 rbdc = __rbd_client_find(opt);
421 if (rbdc) {
Alex Eldere6994d3d2012-01-29 13:57:44 -0600422 /* using an existing client */
423 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600424 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d3d2012-01-29 13:57:44 -0600425
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700426 ceph_destroy_options(opt);
Alex Elder97bb59a2012-01-24 10:08:36 -0600427 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700428
Alex Elderd720bcb2012-02-02 08:13:30 -0600429 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700430 }
Alex Elder432b8582012-01-29 13:57:44 -0600431 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700433 rbdc = rbd_client_create(opt, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600434
Alex Elderd720bcb2012-02-02 08:13:30 -0600435 if (IS_ERR(rbdc))
436 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700437
Alex Elderd720bcb2012-02-02 08:13:30 -0600438 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700439}
440
441/*
442 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600443 *
Alex Elder432b8582012-01-29 13:57:44 -0600444 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700445 */
446static void rbd_client_release(struct kref *kref)
447{
448 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
449
450 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500451 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700452 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500453 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700454
455 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700456 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700457 kfree(rbdc);
458}
459
460/*
461 * Drop reference to ceph client node. If it's not referenced anymore, release
462 * it.
463 */
464static void rbd_put_client(struct rbd_device *rbd_dev)
465{
466 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
467 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468}
469
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700470/*
471 * Destroy requests collection
472 */
473static void rbd_coll_release(struct kref *kref)
474{
475 struct rbd_req_coll *coll =
476 container_of(kref, struct rbd_req_coll, kref);
477
478 dout("rbd_coll_release %p\n", coll);
479 kfree(coll);
480}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700481
482/*
483 * Create a new header structure, translate header format from the on-disk
484 * header.
485 */
486static int rbd_header_from_disk(struct rbd_image_header *header,
487 struct rbd_image_header_ondisk *ondisk,
Xi Wang50f7c4c2012-04-20 15:49:44 -0500488 u32 allocated_snaps,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700489 gfp_t gfp_flags)
490{
Xi Wang50f7c4c2012-04-20 15:49:44 -0500491 u32 i, snap_count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700492
Alex Elder21079782012-01-24 10:08:36 -0600493 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
Josh Durgin81e759f2011-11-15 14:49:53 -0800494 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800495
Alex Elder00f1f362012-02-07 12:03:36 -0600496 snap_count = le32_to_cpu(ondisk->snap_count);
Xi Wang50f7c4c2012-04-20 15:49:44 -0500497 if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
498 / sizeof (*ondisk))
499 return -EINVAL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700500 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
Yan, Zhengf9f9a192012-06-06 09:15:33 -0500501 snap_count * sizeof(u64),
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700502 gfp_flags);
503 if (!header->snapc)
504 return -ENOMEM;
Alex Elder00f1f362012-02-07 12:03:36 -0600505
Alex Elder00f1f362012-02-07 12:03:36 -0600506 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700507 if (snap_count) {
508 header->snap_names = kmalloc(header->snap_names_len,
Dan Carpenterf8ad4952012-04-20 15:49:44 -0500509 gfp_flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700510 if (!header->snap_names)
511 goto err_snapc;
512 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
Dan Carpenterf8ad4952012-04-20 15:49:44 -0500513 gfp_flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514 if (!header->snap_sizes)
515 goto err_names;
516 } else {
517 header->snap_names = NULL;
518 header->snap_sizes = NULL;
519 }
Alex Elder849b4262012-07-09 21:04:24 -0500520
521 header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
522 gfp_flags);
523 if (!header->object_prefix)
524 goto err_sizes;
525
Alex Elderca1e49a2012-07-10 20:30:09 -0500526 memcpy(header->object_prefix, ondisk->block_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700527 sizeof(ondisk->block_name));
Alex Elder849b4262012-07-09 21:04:24 -0500528 header->object_prefix[sizeof (ondisk->block_name)] = '\0';
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700529
530 header->image_size = le64_to_cpu(ondisk->image_size);
531 header->obj_order = ondisk->options.order;
532 header->crypt_type = ondisk->options.crypt_type;
533 header->comp_type = ondisk->options.comp_type;
534
535 atomic_set(&header->snapc->nref, 1);
536 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
537 header->snapc->num_snaps = snap_count;
538 header->total_snaps = snap_count;
539
Alex Elder21079782012-01-24 10:08:36 -0600540 if (snap_count && allocated_snaps == snap_count) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700541 for (i = 0; i < snap_count; i++) {
542 header->snapc->snaps[i] =
543 le64_to_cpu(ondisk->snaps[i].id);
544 header->snap_sizes[i] =
545 le64_to_cpu(ondisk->snaps[i].image_size);
546 }
547
548 /* copy snapshot names */
549 memcpy(header->snap_names, &ondisk->snaps[i],
550 header->snap_names_len);
551 }
552
553 return 0;
554
Alex Elder849b4262012-07-09 21:04:24 -0500555err_sizes:
556 kfree(header->snap_sizes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557err_names:
558 kfree(header->snap_names);
559err_snapc:
560 kfree(header->snapc);
Alex Elder00f1f362012-02-07 12:03:36 -0600561 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700562}
563
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700564static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
565 u64 *seq, u64 *size)
566{
567 int i;
568 char *p = header->snap_names;
569
Alex Elder00f1f362012-02-07 12:03:36 -0600570 for (i = 0; i < header->total_snaps; i++) {
571 if (!strcmp(snap_name, p)) {
572
573 /* Found it. Pass back its id and/or size */
574
575 if (seq)
576 *seq = header->snapc->snaps[i];
577 if (size)
578 *size = header->snap_sizes[i];
579 return i;
580 }
581 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700582 }
Alex Elder00f1f362012-02-07 12:03:36 -0600583 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700584}
585
Josh Durgincc9d7342011-11-21 18:19:13 -0800586static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587{
588 struct rbd_image_header *header = &dev->header;
589 struct ceph_snap_context *snapc = header->snapc;
590 int ret = -ENOENT;
591
Josh Durgincc9d7342011-11-21 18:19:13 -0800592 BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
593
Josh Durginc6666012011-11-21 17:11:12 -0800594 down_write(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700595
Josh Durgincc9d7342011-11-21 18:19:13 -0800596 if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
597 sizeof (RBD_SNAP_HEAD_NAME))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700598 if (header->total_snaps)
599 snapc->seq = header->snap_seq;
600 else
601 snapc->seq = 0;
Josh Durgin77dfe992011-11-21 13:04:42 -0800602 dev->snap_id = CEPH_NOSNAP;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603 dev->read_only = 0;
604 if (size)
605 *size = header->image_size;
606 } else {
Josh Durgincc9d7342011-11-21 18:19:13 -0800607 ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700608 if (ret < 0)
609 goto done;
Josh Durgin77dfe992011-11-21 13:04:42 -0800610 dev->snap_id = snapc->seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700611 dev->read_only = 1;
612 }
613
614 ret = 0;
615done:
Josh Durginc6666012011-11-21 17:11:12 -0800616 up_write(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700617 return ret;
618}
619
620static void rbd_header_free(struct rbd_image_header *header)
621{
Alex Elder849b4262012-07-09 21:04:24 -0500622 kfree(header->object_prefix);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700623 kfree(header->snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -0500624 kfree(header->snap_names);
625 kfree(header->snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700626}
627
628/*
629 * get the actual striped segment name, offset and length
630 */
631static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500632 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700633 u64 ofs, u64 len,
634 char *seg_name, u64 *segofs)
635{
636 u64 seg = ofs >> header->obj_order;
637
638 if (seg_name)
639 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500640 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700641
642 ofs = ofs & ((1 << header->obj_order) - 1);
643 len = min_t(u64, len, (1 << header->obj_order) - ofs);
644
645 if (segofs)
646 *segofs = ofs;
647
648 return len;
649}
650
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700651static int rbd_get_num_segments(struct rbd_image_header *header,
652 u64 ofs, u64 len)
653{
654 u64 start_seg = ofs >> header->obj_order;
655 u64 end_seg = (ofs + len - 1) >> header->obj_order;
656 return end_seg - start_seg + 1;
657}
658
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700659/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700660 * returns the size of an object in the image
661 */
662static u64 rbd_obj_bytes(struct rbd_image_header *header)
663{
664 return 1 << header->obj_order;
665}
666
667/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668 * bio helpers
669 */
670
671static void bio_chain_put(struct bio *chain)
672{
673 struct bio *tmp;
674
675 while (chain) {
676 tmp = chain;
677 chain = chain->bi_next;
678 bio_put(tmp);
679 }
680}
681
682/*
683 * zeros a bio chain, starting at specific offset
684 */
685static void zero_bio_chain(struct bio *chain, int start_ofs)
686{
687 struct bio_vec *bv;
688 unsigned long flags;
689 void *buf;
690 int i;
691 int pos = 0;
692
693 while (chain) {
694 bio_for_each_segment(bv, chain, i) {
695 if (pos + bv->bv_len > start_ofs) {
696 int remainder = max(start_ofs - pos, 0);
697 buf = bvec_kmap_irq(bv, &flags);
698 memset(buf + remainder, 0,
699 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200700 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700701 }
702 pos += bv->bv_len;
703 }
704
705 chain = chain->bi_next;
706 }
707}
708
709/*
710 * bio_chain_clone - clone a chain of bios up to a certain length.
711 * might return a bio_pair that will need to be released.
712 */
713static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
714 struct bio_pair **bp,
715 int len, gfp_t gfpmask)
716{
717 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
718 int total = 0;
719
720 if (*bp) {
721 bio_pair_release(*bp);
722 *bp = NULL;
723 }
724
725 while (old_chain && (total < len)) {
726 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
727 if (!tmp)
728 goto err_out;
729
730 if (total + old_chain->bi_size > len) {
731 struct bio_pair *bp;
732
733 /*
734 * this split can only happen with a single paged bio,
735 * split_bio will BUG_ON if this is not the case
736 */
737 dout("bio_chain_clone split! total=%d remaining=%d"
738 "bi_size=%d\n",
739 (int)total, (int)len-total,
740 (int)old_chain->bi_size);
741
742 /* split the bio. We'll release it either in the next
743 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600744 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700745 if (!bp)
746 goto err_out;
747
748 __bio_clone(tmp, &bp->bio1);
749
750 *next = &bp->bio2;
751 } else {
752 __bio_clone(tmp, old_chain);
753 *next = old_chain->bi_next;
754 }
755
756 tmp->bi_bdev = NULL;
757 gfpmask &= ~__GFP_WAIT;
758 tmp->bi_next = NULL;
759
760 if (!new_chain) {
761 new_chain = tail = tmp;
762 } else {
763 tail->bi_next = tmp;
764 tail = tmp;
765 }
766 old_chain = old_chain->bi_next;
767
768 total += tmp->bi_size;
769 }
770
771 BUG_ON(total < len);
772
773 if (tail)
774 tail->bi_next = NULL;
775
776 *old = old_chain;
777
778 return new_chain;
779
780err_out:
781 dout("bio_chain_clone with err\n");
782 bio_chain_put(new_chain);
783 return NULL;
784}
785
786/*
787 * helpers for osd request op vectors.
788 */
789static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
790 int num_ops,
791 int opcode,
792 u32 payload_len)
793{
794 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
795 GFP_NOIO);
796 if (!*ops)
797 return -ENOMEM;
798 (*ops)[0].op = opcode;
799 /*
800 * op extent offset and length will be set later on
801 * in calc_raw_layout()
802 */
803 (*ops)[0].payload_len = payload_len;
804 return 0;
805}
806
807static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
808{
809 kfree(ops);
810}
811
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700812static void rbd_coll_end_req_index(struct request *rq,
813 struct rbd_req_coll *coll,
814 int index,
815 int ret, u64 len)
816{
817 struct request_queue *q;
818 int min, max, i;
819
820 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
821 coll, index, ret, len);
822
823 if (!rq)
824 return;
825
826 if (!coll) {
827 blk_end_request(rq, ret, len);
828 return;
829 }
830
831 q = rq->q;
832
833 spin_lock_irq(q->queue_lock);
834 coll->status[index].done = 1;
835 coll->status[index].rc = ret;
836 coll->status[index].bytes = len;
837 max = min = coll->num_done;
838 while (max < coll->total && coll->status[max].done)
839 max++;
840
841 for (i = min; i<max; i++) {
842 __blk_end_request(rq, coll->status[i].rc,
843 coll->status[i].bytes);
844 coll->num_done++;
845 kref_put(&coll->kref, rbd_coll_release);
846 }
847 spin_unlock_irq(q->queue_lock);
848}
849
850static void rbd_coll_end_req(struct rbd_request *req,
851 int ret, u64 len)
852{
853 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
854}
855
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700856/*
857 * Send ceph osd request
858 */
859static int rbd_do_request(struct request *rq,
860 struct rbd_device *dev,
861 struct ceph_snap_context *snapc,
862 u64 snapid,
863 const char *obj, u64 ofs, u64 len,
864 struct bio *bio,
865 struct page **pages,
866 int num_pages,
867 int flags,
868 struct ceph_osd_req_op *ops,
869 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700870 struct rbd_req_coll *coll,
871 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700872 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700873 struct ceph_msg *msg),
874 struct ceph_osd_request **linger_req,
875 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700876{
877 struct ceph_osd_request *req;
878 struct ceph_file_layout *layout;
879 int ret;
880 u64 bno;
881 struct timespec mtime = CURRENT_TIME;
882 struct rbd_request *req_data;
883 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600884 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700885
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700886 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700887 if (!req_data) {
888 if (coll)
889 rbd_coll_end_req_index(rq, coll, coll_index,
890 -ENOMEM, len);
891 return -ENOMEM;
892 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700893
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700894 if (coll) {
895 req_data->coll = coll;
896 req_data->coll_index = coll_index;
897 }
898
899 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700900
Josh Durginc6666012011-11-21 17:11:12 -0800901 down_read(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700902
Alex Elder1dbb4392012-01-24 10:08:37 -0600903 osdc = &dev->rbd_client->client->osdc;
904 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
905 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700906 if (!req) {
Josh Durginc6666012011-11-21 17:11:12 -0800907 up_read(&dev->header_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700908 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700909 goto done_pages;
910 }
911
912 req->r_callback = rbd_cb;
913
914 req_data->rq = rq;
915 req_data->bio = bio;
916 req_data->pages = pages;
917 req_data->len = len;
918
919 req->r_priv = req_data;
920
921 reqhead = req->r_request->front.iov_base;
922 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
923
924 strncpy(req->r_oid, obj, sizeof(req->r_oid));
925 req->r_oid_len = strlen(req->r_oid);
926
927 layout = &req->r_file_layout;
928 memset(layout, 0, sizeof(*layout));
929 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
930 layout->fl_stripe_count = cpu_to_le32(1);
931 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder9bb2f332012-07-12 10:46:35 -0500932 layout->fl_pg_pool = cpu_to_le32(dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600933 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
934 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700935
936 ceph_osdc_build_request(req, ofs, &len,
937 ops,
938 snapc,
939 &mtime,
940 req->r_oid, req->r_oid_len);
Josh Durginc6666012011-11-21 17:11:12 -0800941 up_read(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700942
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700943 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600944 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700945 *linger_req = req;
946 }
947
Alex Elder1dbb4392012-01-24 10:08:37 -0600948 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700949 if (ret < 0)
950 goto done_err;
951
952 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600953 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700954 if (ver)
955 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700956 dout("reassert_ver=%lld\n",
957 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958 ceph_osdc_put_request(req);
959 }
960 return ret;
961
962done_err:
963 bio_chain_put(req_data->bio);
964 ceph_osdc_put_request(req);
965done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700966 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 return ret;
969}
970
/*
 * Ceph osd op callback
 *
 * Completion handler for async OSD requests submitted via
 * rbd_do_request().  Decodes the reply header, converts read errors
 * and short reads into zero-filled data for the block layer, reports
 * completion through the request collection, then drops the request
 * and per-request bookkeeping allocated at submission time.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* op array immediately follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	/*
	 * A read of a nonexistent object is not an error at the block
	 * level: hand back zeroes.  Likewise zero-fill the tail of a
	 * short read and report the full requested length.
	 */
	if (rc == -ENOENT && read_op) {
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1009
/*
 * Minimal osd request callback: just drop the request reference.
 * Used for fire-and-forget requests (e.g. notify acks issued by
 * rbd_req_sync_notify_ack()) whose reply content is ignored.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1014
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector covering [ofs, ofs+len), optionally builds a
 * single read/write op (when @orig_ops is NULL), copies @buf into the
 * pages for writes, submits via rbd_do_request() with no callback
 * (which makes rbd_do_request() wait for completion), and copies read
 * data back into @buf.
 *
 * @orig_ops: caller-supplied op array; if NULL a single op of @opcode
 *            is created and destroyed here.
 * @num_reply: currently unused by this function (rbd_do_request() is
 *             always invoked with a literal 2).
 * @linger_req/@ver: passed through to rbd_do_request().
 *
 * Returns a negative errno on failure; on a successful read, the
 * return value of rbd_do_request() (byte count) is used as the copy
 * length into @buf.
 */
static int rbd_req_sync_op(struct rbd_device *dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   int num_reply,
			   const char *obj,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* build a single-op read or write on the caller's behalf */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* NULL rbd_cb makes rbd_do_request() synchronous (it waits) */
	ret = rbd_do_request(NULL, dev, snapc, snapid,
			  obj, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  2,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done_ops;

	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	/* only destroy ops we created here, never the caller's */
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1077
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the device-relative range [ofs, ofs+len) to a single backing
 * object segment (name + offset via rbd_get_segment()), builds one op
 * of @opcode, and submits it with rbd_req_cb() as the completion
 * callback.  Completion is reported through @coll at @coll_index.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev ,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags, int num_reply,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* writes carry a data payload; reads have none */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     num_reply,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1133
1134/*
1135 * Request async osd write
1136 */
1137static int rbd_req_write(struct request *rq,
1138 struct rbd_device *rbd_dev,
1139 struct ceph_snap_context *snapc,
1140 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001141 struct bio *bio,
1142 struct rbd_req_coll *coll,
1143 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001144{
1145 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1146 CEPH_OSD_OP_WRITE,
1147 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1148 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001149 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001150}
1151
1152/*
1153 * Request async osd read
1154 */
1155static int rbd_req_read(struct request *rq,
1156 struct rbd_device *rbd_dev,
1157 u64 snapid,
1158 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001159 struct bio *bio,
1160 struct rbd_req_coll *coll,
1161 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001162{
1163 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001164 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001165 CEPH_OSD_OP_READ,
1166 CEPH_OSD_FLAG_READ,
1167 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001168 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001169}
1170
1171/*
1172 * Request sync osd read
1173 */
1174static int rbd_req_sync_read(struct rbd_device *dev,
1175 struct ceph_snap_context *snapc,
1176 u64 snapid,
1177 const char *obj,
1178 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001179 char *buf,
1180 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001181{
1182 return rbd_req_sync_op(dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001183 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001184 CEPH_OSD_OP_READ,
1185 CEPH_OSD_FLAG_READ,
1186 NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001187 1, obj, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001188}
1189
/*
 * Acknowledge a received notification on @obj.
 *
 * (The old comment said "Request sync osd watch", but this builds a
 * CEPH_OSD_OP_NOTIFY_ACK, not a watch.)  The ack is submitted
 * asynchronously; its reply is dropped by rbd_simple_req_cb().
 */
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
				   u64 ver,
				   u64 notify_id,
				   const char *obj)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
	/* NOTE(review): no cpu_to_le64() on the cookie, unlike .ver above
	 * and rbd_req_sync_watch() — confirm this asymmetry is intended */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
			  obj, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  1,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1221
/*
 * Watch event callback for the rbd header object.
 *
 * Invoked when the header changes (e.g. a snapshot was created):
 * re-reads the on-disk header under ctl_mutex, then acknowledges the
 * notification so the OSD stops waiting for this client.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *dev = (struct rbd_device *)data;
	int rc;

	if (!dev)
		return;

	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
		notify_id, (int)opcode);
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = __rbd_refresh_header(dev);
	mutex_unlock(&ctl_mutex);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", dev->major, rc);

	/* ack even if the refresh failed, so the notifier isn't stalled */
	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
}
1241
/*
 * Request sync osd watch
 *
 * Registers a watch on @obj: creates an osd event dispatching to
 * rbd_watch_cb(), then issues a CEPH_OSD_OP_WATCH (flag=1) that is
 * kept lingering via dev->watch_request so the watch survives OSD
 * reconnects.  On failure the event is torn down again.
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
			      const char *obj,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)dev, &dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = establish the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL,
			      &dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1286
/*
 * Request sync osd unwatch
 *
 * Tears down the watch established by rbd_req_sync_watch(): issues a
 * CEPH_OSD_OP_WATCH with flag=0 (unwatch) using the original event
 * cookie, then cancels the local event.  The event is cancelled even
 * if the unwatch op failed, since we are tearing down regardless.
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
				const char *obj)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = remove the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);

	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
	return ret;
}
1315
/* Private data registered with the osd event by rbd_req_sync_notify();
 * delivered back to the notify completion callback. */
struct rbd_notify_info {
	struct rbd_device *dev;
};
1319
1320static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1321{
1322 struct rbd_device *dev = (struct rbd_device *)data;
1323 if (!dev)
1324 return;
1325
1326 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1327 notify_id, (int)opcode);
1328}
1329
/*
 * Request sync osd notify
 *
 * Sends a CEPH_OSD_OP_NOTIFY on @obj and blocks until all watchers
 * acknowledge (or the 12-second timeout, via ceph_osdc_wait_event,
 * expires).  The event's private data is the on-stack info struct,
 * which is safe because this function does not return until the
 * event has fired or been cancelled.
 *
 * NOTE(review): on the success path the event is not cancelled here —
 * presumably the one-shot flag (third argument to
 * ceph_osdc_create_event() is 1) releases it after delivery; confirm
 * against the osd_client event implementation.
 */
static int rbd_req_sync_notify(struct rbd_device *dev,
			       const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	int payload_len = sizeof(u32) + sizeof(u32);	/* ver + timeout */
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.dev = dev;

	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;	/* seconds the OSD waits for acks */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1380
/*
 * Synchronously execute an object class method (CEPH_OSD_OP_CALL).
 *
 * (The old comment said "Request sync osd read", but this issues a
 * class method call, e.g. "rbd"/"snap_add".)  @data/@len form the
 * method's input payload; the resulting object version is returned
 * through @ver.
 */
static int rbd_req_sync_exec(struct rbd_device *dev,
			     const char *obj,
			     const char *cls,
			     const char *method,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int cls_len = strlen(cls);
	int method_len = strlen(method);
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    cls_len + method_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = cls;
	ops[0].cls.class_len = (__u8)cls_len;
	ops[0].cls.method_name = method;
	ops[0].cls.method_len = (__u8)method_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1420
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001421static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1422{
1423 struct rbd_req_coll *coll =
1424 kzalloc(sizeof(struct rbd_req_coll) +
1425 sizeof(struct rbd_req_status) * num_reqs,
1426 GFP_ATOMIC);
1427
1428 if (!coll)
1429 return NULL;
1430 coll->total = num_reqs;
1431 kref_init(&coll->kref);
1432 return coll;
1433}
1434
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001435/*
1436 * block device queue callback
1437 */
1438static void rbd_rq_fn(struct request_queue *q)
1439{
1440 struct rbd_device *rbd_dev = q->queuedata;
1441 struct request *rq;
1442 struct bio_pair *bp = NULL;
1443
Alex Elder00f1f362012-02-07 12:03:36 -06001444 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001445 struct bio *bio;
1446 struct bio *rq_bio, *next_bio = NULL;
1447 bool do_write;
1448 int size, op_size = 0;
1449 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001450 int num_segs, cur_seg = 0;
1451 struct rbd_req_coll *coll;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001452
1453 /* peek at request from block layer */
1454 if (!rq)
1455 break;
1456
1457 dout("fetched request\n");
1458
1459 /* filter out block requests we don't understand */
1460 if ((rq->cmd_type != REQ_TYPE_FS)) {
1461 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001462 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001463 }
1464
1465 /* deduce our operation (read, write) */
1466 do_write = (rq_data_dir(rq) == WRITE);
1467
1468 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001469 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001470 rq_bio = rq->bio;
1471 if (do_write && rbd_dev->read_only) {
1472 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001473 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001474 }
1475
1476 spin_unlock_irq(q->queue_lock);
1477
1478 dout("%s 0x%x bytes at 0x%llx\n",
1479 do_write ? "write" : "read",
Alex Elder593a9e72012-02-07 12:03:37 -06001480 size, blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001481
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001482 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1483 coll = rbd_alloc_coll(num_segs);
1484 if (!coll) {
1485 spin_lock_irq(q->queue_lock);
1486 __blk_end_request_all(rq, -ENOMEM);
Alex Elder00f1f362012-02-07 12:03:36 -06001487 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001488 }
1489
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001490 do {
1491 /* a bio clone to be passed down to OSD req */
1492 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1493 op_size = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001494 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001495 ofs, size,
1496 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001497 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001498 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1499 op_size, GFP_ATOMIC);
1500 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001501 rbd_coll_end_req_index(rq, coll, cur_seg,
1502 -ENOMEM, op_size);
1503 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001504 }
1505
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001506
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001507 /* init OSD command: write or read */
1508 if (do_write)
1509 rbd_req_write(rq, rbd_dev,
1510 rbd_dev->header.snapc,
1511 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001512 op_size, bio,
1513 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001514 else
1515 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001516 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001517 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001518 op_size, bio,
1519 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001521next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001522 size -= op_size;
1523 ofs += op_size;
1524
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001525 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001526 rq_bio = next_bio;
1527 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001528 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529
1530 if (bp)
1531 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001532 spin_lock_irq(q->queue_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001533 }
1534}
1535
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns the number of bytes of @bvec that may be merged into the bio
 * described by @bmd without crossing an object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* sectors per backing object */
	sector_t sector;
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes remaining in the object after the bio's current end */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* NOTE(review): for an empty bio this returns bv_len even when
	 * max < bv_len, i.e. a single-page bio may span objects — this
	 * is the "exception" mentioned above, handled by bio_chain_clone */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1562
/*
 * Release the gendisk, request queue and in-memory header of an rbd
 * device.  Safe to call when no disk was ever allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	/* only unregister if add_disk() made it visible */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1578
/*
 * reload the ondisk the header
 *
 * Reads the image header object into @header.  Because the header's
 * size depends on its snapshot count, the read is retried with a
 * larger buffer until the count observed before and after a read
 * agree (guarding against a snapshot being created mid-read).
 * On success the caller owns the allocations inside @header
 * (released via rbd_header_free()).
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->obj_md_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s", rbd_dev->obj);
			goto out_dh;
		}

		/* stable snapshot count: the header is complete */
		if (snap_count == header->total_snaps)
			break;

		/* snapshot count changed under us — grow and retry */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1635
1636/*
1637 * create a snapshot
1638 */
1639static int rbd_header_add_snap(struct rbd_device *dev,
1640 const char *snap_name,
1641 gfp_t gfp_flags)
1642{
1643 int name_len = strlen(snap_name);
1644 u64 new_snapid;
1645 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001646 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001647 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001648 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001649
1650 /* we should create a snapshot only if we're pointing at the head */
Josh Durgin77dfe992011-11-21 13:04:42 -08001651 if (dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001652 return -EINVAL;
1653
Alex Elder1dbb4392012-01-24 10:08:37 -06001654 monc = &dev->rbd_client->client->monc;
Alex Elder9bb2f332012-07-12 10:46:35 -05001655 ret = ceph_monc_create_snapid(monc, dev->pool_id, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001656 dout("created snapid=%lld\n", new_snapid);
1657 if (ret < 0)
1658 return ret;
1659
1660 data = kmalloc(name_len + 16, gfp_flags);
1661 if (!data)
1662 return -ENOMEM;
1663
Sage Weil916d4d62011-05-12 16:10:50 -07001664 p = data;
1665 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001666
Sage Weil916d4d62011-05-12 16:10:50 -07001667 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1668 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001669
1670 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001671 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001672
Sage Weil916d4d62011-05-12 16:10:50 -07001673 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001674
1675 if (ret < 0)
1676 return ret;
1677
Josh Durgin403f24d2011-12-05 10:47:13 -08001678 down_write(&dev->header_rwsem);
1679 dev->header.snapc->seq = new_snapid;
1680 up_write(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001681
1682 return 0;
1683bad:
1684 return -ERANGE;
1685}
1686
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001687static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1688{
1689 struct rbd_snap *snap;
1690
1691 while (!list_empty(&rbd_dev->snaps)) {
1692 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1693 __rbd_remove_snap_dev(rbd_dev, snap);
1694 }
1695}
1696
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001697/*
1698 * only read the first part of the ondisk header, without the snaps info
1699 */
Josh Durgin263c6ca2011-12-05 10:43:42 -08001700static int __rbd_refresh_header(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001701{
1702 int ret;
1703 struct rbd_image_header h;
1704 u64 snap_seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001705 int follow_seq = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001706
1707 ret = rbd_read_header(rbd_dev, &h);
1708 if (ret < 0)
1709 return ret;
1710
Sage Weil9db4b3e2011-04-19 22:49:06 -07001711 /* resized? */
Alex Elder593a9e72012-02-07 12:03:37 -06001712 set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001713
Josh Durginc6666012011-11-21 17:11:12 -08001714 down_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001715
1716 snap_seq = rbd_dev->header.snapc->seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001717 if (rbd_dev->header.total_snaps &&
1718 rbd_dev->header.snapc->snaps[0] == snap_seq)
1719 /* pointing at the head, will need to follow that
1720 if head moves */
1721 follow_seq = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001722
Alex Elder849b4262012-07-09 21:04:24 -05001723 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001724 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001725 kfree(rbd_dev->header.snap_names);
1726 kfree(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001727
1728 rbd_dev->header.total_snaps = h.total_snaps;
1729 rbd_dev->header.snapc = h.snapc;
1730 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001731 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001732 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001733 /* Free the extra copy of the object prefix */
1734 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1735 kfree(h.object_prefix);
1736
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001737 if (follow_seq)
1738 rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1739 else
1740 rbd_dev->header.snapc->seq = snap_seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001741
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001742 ret = __rbd_init_snaps_header(rbd_dev);
1743
Josh Durginc6666012011-11-21 17:11:12 -08001744 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001745
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001746 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001747}
1748
/*
 * Create and register the gendisk / request queue for a freshly
 * mapped rbd device: reads the image header, initializes snapshot
 * state, sizes the queue limits to the backing object size and
 * announces the disk.
 *
 * NOTE(review): on the early failure returns after rbd_read_header()
 * succeeds, the header allocations appear to remain owned by
 * rbd_dev->header — presumably released by the caller's error path
 * via rbd_free_disk()/rbd_header_free(); confirm against callers.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1821
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001822/*
1823 sysfs
1824*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001825
Alex Elder593a9e72012-02-07 12:03:37 -06001826static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1827{
1828 return container_of(dev, struct rbd_device, dev);
1829}
1830
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001831static ssize_t rbd_size_show(struct device *dev,
1832 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001833{
Alex Elder593a9e72012-02-07 12:03:37 -06001834 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001835
1836 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001837}
1838
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001839static ssize_t rbd_major_show(struct device *dev,
1840 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001841{
Alex Elder593a9e72012-02-07 12:03:37 -06001842 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001843
1844 return sprintf(buf, "%d\n", rbd_dev->major);
1845}
1846
1847static ssize_t rbd_client_id_show(struct device *dev,
1848 struct device_attribute *attr, char *buf)
1849{
Alex Elder593a9e72012-02-07 12:03:37 -06001850 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001851
Alex Elder1dbb4392012-01-24 10:08:37 -06001852 return sprintf(buf, "client%lld\n",
1853 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001854}
1855
1856static ssize_t rbd_pool_show(struct device *dev,
1857 struct device_attribute *attr, char *buf)
1858{
Alex Elder593a9e72012-02-07 12:03:37 -06001859 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001860
1861 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1862}
1863
Alex Elder9bb2f332012-07-12 10:46:35 -05001864static ssize_t rbd_pool_id_show(struct device *dev,
1865 struct device_attribute *attr, char *buf)
1866{
1867 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1868
1869 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1870}
1871
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001872static ssize_t rbd_name_show(struct device *dev,
1873 struct device_attribute *attr, char *buf)
1874{
Alex Elder593a9e72012-02-07 12:03:37 -06001875 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001876
1877 return sprintf(buf, "%s\n", rbd_dev->obj);
1878}
1879
1880static ssize_t rbd_snap_show(struct device *dev,
1881 struct device_attribute *attr,
1882 char *buf)
1883{
Alex Elder593a9e72012-02-07 12:03:37 -06001884 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001885
1886 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1887}
1888
1889static ssize_t rbd_image_refresh(struct device *dev,
1890 struct device_attribute *attr,
1891 const char *buf,
1892 size_t size)
1893{
Alex Elder593a9e72012-02-07 12:03:37 -06001894 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001895 int rc;
1896 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001897
1898 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1899
Josh Durgin263c6ca2011-12-05 10:43:42 -08001900 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001901 if (rc < 0)
1902 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001903
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001904 mutex_unlock(&ctl_mutex);
1905 return ret;
1906}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001907
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001908static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1909static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1910static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1911static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05001912static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001913static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1914static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1915static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1916static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917
1918static struct attribute *rbd_attrs[] = {
1919 &dev_attr_size.attr,
1920 &dev_attr_major.attr,
1921 &dev_attr_client_id.attr,
1922 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05001923 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001924 &dev_attr_name.attr,
1925 &dev_attr_current_snap.attr,
1926 &dev_attr_refresh.attr,
1927 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001928 NULL
1929};
1930
1931static struct attribute_group rbd_attr_group = {
1932 .attrs = rbd_attrs,
1933};
1934
1935static const struct attribute_group *rbd_attr_groups[] = {
1936 &rbd_attr_group,
1937 NULL
1938};
1939
1940static void rbd_sysfs_dev_release(struct device *dev)
1941{
1942}
1943
1944static struct device_type rbd_device_type = {
1945 .name = "rbd",
1946 .groups = rbd_attr_groups,
1947 .release = rbd_sysfs_dev_release,
1948};
1949
1950
1951/*
1952 sysfs - snapshots
1953*/
1954
1955static ssize_t rbd_snap_size_show(struct device *dev,
1956 struct device_attribute *attr,
1957 char *buf)
1958{
1959 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1960
Josh Durgin3591538f2011-12-05 18:25:13 -08001961 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001962}
1963
1964static ssize_t rbd_snap_id_show(struct device *dev,
1965 struct device_attribute *attr,
1966 char *buf)
1967{
1968 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1969
Josh Durgin3591538f2011-12-05 18:25:13 -08001970 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001971}
1972
/* Per-snapshot sysfs attributes (both read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Release callback for snapshot devices: frees the rbd_snap (and its
 * separately-allocated name) once the last reference is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2002
/*
 * Unlink a snapshot from the device's snap list and unregister its
 * sysfs device.  The rbd_snap itself is freed later by
 * rbd_snap_dev_release() when the device reference count drops.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2009
2010static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2011 struct rbd_snap *snap,
2012 struct device *parent)
2013{
2014 struct device *dev = &snap->dev;
2015 int ret;
2016
2017 dev->type = &rbd_snap_device_type;
2018 dev->parent = parent;
2019 dev->release = rbd_snap_dev_release;
2020 dev_set_name(dev, "snap_%s", snap->name);
2021 ret = device_register(dev);
2022
2023 return ret;
2024}
2025
2026static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2027 int i, const char *name,
2028 struct rbd_snap **snapp)
2029{
2030 int ret;
2031 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2032 if (!snap)
2033 return -ENOMEM;
2034 snap->name = kstrdup(name, GFP_KERNEL);
2035 snap->size = rbd_dev->header.snap_sizes[i];
2036 snap->id = rbd_dev->header.snapc->snaps[i];
2037 if (device_is_registered(&rbd_dev->dev)) {
2038 ret = rbd_register_snap_dev(rbd_dev, snap,
2039 &rbd_dev->dev);
2040 if (ret < 0)
2041 goto err;
2042 }
2043 *snapp = snap;
2044 return 0;
2045err:
2046 kfree(snap->name);
2047 kfree(snap);
2048 return ret;
2049}
2050
2051/*
2052 * search for the previous snap in a null delimited string list
2053 */
2054const char *rbd_prev_snap_name(const char *name, const char *start)
2055{
2056 if (name < start + 2)
2057 return NULL;
2058
2059 name -= 2;
2060 while (*name) {
2061 if (name == start)
2062 return start;
2063 name--;
2064 }
2065 return name + 1;
2066}
2067
2068/*
2069 * compare the old list of snapshots that we have to what's in the header
2070 * and update it accordingly. Note that the header holds the snapshots
2071 * in a reverse order (from newest to oldest) and we need to go from
2072 * older to new so that we don't get a duplicate snap name when
2073 * doing the process (e.g., removed snapshot and recreated a new
2074 * one with the same name.
2075 */
2076static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2077{
2078 const char *name, *first_name;
2079 int i = rbd_dev->header.total_snaps;
2080 struct rbd_snap *snap, *old_snap = NULL;
2081 int ret;
2082 struct list_head *p, *n;
2083
2084 first_name = rbd_dev->header.snap_names;
2085 name = first_name + rbd_dev->header.snap_names_len;
2086
2087 list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2088 u64 cur_id;
2089
2090 old_snap = list_entry(p, struct rbd_snap, node);
2091
2092 if (i)
2093 cur_id = rbd_dev->header.snapc->snaps[i - 1];
2094
2095 if (!i || old_snap->id < cur_id) {
2096 /* old_snap->id was skipped, thus was removed */
2097 __rbd_remove_snap_dev(rbd_dev, old_snap);
2098 continue;
2099 }
2100 if (old_snap->id == cur_id) {
2101 /* we have this snapshot already */
2102 i--;
2103 name = rbd_prev_snap_name(name, first_name);
2104 continue;
2105 }
2106 for (; i > 0;
2107 i--, name = rbd_prev_snap_name(name, first_name)) {
2108 if (!name) {
2109 WARN_ON(1);
2110 return -EINVAL;
2111 }
2112 cur_id = rbd_dev->header.snapc->snaps[i];
2113 /* snapshot removal? handle it above */
2114 if (cur_id >= old_snap->id)
2115 break;
2116 /* a new snapshot */
2117 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2118 if (ret < 0)
2119 return ret;
2120
2121 /* note that we add it backward so using n and not p */
2122 list_add(&snap->node, n);
2123 p = &snap->node;
2124 }
2125 }
2126 /* we're done going over the old snap list, just add what's left */
2127 for (; i > 0; i--) {
2128 name = rbd_prev_snap_name(name, first_name);
2129 if (!name) {
2130 WARN_ON(1);
2131 return -EINVAL;
2132 }
2133 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2134 if (ret < 0)
2135 return ret;
2136 list_add(&snap->node, &rbd_dev->snaps);
2137 }
2138
2139 return 0;
2140}
2141
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002142static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2143{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002144 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002145 struct device *dev;
2146 struct rbd_snap *snap;
2147
2148 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2149 dev = &rbd_dev->dev;
2150
2151 dev->bus = &rbd_bus_type;
2152 dev->type = &rbd_device_type;
2153 dev->parent = &rbd_root_dev;
2154 dev->release = rbd_dev_release;
2155 dev_set_name(dev, "%d", rbd_dev->id);
2156 ret = device_register(dev);
2157 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002158 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002159
2160 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2161 ret = rbd_register_snap_dev(rbd_dev, snap,
2162 &rbd_dev->dev);
2163 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002164 break;
2165 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002166out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002167 mutex_unlock(&ctl_mutex);
2168 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002169}
2170
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002171static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2172{
2173 device_unregister(&rbd_dev->dev);
2174}
2175
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002176static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2177{
2178 int ret, rc;
2179
2180 do {
2181 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2182 rbd_dev->header.obj_version);
2183 if (ret == -ERANGE) {
2184 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Josh Durgin263c6ca2011-12-05 10:43:42 -08002185 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002186 mutex_unlock(&ctl_mutex);
2187 if (rc < 0)
2188 return rc;
2189 }
2190 } while (ret == -ERANGE);
2191
2192 return ret;
2193}
2194
Alex Elder1ddbe942012-01-29 13:57:44 -06002195static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2196
2197/*
Alex Elder499afd52012-02-02 08:13:29 -06002198 * Get a unique rbd identifier for the given new rbd_dev, and add
2199 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002200 */
Alex Elder499afd52012-02-02 08:13:29 -06002201static void rbd_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002202{
Alex Elder499afd52012-02-02 08:13:29 -06002203 rbd_dev->id = atomic64_inc_return(&rbd_id_max);
2204
2205 spin_lock(&rbd_dev_list_lock);
2206 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2207 spin_unlock(&rbd_dev_list_lock);
Alex Elder1ddbe942012-01-29 13:57:44 -06002208}
Alex Elderb7f23c32012-01-29 13:57:43 -06002209
Alex Elder1ddbe942012-01-29 13:57:44 -06002210/*
Alex Elder499afd52012-02-02 08:13:29 -06002211 * Remove an rbd_dev from the global list, and record that its
2212 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002213 */
Alex Elder499afd52012-02-02 08:13:29 -06002214static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002215{
Alex Elderd184f6b2012-01-29 13:57:44 -06002216 struct list_head *tmp;
2217 int rbd_id = rbd_dev->id;
2218 int max_id;
2219
2220 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002221
2222 spin_lock(&rbd_dev_list_lock);
2223 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002224
2225 /*
2226 * If the id being "put" is not the current maximum, there
2227 * is nothing special we need to do.
2228 */
2229 if (rbd_id != atomic64_read(&rbd_id_max)) {
2230 spin_unlock(&rbd_dev_list_lock);
2231 return;
2232 }
2233
2234 /*
2235 * We need to update the current maximum id. Search the
2236 * list to find out what it is. We're more likely to find
2237 * the maximum at the end, so search the list backward.
2238 */
2239 max_id = 0;
2240 list_for_each_prev(tmp, &rbd_dev_list) {
2241 struct rbd_device *rbd_dev;
2242
2243 rbd_dev = list_entry(tmp, struct rbd_device, node);
2244 if (rbd_id > max_id)
2245 max_id = rbd_id;
2246 }
Alex Elder499afd52012-02-02 08:13:29 -06002247 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002248
Alex Elder1ddbe942012-01-29 13:57:44 -06002249 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002250 * The max id could have been updated by rbd_id_get(), in
2251 * which case it now accurately reflects the new maximum.
2252 * Be careful not to overwrite the maximum value in that
2253 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002254 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002255 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002256}
2257
Alex Eldera725f65e2012-02-02 08:13:30 -06002258/*
Alex Eldere28fff262012-02-02 08:13:30 -06002259 * Skips over white space at *buf, and updates *buf to point to the
2260 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002261 * the token (string of non-white space characters) found. Note
2262 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002263 */
2264static inline size_t next_token(const char **buf)
2265{
2266 /*
2267 * These are the characters that produce nonzero for
2268 * isspace() in the "C" and "POSIX" locales.
2269 */
2270 const char *spaces = " \f\n\r\t\v";
2271
2272 *buf += strspn(*buf, spaces); /* Find start of token */
2273
2274 return strcspn(*buf, spaces); /* Return token length */
2275}
2276
2277/*
2278 * Finds the next token in *buf, and if the provided token buffer is
2279 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002280 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2281 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002282 *
2283 * Returns the length of the token found (not including the '\0').
2284 * Return value will be 0 if no token is found, and it will be >=
2285 * token_size if the token would not fit.
2286 *
Alex Elder593a9e72012-02-07 12:03:37 -06002287 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002288 * found token. Note that this occurs even if the token buffer is
2289 * too small to hold it.
2290 */
2291static inline size_t copy_token(const char **buf,
2292 char *token,
2293 size_t token_size)
2294{
2295 size_t len;
2296
2297 len = next_token(buf);
2298 if (len < token_size) {
2299 memcpy(token, *buf, len);
2300 *(token + len) = '\0';
2301 }
2302 *buf += len;
2303
2304 return len;
2305}
2306
2307/*
Alex Elderea3352f2012-07-09 21:04:23 -05002308 * Finds the next token in *buf, dynamically allocates a buffer big
2309 * enough to hold a copy of it, and copies the token into the new
2310 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2311 * that a duplicate buffer is created even for a zero-length token.
2312 *
2313 * Returns a pointer to the newly-allocated duplicate, or a null
2314 * pointer if memory for the duplicate was not available. If
2315 * the lenp argument is a non-null pointer, the length of the token
2316 * (not including the '\0') is returned in *lenp.
2317 *
2318 * If successful, the *buf pointer will be updated to point beyond
2319 * the end of the found token.
2320 *
2321 * Note: uses GFP_KERNEL for allocation.
2322 */
2323static inline char *dup_token(const char **buf, size_t *lenp)
2324{
2325 char *dup;
2326 size_t len;
2327
2328 len = next_token(buf);
2329 dup = kmalloc(len + 1, GFP_KERNEL);
2330 if (!dup)
2331 return NULL;
2332
2333 memcpy(dup, *buf, len);
2334 *(dup + len) = '\0';
2335 *buf += len;
2336
2337 if (lenp)
2338 *lenp = len;
2339
2340 return dup;
2341}
2342
2343/*
Alex Eldera725f65e2012-02-02 08:13:30 -06002344 * This fills in the pool_name, obj, obj_len, snap_name, obj_len,
2345 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2346 * on the list of monitor addresses and other options provided via
2347 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002348 *
2349 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002350 */
2351static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2352 const char *buf,
Alex Elder7ef32142012-02-02 08:13:30 -06002353 const char **mon_addrs,
Alex Elder5214ecc2012-02-02 08:13:30 -06002354 size_t *mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002355 char *options,
2356 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002357{
Alex Elderd22f76e2012-07-12 10:46:35 -05002358 size_t len;
2359 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06002360
2361 /* The first four tokens are required */
2362
Alex Elder7ef32142012-02-02 08:13:30 -06002363 len = next_token(&buf);
2364 if (!len)
Alex Eldera725f65e2012-02-02 08:13:30 -06002365 return -EINVAL;
Alex Elder5214ecc2012-02-02 08:13:30 -06002366 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002367 *mon_addrs = buf;
2368
2369 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002370
Alex Eldere28fff262012-02-02 08:13:30 -06002371 len = copy_token(&buf, options, options_size);
2372 if (!len || len >= options_size)
2373 return -EINVAL;
Alex Eldera725f65e2012-02-02 08:13:30 -06002374
Alex Elderd22f76e2012-07-12 10:46:35 -05002375 rbd_dev->pool_name = dup_token(&buf, NULL);
2376 if (!rbd_dev->pool_name)
2377 return -ENOMEM;
Alex Eldere28fff262012-02-02 08:13:30 -06002378
2379 len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
Alex Elder849b4262012-07-09 21:04:24 -05002380 if (!len || len >= sizeof (rbd_dev->obj)) {
2381 ret = -EINVAL;
Alex Elderd22f76e2012-07-12 10:46:35 -05002382 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05002383 }
Alex Eldere28fff262012-02-02 08:13:30 -06002384
2385 /* We have the object length in hand, save it. */
2386
2387 rbd_dev->obj_len = len;
2388
Alex Elder81a89792012-02-02 08:13:30 -06002389 BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN
2390 < RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX));
2391 sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002392
Alex Eldere28fff262012-02-02 08:13:30 -06002393 /*
2394 * The snapshot name is optional, but it's an error if it's
2395 * too long. If no snapshot is supplied, fill in the default.
2396 */
2397 len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name));
2398 if (!len)
2399 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2400 sizeof (RBD_SNAP_HEAD_NAME));
Alex Elder849b4262012-07-09 21:04:24 -05002401 else if (len >= sizeof (rbd_dev->snap_name)) {
2402 ret = -EINVAL;
Alex Elderd22f76e2012-07-12 10:46:35 -05002403 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05002404 }
Alex Eldere28fff262012-02-02 08:13:30 -06002405
Alex Eldera725f65e2012-02-02 08:13:30 -06002406 return 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002407
2408out_err:
2409 kfree(rbd_dev->pool_name);
2410 rbd_dev->pool_name = NULL;
2411
2412 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06002413}
2414
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002415static ssize_t rbd_add(struct bus_type *bus,
2416 const char *buf,
2417 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002418{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002419 struct rbd_device *rbd_dev;
Alex Elder7ef32142012-02-02 08:13:30 -06002420 const char *mon_addrs = NULL;
2421 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002422 char *options = NULL;
2423 struct ceph_osd_client *osdc;
2424 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002425
2426 if (!try_module_get(THIS_MODULE))
2427 return -ENODEV;
2428
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002429 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2430 if (!rbd_dev)
Alex Elder27cc2592012-02-02 08:13:30 -06002431 goto err_nomem;
Alex Elder27cc2592012-02-02 08:13:30 -06002432 options = kmalloc(count, GFP_KERNEL);
2433 if (!options)
2434 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002435
2436 /* static rbd_device initialization */
2437 spin_lock_init(&rbd_dev->lock);
2438 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002439 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002440 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002441
Josh Durginc6666012011-11-21 17:11:12 -08002442 init_rwsem(&rbd_dev->header_rwsem);
Alex Elder0e805a12012-01-11 19:42:15 -08002443
Alex Elderd184f6b2012-01-29 13:57:44 -06002444 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002445 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002446
Alex Eldera725f65e2012-02-02 08:13:30 -06002447 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002448 BUILD_BUG_ON(DEV_NAME_LEN
2449 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2450 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
Alex Eldere124a82f2012-01-29 13:57:44 -06002451
Alex Eldera725f65e2012-02-02 08:13:30 -06002452 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002453 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002454 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002455 if (rc)
2456 goto err_put_id;
2457
Alex Elder5214ecc2012-02-02 08:13:30 -06002458 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2459 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002460 if (IS_ERR(rbd_dev->rbd_client)) {
2461 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002462 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002463 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002464
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002465 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002466 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002467 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2468 if (rc < 0)
2469 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002470 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002471
2472 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002473 rc = register_blkdev(0, rbd_dev->name);
2474 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002475 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002476 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002477
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002478 rc = rbd_bus_add_dev(rbd_dev);
2479 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002480 goto err_out_blkdev;
2481
Alex Elder32eec682012-02-08 16:11:14 -06002482 /*
2483 * At this point cleanup in the event of an error is the job
2484 * of the sysfs code (initiated by rbd_bus_del_dev()).
2485 *
2486 * Set up and announce blkdev mapping.
2487 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002488 rc = rbd_init_disk(rbd_dev);
2489 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002490 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002491
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002492 rc = rbd_init_watch_dev(rbd_dev);
2493 if (rc)
2494 goto err_out_bus;
2495
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002496 return count;
2497
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002498err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002499 /* this will also clean up rest of rbd_dev stuff */
2500
2501 rbd_bus_del_dev(rbd_dev);
2502 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002503 return rc;
2504
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002505err_out_blkdev:
2506 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2507err_out_client:
2508 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002509err_put_id:
Alex Elderd22f76e2012-07-12 10:46:35 -05002510 kfree(rbd_dev->pool_name);
Alex Elder499afd52012-02-02 08:13:29 -06002511 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002512err_nomem:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002513 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002514 kfree(rbd_dev);
2515
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002516 dout("Error adding device %s\n", buf);
2517 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002518
2519 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002520}
2521
2522static struct rbd_device *__rbd_get_dev(unsigned long id)
2523{
2524 struct list_head *tmp;
2525 struct rbd_device *rbd_dev;
2526
Alex Eldere124a82f2012-01-29 13:57:44 -06002527 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002528 list_for_each(tmp, &rbd_dev_list) {
2529 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a82f2012-01-29 13:57:44 -06002530 if (rbd_dev->id == id) {
2531 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002532 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002533 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002534 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002535 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002536 return NULL;
2537}
2538
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002539static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002540{
Alex Elder593a9e72012-02-07 12:03:37 -06002541 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002542
Alex Elder1dbb4392012-01-24 10:08:37 -06002543 if (rbd_dev->watch_request) {
2544 struct ceph_client *client = rbd_dev->rbd_client->client;
2545
2546 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002547 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002548 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002549 if (rbd_dev->watch_event)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07002550 rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002551
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002552 rbd_put_client(rbd_dev);
2553
2554 /* clean up and free blkdev */
2555 rbd_free_disk(rbd_dev);
2556 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002557
2558 /* done with the id, and with the rbd_dev */
Alex Elderd22f76e2012-07-12 10:46:35 -05002559 kfree(rbd_dev->pool_name);
Alex Elder32eec682012-02-08 16:11:14 -06002560 rbd_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002561 kfree(rbd_dev);
2562
2563 /* release module ref */
2564 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002565}
2566
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002567static ssize_t rbd_remove(struct bus_type *bus,
2568 const char *buf,
2569 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002570{
2571 struct rbd_device *rbd_dev = NULL;
2572 int target_id, rc;
2573 unsigned long ul;
2574 int ret = count;
2575
2576 rc = strict_strtoul(buf, 10, &ul);
2577 if (rc)
2578 return rc;
2579
2580 /* convert to int; abort if we lost anything in the conversion */
2581 target_id = (int) ul;
2582 if (target_id != ul)
2583 return -EINVAL;
2584
2585 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2586
2587 rbd_dev = __rbd_get_dev(target_id);
2588 if (!rbd_dev) {
2589 ret = -ENOENT;
2590 goto done;
2591 }
2592
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002593 __rbd_remove_all_snaps(rbd_dev);
2594 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002595
2596done:
2597 mutex_unlock(&ctl_mutex);
2598 return ret;
2599}
2600
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002601static ssize_t rbd_snap_add(struct device *dev,
2602 struct device_attribute *attr,
2603 const char *buf,
2604 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002605{
Alex Elder593a9e72012-02-07 12:03:37 -06002606 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002607 int ret;
2608 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002609 if (!name)
2610 return -ENOMEM;
2611
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002612 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002613
2614 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2615
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002616 ret = rbd_header_add_snap(rbd_dev,
2617 name, GFP_KERNEL);
2618 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002619 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002620
Josh Durgin263c6ca2011-12-05 10:43:42 -08002621 ret = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002622 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002623 goto err_unlock;
2624
2625 /* shouldn't hold ctl_mutex when notifying.. notify might
2626 trigger a watch callback that would need to get that mutex */
2627 mutex_unlock(&ctl_mutex);
2628
2629 /* make a best effort, don't error if failed */
2630 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002631
2632 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002633 kfree(name);
2634 return ret;
2635
2636err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002637 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002638 kfree(name);
2639 return ret;
2640}
2641
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002642/*
2643 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002644 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002645 */
2646static int rbd_sysfs_init(void)
2647{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002648 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002649
Alex Elderfed4c142012-02-07 12:03:36 -06002650 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002651 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002652 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002653
Alex Elderfed4c142012-02-07 12:03:36 -06002654 ret = bus_register(&rbd_bus_type);
2655 if (ret < 0)
2656 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002657
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002658 return ret;
2659}
2660
/* Undo rbd_sysfs_init(), in reverse registration order. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2666
2667int __init rbd_init(void)
2668{
2669 int rc;
2670
2671 rc = rbd_sysfs_init();
2672 if (rc)
2673 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002674 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002675 return 0;
2676}
2677
/* Module exit: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2682
2683module_init(rbd_init);
2684module_exit(rbd_exit);
2685
2686MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2687MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2688MODULE_DESCRIPTION("rados block device");
2689
2690/* following authorship retained from original osdblk.c */
2691MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2692
2693MODULE_LICENSE("GPL");