blob: 62df67a11321ec37358ef514b2db46acd998c3c9 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050075
Alex Elderd8891402012-10-09 13:50:17 -070076/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
Alex Elder81a89792012-02-02 08:13:30 -060084/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060091#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Eldercc0538b2012-08-10 13:12:07 -070093#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix of data object names (see rbd_segment_name()) */
	u64 features;		/* feature bits; always 0 for format 1 images */
	__u8 obj_order;		/* log2 of the object/segment size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes, from the on-disk header */
	struct ceph_snap_context *snapc;	/* snapshot context (refcounted) */
	char *snap_names;	/* buffer of snapshot names copied from on-disk header */
	u64 *snap_sizes;	/* per-snapshot image sizes, parallel to snapc->snaps[] */

	u64 obj_version;
};
114
/* User-supplied mapping options (parsed by parse_rbd_opts_token()) */
struct rbd_options {
	bool	read_only;	/* map the device read-only ("ro"/"read_only") */
};
118
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* released via rbd_client_release() */
	struct list_head	node;	/* entry on rbd_client_list */
};
127
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* completion flag — NOTE(review): usage not visible in this chunk */
	int rc;		/* completion result code */
	u64 bytes;	/* byte count associated with the completion */
};
136
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* number of status slots */
	int			num_done;	/* completed so far */
	struct kref		kref;		/* freed via rbd_coll_release() */
	/* NOTE(review): [0] is the pre-C99 zero-length-array idiom; a C99
	 * flexible array member (status[]) is the modern equivalent. */
	struct rbd_req_status	status[0];
};
146
/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;	/* slot in coll->status[] */
	struct rbd_req_coll	*coll;		/* owning collection, if any */
};
158
/* One snapshot of an image; see snap_by_name() for lookup by name. */
struct rbd_snap {
	struct device		dev;	/* sysfs representation */
	const char		*name;
	u64			size;	/* image size at snapshot time */
	struct list_head	node;	/* entry on rbd_device->snaps */
	u64			id;
	u64			features;
};
167
/* State of the current mapping (whole image or one snapshot);
 * filled in by rbd_dev_set_mapping(). */
struct rbd_mapping {
	u64                     size;		/* size of the mapped view, bytes */
	u64                     features;
	bool			read_only;	/* forced true for snapshot mappings */
};
173
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;	/* shared ceph client (refcounted) */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	bool                    exists;		/* set once mapping established */
	char			*image_id;	/* format 2 image id */
	size_t			image_id_len;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;
	char			*pool_name;
	u64			pool_id;

	char                    *snap_name;	/* mapped snapshot, or HEAD name */
	u64                     snap_id;	/* CEPH_NOSNAP when mapping the head */

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
219
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700220static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600221
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700222static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600223static DEFINE_SPINLOCK(rbd_dev_list_lock);
224
Alex Elder432b8582012-01-29 13:57:44 -0600225static LIST_HEAD(rbd_client_list); /* clients */
226static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700227
Alex Elder304f6802012-08-31 17:29:52 -0500228static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
229static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500232static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233
Alex Elderf0f8cef2012-01-29 13:57:44 -0600234static ssize_t rbd_add(struct bus_type *bus, const char *buf,
235 size_t count);
236static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
237 size_t count);
238
/* sysfs bus attributes: /sys/bus/rbd/{add,remove}, write-only for root */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Intentionally empty: rbd_root_dev is static, so there is nothing to
 * free, but the driver core requires a release callback. */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
258
Alex Elderaafb2302012-09-06 16:00:54 -0500259#ifdef RBD_DEBUG
260#define rbd_assert(expr) \
261 if (unlikely(!(expr))) { \
262 printk(KERN_ERR "\nAssertion failure in %s() " \
263 "at line %d:\n\n" \
264 "\trbd_assert(%s);\n\n", \
265 __func__, __LINE__, #expr); \
266 BUG(); \
267 }
268#else /* !RBD_DEBUG */
269# define rbd_assert(expr) ((void) 0)
270#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800271
/* Take a reference on the rbd device's embedded sysfs device.
 * Paired with rbd_put_dev(). */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
276
/* Drop the reference taken by rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281
Alex Elder117973f2012-08-31 17:29:55 -0500282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700284
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700285static int rbd_open(struct block_device *bdev, fmode_t mode)
286{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600287 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700288
Alex Elderf84344f2012-08-31 17:29:51 -0500289 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700290 return -EROFS;
291
Alex Elder340c7a22012-08-10 13:12:07 -0700292 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500293 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700294
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295 return 0;
296}
297
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800298static int rbd_release(struct gendisk *disk, fmode_t mode)
299{
300 struct rbd_device *rbd_dev = disk->private_data;
301
302 rbd_put_dev(rbd_dev);
303
304 return 0;
305}
306
/* Block-device operations; only open/release are implemented. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
312
313/*
314 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500315 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316 */
Alex Elderf8c38922012-08-10 13:12:07 -0700317static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318{
319 struct rbd_client *rbdc;
320 int ret = -ENOMEM;
321
322 dout("rbd_client_create\n");
323 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
324 if (!rbdc)
325 goto out_opt;
326
327 kref_init(&rbdc->kref);
328 INIT_LIST_HEAD(&rbdc->node);
329
Alex Elderbc534d82012-01-29 13:57:44 -0600330 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
331
Alex Elder43ae4702012-07-03 16:01:18 -0500332 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600334 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500335 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336
337 ret = ceph_open_session(rbdc->client);
338 if (ret < 0)
339 goto out_err;
340
Alex Elder432b8582012-01-29 13:57:44 -0600341 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700342 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600343 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344
Alex Elderbc534d82012-01-29 13:57:44 -0600345 mutex_unlock(&ctl_mutex);
346
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347 dout("rbd_client_create created %p\n", rbdc);
348 return rbdc;
349
350out_err:
351 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600352out_mutex:
353 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700354 kfree(rbdc);
355out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500356 if (ceph_opts)
357 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400358 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700359}
360
361/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700362 * Find a ceph client with specific addr and configuration. If
363 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700364 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700365static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700366{
367 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700368 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700369
Alex Elder43ae4702012-07-03 16:01:18 -0500370 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700371 return NULL;
372
Alex Elder1f7ba332012-08-10 13:12:07 -0700373 spin_lock(&rbd_client_list_lock);
374 list_for_each_entry(client_node, &rbd_client_list, node) {
375 if (!ceph_compare_options(ceph_opts, client_node->client)) {
376 kref_get(&client_node->kref);
377 found = true;
378 break;
379 }
380 }
381 spin_unlock(&rbd_client_list_lock);
382
383 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700384}
385
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument, those
 * between Opt_last_int and Opt_last_string a string argument, and
 * those between Opt_last_string and Opt_last_bool are Boolean flags.
 * Currently only the Boolean read-only/read-write flags exist.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
410
411static int parse_rbd_opts_token(char *c, void *private)
412{
Alex Elder43ae4702012-07-03 16:01:18 -0500413 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700414 substring_t argstr[MAX_OPT_ARGS];
415 int token, intval, ret;
416
Alex Elder43ae4702012-07-03 16:01:18 -0500417 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700418 if (token < 0)
419 return -EINVAL;
420
421 if (token < Opt_last_int) {
422 ret = match_int(&argstr[0], &intval);
423 if (ret < 0) {
424 pr_err("bad mount option arg (not int) "
425 "at '%s'\n", c);
426 return ret;
427 }
428 dout("got int token %d val %d\n", token, intval);
429 } else if (token > Opt_last_int && token < Opt_last_string) {
430 dout("got string token %d val %s\n", token,
431 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700432 } else if (token > Opt_last_string && token < Opt_last_bool) {
433 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700434 } else {
435 dout("got token %d\n", token);
436 }
437
438 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700439 case Opt_read_only:
440 rbd_opts->read_only = true;
441 break;
442 case Opt_read_write:
443 rbd_opts->read_only = false;
444 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700445 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500446 rbd_assert(false);
447 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700448 }
449 return 0;
450}
451
452/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 * Get a ceph client with specific addr and configuration, if one does
454 * not exist create it.
455 */
Alex Elder78cea762012-10-25 23:34:41 -0500456static int rbd_get_client(struct rbd_device *rbd_dev,
457 struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458{
Alex Elderf8c38922012-08-10 13:12:07 -0700459 struct rbd_client *rbdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700460
Alex Elder1f7ba332012-08-10 13:12:07 -0700461 rbdc = rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700462 if (rbdc) {
Alex Eldere6994d3d2012-01-29 13:57:44 -0600463 /* using an existing client */
Alex Elder43ae4702012-07-03 16:01:18 -0500464 ceph_destroy_options(ceph_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700465 } else {
466 rbdc = rbd_client_create(ceph_opts);
467 if (IS_ERR(rbdc))
468 return PTR_ERR(rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700469 }
Alex Elderf8c38922012-08-10 13:12:07 -0700470 rbd_dev->rbd_client = rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700471
Alex Elderf8c38922012-08-10 13:12:07 -0700472 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700473}
474
475/*
476 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600477 *
Alex Elder432b8582012-01-29 13:57:44 -0600478 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700479 */
/*
 * kref release callback for an rbd client.
 *
 * Caller must hold rbd_client_list_lock... actually takes it here:
 * unlinks the client from rbd_client_list under the lock, then tears
 * down the ceph client and frees the wrapper.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* Unlink before destroying so rbd_client_find() can't see us */
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
492
493/*
494 * Drop reference to ceph client node. If it's not referenced anymore, release
495 * it.
496 */
497static void rbd_put_client(struct rbd_device *rbd_dev)
498{
499 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
500 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700501}
502
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700503/*
504 * Destroy requests collection
505 */
506static void rbd_coll_release(struct kref *kref)
507{
508 struct rbd_req_coll *coll =
509 container_of(kref, struct rbd_req_coll, kref);
510
511 dout("rbd_coll_release %p\n", coll);
512 kfree(coll);
513}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514
Alex Eldera30b71b2012-07-10 20:30:11 -0500515static bool rbd_image_format_valid(u32 image_format)
516{
517 return image_format == 1 || image_format == 2;
518}
519
Alex Elder8e94af82012-07-25 09:32:40 -0500520static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
521{
Alex Elder103a1502012-08-02 11:29:45 -0500522 size_t size;
523 u32 snap_count;
524
525 /* The header has to start with the magic rbd header text */
526 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
527 return false;
528
Alex Elderdb2388b2012-10-20 22:17:27 -0500529 /* The bio layer requires at least sector-sized I/O */
530
531 if (ondisk->options.order < SECTOR_SHIFT)
532 return false;
533
534 /* If we use u64 in a few spots we may be able to loosen this */
535
536 if (ondisk->options.order > 8 * sizeof (int) - 1)
537 return false;
538
Alex Elder103a1502012-08-02 11:29:45 -0500539 /*
540 * The size of a snapshot header has to fit in a size_t, and
541 * that limits the number of snapshots.
542 */
543 snap_count = le32_to_cpu(ondisk->snap_count);
544 size = SIZE_MAX - sizeof (struct ceph_snap_context);
545 if (snap_count > size / sizeof (__le64))
546 return false;
547
548 /*
549 * Not only that, but the size of the entire the snapshot
550 * header must also be representable in a size_t.
551 */
552 size -= snap_count * sizeof (__le64);
553 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
554 return false;
555
556 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500557}
558
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700559/*
560 * Create a new header structure, translate header format from the on-disk
561 * header.
562 */
563static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d9982012-08-02 11:29:46 -0500564 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565{
Alex Elderccece232012-07-10 20:30:10 -0500566 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500567 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500568 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500569 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700570
Alex Elder6a523252012-07-19 17:12:59 -0500571 memset(header, 0, sizeof (*header));
572
Alex Elder103a1502012-08-02 11:29:45 -0500573 snap_count = le32_to_cpu(ondisk->snap_count);
574
Alex Elder58c17b02012-08-23 23:22:06 -0500575 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
576 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500577 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700578 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500579 memcpy(header->object_prefix, ondisk->object_prefix, len);
580 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600581
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700582 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500583 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
584
Alex Elder621901d2012-08-23 23:22:06 -0500585 /* Save a copy of the snapshot names */
586
Alex Elderf785cc12012-08-23 23:22:06 -0500587 if (snap_names_len > (u64) SIZE_MAX)
588 return -EIO;
589 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500591 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500592 /*
593 * Note that rbd_dev_v1_header_read() guarantees
594 * the ondisk buffer we're working with has
595 * snap_names_len bytes beyond the end of the
596 * snapshot id array, this memcpy() is safe.
597 */
598 memcpy(header->snap_names, &ondisk->snaps[snap_count],
599 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500600
Alex Elder621901d2012-08-23 23:22:06 -0500601 /* Record each snapshot's size */
602
Alex Elderd2bb24e2012-07-26 23:37:14 -0500603 size = snap_count * sizeof (*header->snap_sizes);
604 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500606 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500607 for (i = 0; i < snap_count; i++)
608 header->snap_sizes[i] =
609 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700610 } else {
Alex Elderccece232012-07-10 20:30:10 -0500611 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 header->snap_names = NULL;
613 header->snap_sizes = NULL;
614 }
Alex Elder849b4262012-07-09 21:04:24 -0500615
Alex Elder34b13182012-07-13 20:35:12 -0500616 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700617 header->obj_order = ondisk->options.order;
618 header->crypt_type = ondisk->options.crypt_type;
619 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500620
Alex Elder621901d2012-08-23 23:22:06 -0500621 /* Allocate and fill in the snapshot context */
622
Alex Elderf84344f2012-08-31 17:29:51 -0500623 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500624 size = sizeof (struct ceph_snap_context);
625 size += snap_count * sizeof (header->snapc->snaps[0]);
626 header->snapc = kzalloc(size, GFP_KERNEL);
627 if (!header->snapc)
628 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629
630 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500631 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500633 for (i = 0; i < snap_count; i++)
634 header->snapc->snaps[i] =
635 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700636
637 return 0;
638
Alex Elder6a523252012-07-19 17:12:59 -0500639out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500640 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500641 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500643 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500644 kfree(header->object_prefix);
645 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500646
Alex Elder00f1f362012-02-07 12:03:36 -0600647 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648}
649
Alex Elder8836b992012-08-30 14:42:15 -0500650static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700651{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652
Alex Eldere86924a2012-07-10 20:30:11 -0500653 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600654
Alex Eldere86924a2012-07-10 20:30:11 -0500655 list_for_each_entry(snap, &rbd_dev->snaps, node) {
656 if (!strcmp(snap_name, snap->name)) {
Alex Elder971f8392012-10-25 23:34:41 -0500657 rbd_dev->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500658 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500659 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600660
Alex Eldere86924a2012-07-10 20:30:11 -0500661 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600662 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663 }
Alex Eldere86924a2012-07-10 20:30:11 -0500664
Alex Elder00f1f362012-02-07 12:03:36 -0600665 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666}
667
Alex Elder5ed16172012-08-29 17:11:07 -0500668static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669{
Alex Elder78dc4472012-07-19 08:49:18 -0500670 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700671
Alex Elder4e1105a2012-08-31 17:29:52 -0500672 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800673 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder971f8392012-10-25 23:34:41 -0500674 rbd_dev->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500675 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500676 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500677 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700678 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500679 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700680 if (ret < 0)
681 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500682 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700683 }
Alex Elder971f8392012-10-25 23:34:41 -0500684 rbd_dev->snap_name = snap_name;
Alex Elderdaba5fd2012-10-26 17:25:23 -0500685 rbd_dev->exists = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700686done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700687 return ret;
688}
689
/* Release everything rbd_header_from_disk() allocated; pointers are
 * reset so a double call is harmless. */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	/* snapc is refcounted, not directly freed */
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
701
Alex Elder65ccfe22012-08-09 10:33:26 -0700702static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703{
Alex Elder65ccfe22012-08-09 10:33:26 -0700704 char *name;
705 u64 segment;
706 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700707
Alex Elder65ccfe22012-08-09 10:33:26 -0700708 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
709 if (!name)
710 return NULL;
711 segment = offset >> rbd_dev->header.obj_order;
712 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
713 rbd_dev->header.object_prefix, segment);
714 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
715 pr_err("error formatting segment name for #%llu (%d)\n",
716 segment, ret);
717 kfree(name);
718 name = NULL;
719 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700720
Alex Elder65ccfe22012-08-09 10:33:26 -0700721 return name;
722}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
/* Byte offset of "offset" within its containing segment.
 * Segment size is a power of two, so masking works. */
static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}
730
731static u64 rbd_segment_length(struct rbd_device *rbd_dev,
732 u64 offset, u64 length)
733{
734 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
735
736 offset &= segment_size - 1;
737
Alex Elderaafb2302012-09-06 16:00:54 -0500738 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700739 if (offset + length > segment_size)
740 length = segment_size - offset;
741
742 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700743}
744
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700745static int rbd_get_num_segments(struct rbd_image_header *header,
746 u64 ofs, u64 len)
747{
Alex Elderdf111be2012-08-09 10:33:26 -0700748 u64 start_seg;
749 u64 end_seg;
750
751 if (!len)
752 return 0;
753 if (len - 1 > U64_MAX - ofs)
754 return -ERANGE;
755
756 start_seg = ofs >> header->obj_order;
757 end_seg = (ofs + len - 1) >> header->obj_order;
758
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700759 return end_seg - start_seg + 1;
760}
761
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700762/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700763 * returns the size of an object in the image
764 */
765static u64 rbd_obj_bytes(struct rbd_image_header *header)
766{
767 return 1 << header->obj_order;
768}
769
770/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700771 * bio helpers
772 */
773
774static void bio_chain_put(struct bio *chain)
775{
776 struct bio *tmp;
777
778 while (chain) {
779 tmp = chain;
780 chain = chain->bi_next;
781 bio_put(tmp);
782 }
783}
784
785/*
786 * zeros a bio chain, starting at specific offset
787 */
788static void zero_bio_chain(struct bio *chain, int start_ofs)
789{
790 struct bio_vec *bv;
791 unsigned long flags;
792 void *buf;
793 int i;
794 int pos = 0;
795
796 while (chain) {
797 bio_for_each_segment(bv, chain, i) {
798 if (pos + bv->bv_len > start_ofs) {
799 int remainder = max(start_ofs - pos, 0);
800 buf = bvec_kmap_irq(bv, &flags);
801 memset(buf + remainder, 0,
802 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200803 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700804 }
805 pos += bv->bv_len;
806 }
807
808 chain = chain->bi_next;
809 }
810}
811
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns the new bio, or NULL on allocation failure or if the
 * requested range is empty or extends past the end of bio_src.
 * The clone's bio_vec entries are copied from the source, so the
 * underlying pages are shared with (not copied from) bio_src.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	/* resid counts down the bytes to skip before the range starts */
	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* voff is the byte offset of the range within segment idx */
	voff = resid;

	/* ...and the last affected segment */

	/* now resid counts to the end of the range; after the loop it
	 * is the number of bytes of the range in the final segment */
	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* range lies entirely within one segment */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700892
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	/* end always points at the bi_next link to append to */
	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* clone at most to the end of the current source bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		/* current source bio fully consumed; advance the chain */
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* report the resume position back to the caller */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* release any clones made before the failure */
	bio_chain_put(chain);

	return NULL;
}
953
954/*
955 * helpers for osd request op vectors.
956 */
Alex Elder57cfc102012-06-26 12:57:03 -0700957static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
958 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700959{
Alex Elder57cfc102012-06-26 12:57:03 -0700960 struct ceph_osd_req_op *ops;
961
962 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
963 if (!ops)
964 return NULL;
965
966 ops[0].op = opcode;
967
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 /*
969 * op extent offset and length will be set later on
970 * in calc_raw_layout()
971 */
Alex Elder57cfc102012-06-26 12:57:03 -0700972 ops[0].payload_len = payload_len;
973
974 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975}
976
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is a no-op) */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
981
/*
 * Record completion status for one entry of a request collection and
 * complete, in order, the prefix of entries that are now all done.
 *
 * With no collection the whole request is completed directly.  The
 * collection state and the partial request completions are protected
 * by the request queue lock; one collection reference is dropped per
 * entry completed.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* single-segment request: finish it in one step */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* find how far the contiguous run of finished entries extends */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		/* complete this entry's bytes of the blk request */
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1019
/* Complete the collection entry associated with a single rbd_request */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1025
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001026/*
1027 * Send ceph osd request
1028 */
1029static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001030 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001031 struct ceph_snap_context *snapc,
1032 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001033 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001034 struct bio *bio,
1035 struct page **pages,
1036 int num_pages,
1037 int flags,
1038 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001039 struct rbd_req_coll *coll,
1040 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001041 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001042 struct ceph_msg *msg),
1043 struct ceph_osd_request **linger_req,
1044 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001045{
1046 struct ceph_osd_request *req;
1047 struct ceph_file_layout *layout;
1048 int ret;
1049 u64 bno;
1050 struct timespec mtime = CURRENT_TIME;
1051 struct rbd_request *req_data;
1052 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -06001053 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001054
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001055 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001056 if (!req_data) {
1057 if (coll)
1058 rbd_coll_end_req_index(rq, coll, coll_index,
1059 -ENOMEM, len);
1060 return -ENOMEM;
1061 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001062
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001063 if (coll) {
1064 req_data->coll = coll;
1065 req_data->coll_index = coll_index;
1066 }
1067
Alex Elderf7760da2012-10-20 22:17:27 -05001068 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1069 object_name, (unsigned long long) ofs,
1070 (unsigned long long) len, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001071
Alex Elder0ce1a792012-07-03 16:01:18 -05001072 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -06001073 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1074 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -07001075 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -07001076 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001077 goto done_pages;
1078 }
1079
1080 req->r_callback = rbd_cb;
1081
1082 req_data->rq = rq;
1083 req_data->bio = bio;
1084 req_data->pages = pages;
1085 req_data->len = len;
1086
1087 req->r_priv = req_data;
1088
1089 reqhead = req->r_request->front.iov_base;
1090 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1091
Alex Elderaded07e2012-07-03 16:01:18 -05001092 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001093 req->r_oid_len = strlen(req->r_oid);
1094
1095 layout = &req->r_file_layout;
1096 memset(layout, 0, sizeof(*layout));
1097 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1098 layout->fl_stripe_count = cpu_to_le32(1);
1099 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder86992092012-10-25 23:34:41 -05001100 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id);
Sage Weil6cae3712012-09-24 21:02:47 -07001101 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1102 req, ops);
1103 rbd_assert(ret == 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001104
1105 ceph_osdc_build_request(req, ofs, &len,
1106 ops,
1107 snapc,
1108 &mtime,
1109 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001110
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001111 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001112 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001113 *linger_req = req;
1114 }
1115
Alex Elder1dbb4392012-01-24 10:08:37 -06001116 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001117 if (ret < 0)
1118 goto done_err;
1119
1120 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001121 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001122 if (ver)
1123 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001124 dout("reassert_ver=%llu\n",
1125 (unsigned long long)
1126 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001127 ceph_osdc_put_request(req);
1128 }
1129 return ret;
1130
1131done_err:
1132 bio_chain_put(req_data->bio);
1133 ceph_osdc_put_request(req);
1134done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001135 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001136 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001137 return ret;
1138}
1139
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests submitted by
 * rbd_do_request().  Parses the reply, normalizes short/absent reads
 * by zero-filling the bio chain, completes the collection entry, and
 * releases the request and its private data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	/* the op array immediately follows the reply head */
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a nonexistent object: treat as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1179
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1184
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector sized for the data, submits the request via
 * rbd_do_request() with no callback (so it waits for completion), and
 * for reads copies the returned data into the caller's inbound buffer.
 * Returns the number of bytes handled, or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	/*
	 * NOTE(review): this allocation uses GFP_KERNEL while other
	 * allocations in this driver's I/O paths use GFP_NOIO —
	 * confirm this path cannot recurse into the block layer.
	 */
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* ret is the byte count returned by the osd; copy that much */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1228
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps one segment-aligned piece of a block request onto the backing
 * object: builds the object name for the segment containing ofs,
 * derives the read/write op from the request direction, and submits
 * it with rbd_req_cb as the completion callback.  Reads go to the
 * mapped snapshot with no snap context; writes use CEPH_NOSNAP and
 * the supplied snap context.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		snapc = NULL;
		snapid = rbd_dev->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1293
1294/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001295 * Request sync osd read
1296 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001297static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001298 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001299 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001300 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001301 char *buf,
1302 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001303{
Alex Elder913d2fd2012-06-26 12:57:03 -07001304 struct ceph_osd_req_op *ops;
1305 int ret;
1306
1307 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1308 if (!ops)
1309 return -ENOMEM;
1310
1311 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001312 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001313 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001314 ops, object_name, ofs, len, buf, NULL, ver);
1315 rbd_destroy_ops(ops);
1316
1317 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001318}
1319
/*
 * Acknowledge a watch notification so the osd stops resending it.
 * (The original comment said "Request sync osd watch", which was a
 * copy-paste from the watch setup below — this sends a NOTIFY_ACK.)
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	/*
	 * NOTE(review): ver is byte-swapped here but cookie is stored
	 * raw, while rbd_req_sync_watch() swaps both — confirm which
	 * conversion the osd client applies when encoding these ops.
	 */
	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	/* fire and forget: rbd_simple_req_cb just drops the request */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1349
1350static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1351{
Alex Elder0ce1a792012-07-03 16:01:18 -05001352 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001353 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001354 int rc;
1355
Alex Elder0ce1a792012-07-03 16:01:18 -05001356 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001357 return;
1358
Alex Elderbd919d42012-07-13 20:35:11 -05001359 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1360 rbd_dev->header_name, (unsigned long long) notify_id,
1361 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001362 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001363 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001364 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001365 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001366
Alex Elder7f0a24d2012-07-25 09:32:40 -05001367 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001368}
1369
/*
 * Request sync osd watch
 *
 * Register a watch on the image header object.  Creates the osd
 * event (stored in rbd_dev->watch_event), then issues a lingering
 * WATCH op (stored in rbd_dev->watch_request).  On failure the event
 * is cancelled and the op vector freed.  Returns 0 or negative errno.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* flag=1: establish the watch */

	/* linger_req keeps the watch registered across osd reconnects */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1413
/*
 * Request sync osd unwatch
 *
 * Tear down the watch set up by rbd_req_sync_watch(): issue a WATCH
 * op with flag=0 for the event's cookie, then cancel the osd event.
 * The event is cancelled even if the op fails.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* flag=0: remove the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1443
/*
 * Synchronous osd object method call
 *
 * Invoke class_name.method_name on the named object with outbound as
 * the method's input; up to inbound_size bytes of the reply are
 * copied to inbound (when flags include CEPH_OSD_FLAG_READ).
 * Returns bytes received or a negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1496
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001497static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1498{
1499 struct rbd_req_coll *coll =
1500 kzalloc(sizeof(struct rbd_req_coll) +
1501 sizeof(struct rbd_req_status) * num_reqs,
1502 GFP_ATOMIC);
1503
1504 if (!coll)
1505 return NULL;
1506 coll->total = num_reqs;
1507 kref_init(&coll->kref);
1508 return coll;
1509}
1510
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001511/*
1512 * block device queue callback
1513 */
1514static void rbd_rq_fn(struct request_queue *q)
1515{
1516 struct rbd_device *rbd_dev = q->queuedata;
1517 struct request *rq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001518
Alex Elder00f1f362012-02-07 12:03:36 -06001519 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001521 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001522 unsigned int size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001523 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001524 int num_segs, cur_seg = 0;
1525 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001526 struct ceph_snap_context *snapc;
Alex Elderf7760da2012-10-20 22:17:27 -05001527 unsigned int bio_offset;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001528
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529 dout("fetched request\n");
1530
1531 /* filter out block requests we don't understand */
1532 if ((rq->cmd_type != REQ_TYPE_FS)) {
1533 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001534 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 }
1536
1537 /* deduce our operation (read, write) */
1538 do_write = (rq_data_dir(rq) == WRITE);
Alex Elderf84344f2012-08-31 17:29:51 -05001539 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001540 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001541 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001542 }
1543
1544 spin_unlock_irq(q->queue_lock);
1545
Josh Durgind1d25642011-12-05 14:03:05 -08001546 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001547
Alex Elderdaba5fd2012-10-26 17:25:23 -05001548 if (!rbd_dev->exists) {
1549 rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP);
Josh Durgine88a36e2011-11-21 18:14:25 -08001550 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001551 dout("request for non-existent snapshot");
1552 spin_lock_irq(q->queue_lock);
1553 __blk_end_request_all(rq, -ENXIO);
1554 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001555 }
1556
Josh Durgind1d25642011-12-05 14:03:05 -08001557 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1558
1559 up_read(&rbd_dev->header_rwsem);
1560
Alex Elderf7760da2012-10-20 22:17:27 -05001561 size = blk_rq_bytes(rq);
1562 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1563 bio = rq->bio;
1564
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001565 dout("%s 0x%x bytes at 0x%llx\n",
1566 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001567 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001568
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001569 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001570 if (num_segs <= 0) {
1571 spin_lock_irq(q->queue_lock);
1572 __blk_end_request_all(rq, num_segs);
1573 ceph_put_snap_context(snapc);
1574 continue;
1575 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001576 coll = rbd_alloc_coll(num_segs);
1577 if (!coll) {
1578 spin_lock_irq(q->queue_lock);
1579 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001580 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001581 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001582 }
1583
Alex Elderf7760da2012-10-20 22:17:27 -05001584 bio_offset = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001585 do {
Alex Elderf7760da2012-10-20 22:17:27 -05001586 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1587 unsigned int chain_size;
1588 struct bio *bio_chain;
1589
1590 BUG_ON(limit > (u64) UINT_MAX);
1591 chain_size = (unsigned int) limit;
Alex Elderbd919d42012-07-13 20:35:11 -05001592 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elderf7760da2012-10-20 22:17:27 -05001593
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001594 kref_get(&coll->kref);
Alex Elderf7760da2012-10-20 22:17:27 -05001595
1596 /* Pass a cloned bio chain via an osd request */
1597
1598 bio_chain = bio_chain_clone_range(&bio,
1599 &bio_offset, chain_size,
1600 GFP_ATOMIC);
1601 if (bio_chain)
Alex Elder46342462012-10-10 18:59:29 -07001602 (void) rbd_do_op(rq, rbd_dev, snapc,
Alex Elderf7760da2012-10-20 22:17:27 -05001603 ofs, chain_size,
1604 bio_chain, coll, cur_seg);
Alex Elder46342462012-10-10 18:59:29 -07001605 else
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001606 rbd_coll_end_req_index(rq, coll, cur_seg,
Alex Elderf7760da2012-10-20 22:17:27 -05001607 -ENOMEM, chain_size);
1608 size -= chain_size;
1609 ofs += chain_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001610
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001611 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001612 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001613 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001614
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001615 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001616
1617 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001618 }
1619}
1620
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of @bvec that may be merged into the
 * bio described by @bmd (possibly 0, and possibly less than
 * bvec->bv_len).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	/* Object size is a power of two, so masking yields the offset */
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object. Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for. And if the bio
	 * was empty, let the whole thing through because: "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1666
1667static void rbd_free_disk(struct rbd_device *rbd_dev)
1668{
1669 struct gendisk *disk = rbd_dev->disk;
1670
1671 if (!disk)
1672 return;
1673
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001674 if (disk->flags & GENHD_FL_UP)
1675 del_gendisk(disk);
1676 if (disk->queue)
1677 blk_cleanup_queue(disk->queue);
1678 put_disk(disk);
1679}
1680
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header. Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings. Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* No-op on the first pass (ondisk starts out NULL) */
		kfree(ondisk);

		/* Base struct, plus snapshot records, plus the name block */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* Treat a short read as fatal */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/*
		 * If the snapshot count changed since we sized the
		 * buffer, loop and re-read with the new sizes.
		 */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1752
1753/*
1754 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001755 */
1756static int rbd_read_header(struct rbd_device *rbd_dev,
1757 struct rbd_image_header *header)
1758{
Alex Elder4156d9982012-08-02 11:29:46 -05001759 struct rbd_image_header_ondisk *ondisk;
1760 u64 ver = 0;
1761 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001762
Alex Elder4156d9982012-08-02 11:29:46 -05001763 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1764 if (IS_ERR(ondisk))
1765 return PTR_ERR(ondisk);
1766 ret = rbd_header_from_disk(header, ondisk);
1767 if (ret >= 0)
1768 header->obj_version = ver;
1769 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001770
Alex Elder4156d9982012-08-02 11:29:46 -05001771 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001772}
1773
Alex Elder41f38c22012-10-25 23:34:40 -05001774static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001775{
1776 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001777 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001778
Alex Eldera0593292012-07-19 09:09:27 -05001779 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001780 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001781}
1782
Alex Elder94785542012-10-09 13:50:17 -07001783static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1784{
1785 sector_t size;
1786
Alex Elder971f8392012-10-25 23:34:41 -05001787 if (rbd_dev->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001788 return;
1789
1790 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1791 dout("setting size to %llu sectors", (unsigned long long) size);
1792 rbd_dev->mapping.size = (u64) size;
1793 set_capacity(rbd_dev->disk, size);
1794}
1795
/*
 * only read the first part of the ondisk header, without the snaps info
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	/* Fetch a fresh header copy before taking any locks */
	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	/*
	 * NOTE(review): image_size was already copied above (before
	 * rbd_update_mapping_size()); this second assignment looks
	 * redundant — confirm before removing.
	 */
	rbd_dev->header.image_size = h.image_size;
	/* Take ownership of the freshly-read snapshot metadata */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Rebuild the snap list, then register any not-yet-seen snaps */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1839
Alex Elder117973f2012-08-31 17:29:55 -05001840static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001841{
1842 int ret;
1843
Alex Elder117973f2012-08-31 17:29:55 -05001844 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001845 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001846 if (rbd_dev->image_format == 1)
1847 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1848 else
1849 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001850 mutex_unlock(&ctl_mutex);
1851
1852 return ret;
1853}
1854
/*
 * Allocate and configure the gendisk and request queue for the
 * mapped rbd device.  On success rbd_dev->disk is set and the disk
 * capacity reflects the current mapping size; note that the disk is
 * not added to the system here (no add_disk() call in this function).
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep single bios within one rbd object (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1903
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001904/*
1905 sysfs
1906*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001907
/* Map a sysfs struct device back to its enclosing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1912
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001913static ssize_t rbd_size_show(struct device *dev,
1914 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001915{
Alex Elder593a9e72012-02-07 12:03:37 -06001916 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001917 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001918
Josh Durgina51aa0c2011-12-05 10:35:04 -08001919 down_read(&rbd_dev->header_rwsem);
1920 size = get_capacity(rbd_dev->disk);
1921 up_read(&rbd_dev->header_rwsem);
1922
1923 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001924}
1925
Alex Elder34b13182012-07-13 20:35:12 -05001926/*
1927 * Note this shows the features for whatever's mapped, which is not
1928 * necessarily the base image.
1929 */
1930static ssize_t rbd_features_show(struct device *dev,
1931 struct device_attribute *attr, char *buf)
1932{
1933 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1934
1935 return sprintf(buf, "0x%016llx\n",
1936 (unsigned long long) rbd_dev->mapping.features);
1937}
1938
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001939static ssize_t rbd_major_show(struct device *dev,
1940 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001941{
Alex Elder593a9e72012-02-07 12:03:37 -06001942 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001943
1944 return sprintf(buf, "%d\n", rbd_dev->major);
1945}
1946
1947static ssize_t rbd_client_id_show(struct device *dev,
1948 struct device_attribute *attr, char *buf)
1949{
Alex Elder593a9e72012-02-07 12:03:37 -06001950 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001951
Alex Elder1dbb4392012-01-24 10:08:37 -06001952 return sprintf(buf, "client%lld\n",
1953 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001954}
1955
1956static ssize_t rbd_pool_show(struct device *dev,
1957 struct device_attribute *attr, char *buf)
1958{
Alex Elder593a9e72012-02-07 12:03:37 -06001959 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001960
1961 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1962}
1963
Alex Elder9bb2f332012-07-12 10:46:35 -05001964static ssize_t rbd_pool_id_show(struct device *dev,
1965 struct device_attribute *attr, char *buf)
1966{
1967 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1968
Alex Elder86992092012-10-25 23:34:41 -05001969 return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05001970}
1971
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001972static ssize_t rbd_name_show(struct device *dev,
1973 struct device_attribute *attr, char *buf)
1974{
Alex Elder593a9e72012-02-07 12:03:37 -06001975 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001976
Alex Elder0bed54d2012-07-03 16:01:18 -05001977 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001978}
1979
Alex Elder589d30e2012-07-10 20:30:11 -05001980static ssize_t rbd_image_id_show(struct device *dev,
1981 struct device_attribute *attr, char *buf)
1982{
1983 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1984
1985 return sprintf(buf, "%s\n", rbd_dev->image_id);
1986}
1987
Alex Elder34b13182012-07-13 20:35:12 -05001988/*
1989 * Shows the name of the currently-mapped snapshot (or
1990 * RBD_SNAP_HEAD_NAME for the base image).
1991 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001992static ssize_t rbd_snap_show(struct device *dev,
1993 struct device_attribute *attr,
1994 char *buf)
1995{
Alex Elder593a9e72012-02-07 12:03:37 -06001996 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001997
Alex Elder971f8392012-10-25 23:34:41 -05001998 return sprintf(buf, "%s\n", rbd_dev->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001999}
2000
2001static ssize_t rbd_image_refresh(struct device *dev,
2002 struct device_attribute *attr,
2003 const char *buf,
2004 size_t size)
2005{
Alex Elder593a9e72012-02-07 12:03:37 -06002006 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002007 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002008
Alex Elder117973f2012-08-31 17:29:55 -05002009 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002010
2011 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002012}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002013
/*
 * Per-device sysfs attributes.  All are read-only except "refresh",
 * which is write-only and triggers a header re-read.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* The driver core requires a release callback; nothing to free here */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name = "rbd",
	.groups = rbd_attr_groups,
	.release = rbd_sysfs_dev_release,
};
2057
2058
2059/*
2060 sysfs - snapshots
2061*/
2062
2063static ssize_t rbd_snap_size_show(struct device *dev,
2064 struct device_attribute *attr,
2065 char *buf)
2066{
2067 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2068
Josh Durgin3591538f2011-12-05 18:25:13 -08002069 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002070}
2071
2072static ssize_t rbd_snap_id_show(struct device *dev,
2073 struct device_attribute *attr,
2074 char *buf)
2075{
2076 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2077
Josh Durgin3591538f2011-12-05 18:25:13 -08002078 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002079}
2080
Alex Elder34b13182012-07-13 20:35:12 -05002081static ssize_t rbd_snap_features_show(struct device *dev,
2082 struct device_attribute *attr,
2083 char *buf)
2084{
2085 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2086
2087 return sprintf(buf, "0x%016llx\n",
2088 (unsigned long long) snap->features);
2089}
2090
/* Per-snapshot sysfs attributes (all read-only) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2105
2106static void rbd_snap_dev_release(struct device *dev)
2107{
2108 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2109 kfree(snap->name);
2110 kfree(snap);
2111}
2112
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Device type for snapshot devices; release frees the rbd_snap */
static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2122
/*
 * Returns true if the snapshot's device has been set up for
 * registration (its type points at rbd_snap_device_type).  The
 * assertion checks that this agrees with the driver core's notion
 * of registration: (!ret ^ reg) is true exactly when ret == reg.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}
2132
/*
 * Remove a snapshot: unlink it from the device's snapshot list,
 * then unregister its sysfs device if it was ever registered.
 * The device_unregister() drop of the last reference invokes
 * rbd_snap_dev_release(), which frees the rbd_snap itself.
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2139
Alex Elder14e70852012-07-19 09:09:27 -05002140static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002141 struct device *parent)
2142{
2143 struct device *dev = &snap->dev;
2144 int ret;
2145
2146 dev->type = &rbd_snap_device_type;
2147 dev->parent = parent;
2148 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002149 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002150 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2151
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002152 ret = device_register(dev);
2153
2154 return ret;
2155}
2156
Alex Elder4e891e02012-07-10 20:30:10 -05002157static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002158 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002159 u64 snap_id, u64 snap_size,
2160 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002161{
Alex Elder4e891e02012-07-10 20:30:10 -05002162 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002163 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002164
2165 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002166 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002167 return ERR_PTR(-ENOMEM);
2168
2169 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002170 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002171 if (!snap->name)
2172 goto err;
2173
Alex Elderc8d18422012-07-10 20:30:11 -05002174 snap->id = snap_id;
2175 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002176 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002177
2178 return snap;
2179
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002180err:
2181 kfree(snap->name);
2182 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002183
2184 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002185}
2186
Alex Eldercd892122012-07-03 16:01:19 -05002187static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2188 u64 *snap_size, u64 *snap_features)
2189{
2190 char *snap_name;
2191
2192 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2193
2194 *snap_size = rbd_dev->header.snap_sizes[which];
2195 *snap_features = 0; /* No features for v1 */
2196
2197 /* Skip over names until we find the one we are looking for */
2198
2199 snap_name = rbd_dev->header.snap_names;
2200 while (which--)
2201 snap_name += strlen(snap_name) + 1;
2202
2203 return snap_name;
2204}
2205
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Issues the "get_size" class method against the header object.
 * Returns 0 on success or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Matches the on-wire reply layout of "get_size" */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2239
/* Fetch the base image's size and object order into the in-core header */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2246
/*
 * Fetch the image's object name prefix via the "get_object_prefix"
 * class method and store it (as an allocated string) in the in-core
 * header.  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string from the reply */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2284
/*
 * Fetch the feature bits for the given snapshot (or for the base
 * image, when snap_id is CEPH_NOSNAP) via the "get_features" class
 * method.  Fails with -ENOTSUPP if the image advertises incompatible
 * feature bits this driver does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Matches the on-wire reply layout of "get_features" */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse images requiring features we don't know about */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2318
/* Fetch the base image's feature bits into the in-core header */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2324
Alex Elder6e14b1a2012-07-03 16:01:19 -05002325static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002326{
2327 size_t size;
2328 int ret;
2329 void *reply_buf;
2330 void *p;
2331 void *end;
2332 u64 seq;
2333 u32 snap_count;
2334 struct ceph_snap_context *snapc;
2335 u32 i;
2336
2337 /*
2338 * We'll need room for the seq value (maximum snapshot id),
2339 * snapshot count, and array of that many snapshot ids.
2340 * For now we have a fixed upper limit on the number we're
2341 * prepared to receive.
2342 */
2343 size = sizeof (__le64) + sizeof (__le32) +
2344 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2345 reply_buf = kzalloc(size, GFP_KERNEL);
2346 if (!reply_buf)
2347 return -ENOMEM;
2348
2349 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2350 "rbd", "get_snapcontext",
2351 NULL, 0,
2352 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002353 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002354 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2355 if (ret < 0)
2356 goto out;
2357
2358 ret = -ERANGE;
2359 p = reply_buf;
2360 end = (char *) reply_buf + size;
2361 ceph_decode_64_safe(&p, end, seq, out);
2362 ceph_decode_32_safe(&p, end, snap_count, out);
2363
2364 /*
2365 * Make sure the reported number of snapshot ids wouldn't go
2366 * beyond the end of our buffer. But before checking that,
2367 * make sure the computed size of the snapshot context we
2368 * allocate is representable in a size_t.
2369 */
2370 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2371 / sizeof (u64)) {
2372 ret = -EINVAL;
2373 goto out;
2374 }
2375 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2376 goto out;
2377
2378 size = sizeof (struct ceph_snap_context) +
2379 snap_count * sizeof (snapc->snaps[0]);
2380 snapc = kmalloc(size, GFP_KERNEL);
2381 if (!snapc) {
2382 ret = -ENOMEM;
2383 goto out;
2384 }
2385
2386 atomic_set(&snapc->nref, 1);
2387 snapc->seq = seq;
2388 snapc->num_snaps = snap_count;
2389 for (i = 0; i < snap_count; i++)
2390 snapc->snaps[i] = ceph_decode_64(&p);
2391
2392 rbd_dev->header.snapc = snapc;
2393
2394 dout(" snap context seq = %llu, snap_count = %u\n",
2395 (unsigned long long) seq, (unsigned int) snap_count);
2396
2397out:
2398 kfree(reply_buf);
2399
2400 return 0;
2401}
2402
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002403static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2404{
2405 size_t size;
2406 void *reply_buf;
2407 __le64 snap_id;
2408 int ret;
2409 void *p;
2410 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002411 char *snap_name;
2412
2413 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2414 reply_buf = kmalloc(size, GFP_KERNEL);
2415 if (!reply_buf)
2416 return ERR_PTR(-ENOMEM);
2417
2418 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2419 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2420 "rbd", "get_snapshot_name",
2421 (char *) &snap_id, sizeof (snap_id),
2422 reply_buf, size,
2423 CEPH_OSD_FLAG_READ, NULL);
2424 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2425 if (ret < 0)
2426 goto out;
2427
2428 p = reply_buf;
2429 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05002430 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002431 if (IS_ERR(snap_name)) {
2432 ret = PTR_ERR(snap_name);
2433 goto out;
2434 } else {
2435 dout(" snap_id 0x%016llx snap_name = %s\n",
2436 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2437 }
2438 kfree(reply_buf);
2439
2440 return snap_name;
2441out:
2442 kfree(reply_buf);
2443
2444 return ERR_PTR(ret);
2445}
2446
2447static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2448 u64 *snap_size, u64 *snap_features)
2449{
2450 __le64 snap_id;
2451 u8 order;
2452 int ret;
2453
2454 snap_id = rbd_dev->header.snapc->snaps[which];
2455 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2456 if (ret)
2457 return ERR_PTR(ret);
2458 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2459 if (ret)
2460 return ERR_PTR(ret);
2461
2462 return rbd_dev_v2_snap_name(rbd_dev, which);
2463}
2464
2465static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2466 u64 *snap_size, u64 *snap_features)
2467{
2468 if (rbd_dev->image_format == 1)
2469 return rbd_dev_v1_snap_info(rbd_dev, which,
2470 snap_size, snap_features);
2471 if (rbd_dev->image_format == 2)
2472 return rbd_dev_v2_snap_info(rbd_dev, which,
2473 snap_size, snap_features);
2474 return ERR_PTR(-EINVAL);
2475}
2476
Alex Elder117973f2012-08-31 17:29:55 -05002477static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2478{
2479 int ret;
2480 __u8 obj_order;
2481
2482 down_write(&rbd_dev->header_rwsem);
2483
2484 /* Grab old order first, to see if it changes */
2485
2486 obj_order = rbd_dev->header.obj_order,
2487 ret = rbd_dev_v2_image_size(rbd_dev);
2488 if (ret)
2489 goto out;
2490 if (rbd_dev->header.obj_order != obj_order) {
2491 ret = -EIO;
2492 goto out;
2493 }
2494 rbd_update_mapping_size(rbd_dev);
2495
2496 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2497 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2498 if (ret)
2499 goto out;
2500 ret = rbd_dev_snaps_update(rbd_dev);
2501 dout("rbd_dev_snaps_update returned %d\n", ret);
2502 if (ret)
2503 goto out;
2504 ret = rbd_dev_snaps_register(rbd_dev);
2505 dout("rbd_dev_snaps_register returned %d\n", ret);
2506out:
2507 up_write(&rbd_dev->header_rwsem);
2508
2509 return ret;
2510}
2511
Alex Elder9d475de2012-07-03 16:01:19 -05002512/*
Alex Elder35938152012-08-02 11:29:46 -05002513 * Scan the rbd device's current snapshot list and compare it to the
2514 * newly-received snapshot context. Remove any existing snapshots
2515 * not present in the new snapshot context. Add a new snapshot for
2516 * any snaphots in the snapshot context not in the current list.
2517 * And verify there are no changes to snapshots we already know
2518 * about.
2519 *
2520 * Assumes the snapshots in the snapshot context are sorted by
2521 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2522 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002523 */
Alex Elder304f6802012-08-31 17:29:52 -05002524static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002525{
Alex Elder35938152012-08-02 11:29:46 -05002526 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2527 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002528 struct list_head *head = &rbd_dev->snaps;
2529 struct list_head *links = head->next;
2530 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002531
Alex Elder9fcbb802012-08-23 23:48:49 -05002532 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002533 while (index < snap_count || links != head) {
2534 u64 snap_id;
2535 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002536 char *snap_name;
2537 u64 snap_size = 0;
2538 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002539
Alex Elder35938152012-08-02 11:29:46 -05002540 snap_id = index < snap_count ? snapc->snaps[index]
2541 : CEPH_NOSNAP;
2542 snap = links != head ? list_entry(links, struct rbd_snap, node)
2543 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002544 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002545
Alex Elder35938152012-08-02 11:29:46 -05002546 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2547 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002548
Alex Elder35938152012-08-02 11:29:46 -05002549 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002550
Alex Elder971f8392012-10-25 23:34:41 -05002551 if (rbd_dev->snap_id == snap->id)
Alex Elderdaba5fd2012-10-26 17:25:23 -05002552 rbd_dev->exists = false;
Alex Elder41f38c22012-10-25 23:34:40 -05002553 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002554 dout("%ssnap id %llu has been removed\n",
Alex Elder971f8392012-10-25 23:34:41 -05002555 rbd_dev->snap_id == snap->id ? "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002556 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002557
Alex Elder35938152012-08-02 11:29:46 -05002558 /* Done with this list entry; advance */
2559
2560 links = next;
2561 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002562 }
Alex Elder35938152012-08-02 11:29:46 -05002563
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002564 snap_name = rbd_dev_snap_info(rbd_dev, index,
2565 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002566 if (IS_ERR(snap_name))
2567 return PTR_ERR(snap_name);
2568
Alex Elder9fcbb802012-08-23 23:48:49 -05002569 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2570 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002571 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2572 struct rbd_snap *new_snap;
2573
2574 /* We haven't seen this snapshot before */
2575
Alex Elderc8d18422012-07-10 20:30:11 -05002576 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002577 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002578 if (IS_ERR(new_snap)) {
2579 int err = PTR_ERR(new_snap);
2580
2581 dout(" failed to add dev, error %d\n", err);
2582
2583 return err;
2584 }
Alex Elder35938152012-08-02 11:29:46 -05002585
2586 /* New goes before existing, or at end of list */
2587
Alex Elder9fcbb802012-08-23 23:48:49 -05002588 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002589 if (snap)
2590 list_add_tail(&new_snap->node, &snap->node);
2591 else
Alex Elder523f3252012-08-30 00:16:37 -05002592 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002593 } else {
2594 /* Already have this one */
2595
Alex Elder9fcbb802012-08-23 23:48:49 -05002596 dout(" already present\n");
2597
Alex Eldercd892122012-07-03 16:01:19 -05002598 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002599 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002600 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002601
2602 /* Done with this list entry; advance */
2603
2604 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002605 }
Alex Elder35938152012-08-02 11:29:46 -05002606
2607 /* Advance to the next entry in the snapshot context */
2608
2609 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002610 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002611 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002612
2613 return 0;
2614}
2615
Alex Elder304f6802012-08-31 17:29:52 -05002616/*
2617 * Scan the list of snapshots and register the devices for any that
2618 * have not already been registered.
2619 */
2620static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2621{
2622 struct rbd_snap *snap;
2623 int ret = 0;
2624
2625 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002626 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2627 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002628
2629 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2630 if (!rbd_snap_registered(snap)) {
2631 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2632 if (ret < 0)
2633 break;
2634 }
2635 }
2636 dout("%s: returning %d\n", __func__, ret);
2637
2638 return ret;
2639}
2640
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002641static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2642{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002643 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002644 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002645
2646 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002647
Alex Eldercd789ab2012-08-30 00:16:38 -05002648 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002649 dev->bus = &rbd_bus_type;
2650 dev->type = &rbd_device_type;
2651 dev->parent = &rbd_root_dev;
2652 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002653 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002654 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002655
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002656 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002657
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002658 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002659}
2660
/* Tear down the sysfs presence established by rbd_bus_add_dev(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2665
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002666static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2667{
2668 int ret, rc;
2669
2670 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002671 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002672 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002673 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002674 if (rc < 0)
2675 return rc;
2676 }
2677 } while (ret == -ERANGE);
2678
2679 return ret;
2680}
2681
Alex Eldere2839302012-08-29 17:11:06 -05002682static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002683
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1: ids
 * come from a monotonically increasing atomic counter, so they are
 * never reused while the module is loaded.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* List membership is protected by rbd_dev_list_lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002698
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.  If the released id was the
 * current maximum, rbd_dev_id_max is lowered to the largest id
 * still on the list so future allocations stay dense.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* Deliberately shadows the parameter within this loop */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
2749
/*
 * Advance *buf past any leading white space and return the length
 * of the token (maximal run of non-space characters) that now
 * starts at *buf.  Returns 0 if no token remains.  *buf must be
 * '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the
	 * "C" and "POSIX" locales.
	 */
	const char *delims = " \f\n\r\t\v";

	*buf += strspn(*buf, delims);	/* skip to start of token */

	return strcspn(*buf, delims);	/* length of the token */
}
2768
/*
 * Find the next token in *buf and, when the supplied buffer is big
 * enough to hold it, copy it there as a '\0'-terminated string.
 * *buf must be '\0'-terminated on entry.
 *
 * Returns the token's length (excluding the '\0'): 0 means no token
 * was found, and a value >= token_size means the token did not fit
 * and was not copied.  *buf is advanced past the token either way.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2798
2799/*
Alex Elderea3352f2012-07-09 21:04:23 -05002800 * Finds the next token in *buf, dynamically allocates a buffer big
2801 * enough to hold a copy of it, and copies the token into the new
2802 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2803 * that a duplicate buffer is created even for a zero-length token.
2804 *
2805 * Returns a pointer to the newly-allocated duplicate, or a null
2806 * pointer if memory for the duplicate was not available. If
2807 * the lenp argument is a non-null pointer, the length of the token
2808 * (not including the '\0') is returned in *lenp.
2809 *
2810 * If successful, the *buf pointer will be updated to point beyond
2811 * the end of the found token.
2812 *
2813 * Note: uses GFP_KERNEL for allocation.
2814 */
2815static inline char *dup_token(const char **buf, size_t *lenp)
2816{
2817 char *dup;
2818 size_t len;
2819
2820 len = next_token(buf);
2821 dup = kmalloc(len + 1, GFP_KERNEL);
2822 if (!dup)
2823 return NULL;
2824
2825 memcpy(dup, *buf, len);
2826 *(dup + len) = '\0';
2827 *buf += len;
2828
2829 if (lenp)
2830 *lenp = len;
2831
2832 return dup;
2833}
2834
/*
 * Parse the argument string written to /sys/bus/rbd/add: a monitor
 * address list, a ceph/rbd options token, a pool name, an image
 * name, and an optional snapshot name.  Fills in the pool_name,
 * image_name, and image_name_len fields of the given rbd_dev, and
 * returns a dynamically-allocated copy of the snapshot name to map
 * through *snap_name.  Returns the parsed ceph options on success,
 * or a pointer-coded error otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				char **snap_name)
{
	size_t len;
	const char *mon_addrs;
	size_t mon_addrs_size;
	char *options;
	struct ceph_options *err_ptr = ERR_PTR(-EINVAL);
	struct rbd_options rbd_opts;
	struct ceph_options *ceph_opts;

	/* The first four tokens are required */

	/* Monitor addresses stay in place; only their span is recorded */
	len = next_token(&buf);
	if (!len)
		return err_ptr;	/* Missing monitor address(es) */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	options = dup_token(&buf, NULL);
	if (!options)
		goto out_mem;
	if (!*options)
		goto out_err;	/* Missing options */

	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_mem;
	if (!*rbd_dev->pool_name)
		goto out_err;	/* Missing pool name */

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_mem;
	if (!*rbd_dev->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	*snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!*snap_name)
		goto out_mem;
	memcpy(*snap_name, buf, len);
	*(*snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * NOTE(review): if ceph_parse_options() fails, *snap_name
	 * (allocated above) is not freed here — confirm the caller
	 * doesn't leak it on this path.
	 */
	ceph_opts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, &rbd_opts);
	kfree(options);

	/* Record the parsed rbd options */

	if (!IS_ERR(ceph_opts))
		rbd_dev->mapping.read_only = rbd_opts.read_only;

	return ceph_opts;
out_mem:
	err_ptr = ERR_PTR(-ENOMEM);
out_err:
	/* kfree(NULL) is a no-op, so partially-parsed state is safe here */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;
	kfree(options);

	return err_ptr;
}
2929
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * NOTE(review): the buffer was allocated with "size" bytes
	 * but only RBD_IMAGE_ID_LEN_MAX is passed as its length here
	 * and as the decode bound below — confirm whether "size"
	 * (which includes the __le32 length prefix) was intended.
	 */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2999
Alex Eldera30b71b2012-07-10 20:30:11 -05003000static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3001{
3002 int ret;
3003 size_t size;
3004
3005 /* Version 1 images have no id; empty string is used */
3006
3007 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
3008 if (!rbd_dev->image_id)
3009 return -ENOMEM;
3010 rbd_dev->image_id_len = 0;
3011
3012 /* Record the header object name for this rbd image. */
3013
3014 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
3015 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3016 if (!rbd_dev->header_name) {
3017 ret = -ENOMEM;
3018 goto out_err;
3019 }
3020 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
3021
3022 /* Populate rbd image metadata */
3023
3024 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3025 if (ret < 0)
3026 goto out_err;
3027 rbd_dev->image_format = 1;
3028
3029 dout("discovered version 1 image, header name is %s\n",
3030 rbd_dev->header_name);
3031
3032 return 0;
3033
3034out_err:
3035 kfree(rbd_dev->header_name);
3036 rbd_dev->header_name = NULL;
3037 kfree(rbd_dev->image_id);
3038 rbd_dev->image_id = NULL;
3039
3040 return ret;
3041}
3042
/*
 * Probe helper for format 2 images: build the header object name
 * from the (already-determined) image id, then fetch the image's
 * size/order, object prefix, features, and snapshot context from
 * the osd.  On failure all allocated name/prefix fields are freed
 * and reset.  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3104
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Try to read the image id object first.  Failure (typically
	 * ENOENT) means no such object exists, so the image is
	 * treated as format 1; otherwise run the format 2 probe.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3129
/*
 * Handle a write to /sys/bus/rbd/add.  Parses the user-supplied
 * mapping specification in @buf, allocates and initializes a new
 * rbd_device, connects it to a ceph cluster, probes the image
 * header, and registers the resulting block device with sysfs and
 * the block layer.
 *
 * Returns @count on success, or a negative errno on failure.  On
 * failure, everything acquired so far is unwound via the goto
 * chain below — the labels run in exact reverse order of
 * acquisition, so the order of the err_out_* blocks must not be
 * rearranged.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	char *snap_name;
	struct ceph_options *ceph_opts;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the device */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	ceph_opts = rbd_add_parse_args(rbd_dev, buf, &snap_name);
	if (IS_ERR(ceph_opts)) {
		rc = PTR_ERR(ceph_opts);
		goto err_out_mem;
	}

	rc = rbd_get_client(rbd_dev, ceph_opts);
	if (rc < 0)
		goto err_out_args;
	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = (u64) rc;

	/* Read the image header (format 1 or 2) */
	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_probe;

	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
	if (rc)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	return rc;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);
err_out_probe:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	/* ceph_opts is NULL here if ownership passed to the client */
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	kfree(rbd_dev);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
3268
Alex Elderde71a292012-07-03 16:01:19 -05003269static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003270{
3271 struct list_head *tmp;
3272 struct rbd_device *rbd_dev;
3273
Alex Eldere124a82f2012-01-29 13:57:44 -06003274 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003275 list_for_each(tmp, &rbd_dev_list) {
3276 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003277 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06003278 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003279 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06003280 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003281 }
Alex Eldere124a82f2012-01-29 13:57:44 -06003282 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003283 return NULL;
3284}
3285
/*
 * Device-model release callback for an rbd device.  Runs when the
 * last reference to the struct device is dropped (after
 * rbd_bus_del_dev()), and tears down everything rbd_add() set up:
 * watch state, client connection, disk, blkdev registration, header
 * fields, id, and finally the rbd_device itself.  The teardown order
 * mirrors the reverse of setup and must not be rearranged.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before dropping the client */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref taken in rbd_add() */
	module_put(THIS_MODULE);
}
3320
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003321static ssize_t rbd_remove(struct bus_type *bus,
3322 const char *buf,
3323 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003324{
3325 struct rbd_device *rbd_dev = NULL;
3326 int target_id, rc;
3327 unsigned long ul;
3328 int ret = count;
3329
3330 rc = strict_strtoul(buf, 10, &ul);
3331 if (rc)
3332 return rc;
3333
3334 /* convert to int; abort if we lost anything in the conversion */
3335 target_id = (int) ul;
3336 if (target_id != ul)
3337 return -EINVAL;
3338
3339 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3340
3341 rbd_dev = __rbd_get_dev(target_id);
3342 if (!rbd_dev) {
3343 ret = -ENOENT;
3344 goto done;
3345 }
3346
Alex Elder41f38c22012-10-25 23:34:40 -05003347 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003348 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003349
3350done:
3351 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003352
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003353 return ret;
3354}
3355
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003356/*
3357 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003358 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003359 */
3360static int rbd_sysfs_init(void)
3361{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003362 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003363
Alex Elderfed4c142012-02-07 12:03:36 -06003364 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003365 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003366 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003367
Alex Elderfed4c142012-02-07 12:03:36 -06003368 ret = bus_register(&rbd_bus_type);
3369 if (ret < 0)
3370 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003371
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003372 return ret;
3373}
3374
/* Remove the sysfs control files: reverse of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3380
3381int __init rbd_init(void)
3382{
3383 int rc;
3384
3385 rc = rbd_sysfs_init();
3386 if (rc)
3387 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003388 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003389 return 0;
3390}
3391
/* Module exit point: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3396
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");