/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
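
/*
 * Editorial sketch (not from the original source) of how these bits
 * are meant to be used: when an image is probed, any feature bit the
 * image requires that is not present in RBD_FEATURES_SUPPORTED
 * should cause the map attempt to be refused rather than the image
 * being mapped incorrectly.  Roughly:
 *
 *	u64 unsupported = features & ~RBD_FEATURES_SUPPORTED;
 *	if (unsupported)
 *		return -ENXIO;	(refuse to map the image)
 */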

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
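
/*
 * Worked example (editorial): each byte of an int contributes
 * log10(256), roughly 2.4, decimal digits, which the expression
 * above rounds up to 2.5 digits per byte.  For a 4-byte int that
 * yields (5 * 4) / 2 + 1 = 11 bytes, enough for the 10 digits of
 * INT_MAX (2147483647) plus a byte of slack for a terminator.
 */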

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 obj_version;
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;

	u64		snap_id;
	char		*snap_name;

	struct kref	kref;
};
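
/*
 * Illustrative example (an assumption for clarity, not part of the
 * original source): mapping "mypool/myimage@mysnap" yields a spec
 * whose pool_name is "mypool", image_name is "myimage" and
 * snap_name is "mysnap"; the corresponding pool_id, image_id and
 * snap_id are filled in by lookups during discovery.  Mapping the
 * image head rather than a snapshot uses the name "-"
 * (RBD_SNAP_HEAD_NAME) and snap_id CEPH_NOSNAP.
 */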

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};

struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn in image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	spinlock_t		completion_lock; /* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred; /* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
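
/*
 * Usage sketch (editorial): these macros walk the object requests
 * belonging to an image request, e.g.
 *
 *	struct rbd_obj_request *obj_request;
 *
 *	for_each_obj_request(img_request, obj_request)
 *		dout("obj %p covers %llu~%llu\n", obj_request,
 *		     obj_request->offset, obj_request->length);
 *
 * The "safe" variant walks the list in reverse and tolerates removal
 * of the current entry, which suits teardown paths.
 */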

struct rbd_snap {
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};

struct rbd_mapping {
	u64			size;
	u64			features;
	bool			read_only;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event	*watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	u64			stripe_unit;
	u64			stripe_count;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};

static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_snap_destroy(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static int rbd_dev_probe(struct rbd_device *rbd_dev);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
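
/*
 * Usage note (editorial): rbd_warn() tolerates a null rbd_dev, so
 * callers with no device context can still log, e.g.
 *
 *	rbd_warn(NULL, "bio_chain exhausted with %u left", len);
 *
 * as done in bio_chain_clone_range() below.
 */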

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);

static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);
	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
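
/*
 * Example (editorial): these tokens are matched against the
 * comma-separated option string handed to the sysfs "add" interface,
 * so a mapping can be made read-only with either spelling,
 * "read_only" or just "ro".  Tokens libceph does not itself
 * recognize are handed to parse_rbd_opts_token() below (an
 * assumption based on how match_table_t private parsers are
 * typically wired up; the registration happens outside this
 * excerpt).
 */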

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}
	return 0;
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Caller must not hold rbd_client_list_lock; it is taken here while
 * the client is unlinked from the list.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
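
/*
 * Editorial note on the bounds above: on 64-bit builds size_t is
 * 64 bits wide and these limits are practically unreachable; they
 * matter on 32-bit builds, where a corrupt or hostile on-disk header
 * could otherwise make the snapshot id array plus the name buffer
 * overflow a size_t allocation size.
 */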

/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX)
			return -EIO;
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees
		 * the ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		WARN_ON(ondisk->snap_names_len);
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->features = 0;	/* No features support in v1 images */
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	header->image_size = le64_to_cpu(ondisk->image_size);
	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] =
			le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}
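
/*
 * Layout note (editorial summary of what the copy above relies on):
 * in a format 1 on-disk header the snapshot id/size array
 * (ondisk->snaps[]) is immediately followed by the concatenated,
 * NUL-terminated snapshot names, which is why the names are copied
 * starting from &ondisk->snaps[snap_count].
 */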

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}

static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev,
					const char *snap_name)
{
	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (!strcmp(snap_name, snap->name))
			return snap;

	return NULL;
}

static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
	} else {
		struct rbd_snap *snap;

		snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (!snap)
			return -ENOENT;
		rbd_dev->spec->snap_id = snap->id;
		rbd_dev->mapping.size = snap->size;
		rbd_dev->mapping.features = snap->features;
		rbd_dev->mapping.read_only = true;
	}
	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);

	return 0;
}

static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}

static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
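
/*
 * Worked example (editorial): with an obj_order of 22 a segment is
 * 4 MiB.  An image byte offset of 10 MiB (0xa00000) then lands in
 * segment 2, named "<object_prefix>.000000000002", at byte 0x200000
 * within that object, and rbd_segment_length() clamps a request so
 * that it never crosses the 4 MiB segment boundary.
 */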

/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}

/*
 * similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		page_offset = (size_t)(offset & ~PAGE_MASK);
		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}
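
/*
 * Example (editorial): on a system with 4 KiB pages,
 * zero_pages(pages, 512, 4096) clears bytes 512..4095 of the first
 * page, mapping each affected page just long enough to memset() the
 * relevant range.
 */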

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
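
/*
 * Usage sketch (editorial): a caller splitting a block request into
 * per-object requests advances the (bio, offset) cursor across
 * successive calls, e.g.
 *
 *	struct bio *bio = rq->bio;
 *	unsigned int offset = 0;
 *
 *	clone = bio_chain_clone_range(&bio, &offset, obj_len, GFP_ATOMIC);
 *
 * so each clone covers exactly one object-sized slice and the next
 * call picks up where the previous one left off.
 */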

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
			obj_request);
	}
}

static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses from two existence
 * checks are separated by the creation of the target object, so
 * that the first ("doesn't exist") response arrives *after* the
 * second ("does exist").  In that case the stale, later-arriving
 * "doesn't exist" response is effectively ignored, since the
 * EXISTS bit, once set, is never cleared.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}

static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}

static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}

static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}
1298
Alex Elderbf0d5f502012-11-22 00:00:08 -06001299static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1300 struct rbd_obj_request *obj_request)
1301{
Alex Elder37206ee2013-02-20 17:32:08 -06001302 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1303
Alex Elderbf0d5f502012-11-22 00:00:08 -06001304 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1305}
1306
1307static void rbd_img_request_complete(struct rbd_img_request *img_request)
1308{
Alex Elder55f27e02013-04-10 12:34:25 -05001309
Alex Elder37206ee2013-02-20 17:32:08 -06001310 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001311
1312 /*
1313 * If no error occurred, compute the aggregate transfer
1314 * count for the image request. We could instead use
1315 * atomic64_cmpxchg() to update it as each object request
 1316	 * completes; it's not clear offhand which way is better.
1317 */
1318 if (!img_request->result) {
1319 struct rbd_obj_request *obj_request;
1320 u64 xferred = 0;
1321
1322 for_each_obj_request(img_request, obj_request)
1323 xferred += obj_request->xferred;
1324 img_request->xferred = xferred;
1325 }
1326
Alex Elderbf0d5f502012-11-22 00:00:08 -06001327 if (img_request->callback)
1328 img_request->callback(img_request);
1329 else
1330 rbd_img_request_put(img_request);
1331}
1332
Alex Elder788e2df2013-01-17 12:25:27 -06001333/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1334
1335static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1336{
Alex Elder37206ee2013-02-20 17:32:08 -06001337 dout("%s: obj %p\n", __func__, obj_request);
1338
Alex Elder788e2df2013-01-17 12:25:27 -06001339 return wait_for_completion_interruptible(&obj_request->completion);
1340}
1341
Alex Elder0c425242013-02-08 09:55:49 -06001342/*
1343 * The default/initial value for all image request flags is 0. Each
1344 * is conditionally set to 1 at image request initialization time
 1345	 * and currently never changes thereafter.
1346 */
1347static void img_request_write_set(struct rbd_img_request *img_request)
1348{
1349 set_bit(IMG_REQ_WRITE, &img_request->flags);
1350 smp_mb();
1351}
1352
1353static bool img_request_write_test(struct rbd_img_request *img_request)
1354{
1355 smp_mb();
1356 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1357}
1358
Alex Elder9849e982013-01-24 16:13:36 -06001359static void img_request_child_set(struct rbd_img_request *img_request)
1360{
1361 set_bit(IMG_REQ_CHILD, &img_request->flags);
1362 smp_mb();
1363}
1364
1365static bool img_request_child_test(struct rbd_img_request *img_request)
1366{
1367 smp_mb();
1368 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1369}
1370
Alex Elderd0b2e942013-01-24 16:13:36 -06001371static void img_request_layered_set(struct rbd_img_request *img_request)
1372{
1373 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1374 smp_mb();
1375}
1376
1377static bool img_request_layered_test(struct rbd_img_request *img_request)
1378{
1379 smp_mb();
1380 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1381}
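
/*
 * Illustrative sketch (editor's note, not driver code): the helpers
 * above pair set_bit()/test_bit() with smp_mb() so a flag set once
 * at initialization on one CPU is reliably seen by later readers on
 * another. Hypothetical usage:
 *
 *	img_request_write_set(img_request);
 *	...
 *	if (img_request_write_test(img_request))
 *		snapc = img_request->snapc;
 */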
1382
Alex Elder6e2a4502013-03-27 09:16:30 -05001383static void
1384rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1385{
Alex Elderb9434c52013-04-19 15:34:50 -05001386 u64 xferred = obj_request->xferred;
1387 u64 length = obj_request->length;
1388
Alex Elder6e2a4502013-03-27 09:16:30 -05001389 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1390 obj_request, obj_request->img_request, obj_request->result,
Alex Elderb9434c52013-04-19 15:34:50 -05001391 xferred, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001392 /*
1393 * ENOENT means a hole in the image. We zero-fill the
1394 * entire length of the request. A short read also implies
1395 * zero-fill to the end of the request. Either way we
1396 * update the xferred count to indicate the whole request
1397 * was satisfied.
1398 */
Alex Elderb9434c52013-04-19 15:34:50 -05001399 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
Alex Elder6e2a4502013-03-27 09:16:30 -05001400 if (obj_request->result == -ENOENT) {
Alex Elderb9434c52013-04-19 15:34:50 -05001401 if (obj_request->type == OBJ_REQUEST_BIO)
1402 zero_bio_chain(obj_request->bio_list, 0);
1403 else
1404 zero_pages(obj_request->pages, 0, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001405 obj_request->result = 0;
Alex Elderb9434c52013-04-19 15:34:50 -05001406 obj_request->xferred = length;
1407 } else if (xferred < length && !obj_request->result) {
1408 if (obj_request->type == OBJ_REQUEST_BIO)
1409 zero_bio_chain(obj_request->bio_list, xferred);
1410 else
1411 zero_pages(obj_request->pages, xferred, length);
1412 obj_request->xferred = length;
Alex Elder6e2a4502013-03-27 09:16:30 -05001413 }
1414 obj_request_done_set(obj_request);
1415}
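
/*
 * Worked example (editor's note, assuming a 4 KiB read): if
 * obj_request->length is 4096 and the osd returns only 1024 bytes
 * (xferred == 1024) with no error, the code above zeroes bytes
 * 1024..4095 of the bio chain or page array and reports xferred as
 * 4096, so the whole request appears satisfied. An -ENOENT result
 * zeroes all 4096 bytes the same way.
 */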
1416
Alex Elderbf0d5f502012-11-22 00:00:08 -06001417static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1418{
Alex Elder37206ee2013-02-20 17:32:08 -06001419 dout("%s: obj %p cb %p\n", __func__, obj_request,
1420 obj_request->callback);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001421 if (obj_request->callback)
1422 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001423 else
1424 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001425}
1426
Alex Elderc47f9372013-02-26 14:23:07 -06001427static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
Alex Elder39bf2c52013-02-26 14:23:07 -06001428{
1429 dout("%s: obj %p\n", __func__, obj_request);
1430 obj_request_done_set(obj_request);
1431}
1432
Alex Elderc47f9372013-02-26 14:23:07 -06001433static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001434{
Alex Elder57acbaa2013-02-11 12:33:24 -06001435 struct rbd_img_request *img_request = NULL;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001436 struct rbd_device *rbd_dev = NULL;
Alex Elder57acbaa2013-02-11 12:33:24 -06001437 bool layered = false;
1438
1439 if (obj_request_img_data_test(obj_request)) {
1440 img_request = obj_request->img_request;
1441 layered = img_request && img_request_layered_test(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001442 rbd_dev = img_request->rbd_dev;
Alex Elder57acbaa2013-02-11 12:33:24 -06001443 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06001444
1445 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1446 obj_request, img_request, obj_request->result,
1447 obj_request->xferred, obj_request->length);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001448 if (layered && obj_request->result == -ENOENT &&
1449 obj_request->img_offset < rbd_dev->parent_overlap)
Alex Elder8b3e1a52013-01-24 16:13:36 -06001450 rbd_img_parent_read(obj_request);
1451 else if (img_request)
Alex Elder6e2a4502013-03-27 09:16:30 -05001452 rbd_img_obj_request_read_callback(obj_request);
1453 else
1454 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001455}
1456
Alex Elderc47f9372013-02-26 14:23:07 -06001457static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001458{
Sage Weil1b83bef2013-02-25 16:11:12 -08001459 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1460 obj_request->result, obj_request->length);
1461 /*
Alex Elder8b3e1a52013-01-24 16:13:36 -06001462	 * There is no such thing as a successful short write.  Set
 1463	 * the transfer count to our originally-requested length.
Sage Weil1b83bef2013-02-25 16:11:12 -08001464 */
1465 obj_request->xferred = obj_request->length;
Alex Elder07741302013-02-05 23:41:50 -06001466 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001467}
1468
Alex Elderfbfab532013-02-08 09:55:48 -06001469/*
1470 * For a simple stat call there's nothing to do. We'll do more if
1471 * this is part of a write sequence for a layered image.
1472 */
Alex Elderc47f9372013-02-26 14:23:07 -06001473static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
Alex Elderfbfab532013-02-08 09:55:48 -06001474{
Alex Elder37206ee2013-02-20 17:32:08 -06001475 dout("%s: obj %p\n", __func__, obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001476 obj_request_done_set(obj_request);
1477}
1478
Alex Elderbf0d5f502012-11-22 00:00:08 -06001479static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1480 struct ceph_msg *msg)
1481{
1482 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001483 u16 opcode;
1484
Alex Elder37206ee2013-02-20 17:32:08 -06001485 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001486 rbd_assert(osd_req == obj_request->osd_req);
Alex Elder57acbaa2013-02-11 12:33:24 -06001487 if (obj_request_img_data_test(obj_request)) {
1488 rbd_assert(obj_request->img_request);
1489 rbd_assert(obj_request->which != BAD_WHICH);
1490 } else {
1491 rbd_assert(obj_request->which == BAD_WHICH);
1492 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001493
Sage Weil1b83bef2013-02-25 16:11:12 -08001494 if (osd_req->r_result < 0)
1495 obj_request->result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001496 obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1497
Alex Elder0eefd472013-04-19 15:34:50 -05001498 BUG_ON(osd_req->r_num_ops > 2);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001499
Alex Elderc47f9372013-02-26 14:23:07 -06001500 /*
1501 * We support a 64-bit length, but ultimately it has to be
1502 * passed to blk_end_request(), which takes an unsigned int.
1503 */
Sage Weil1b83bef2013-02-25 16:11:12 -08001504 obj_request->xferred = osd_req->r_reply_op_len[0];
Alex Elder8b3e1a52013-01-24 16:13:36 -06001505 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
Alex Elder79528732013-04-03 21:32:51 -05001506 opcode = osd_req->r_ops[0].op;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001507 switch (opcode) {
1508 case CEPH_OSD_OP_READ:
Alex Elderc47f9372013-02-26 14:23:07 -06001509 rbd_osd_read_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001510 break;
1511 case CEPH_OSD_OP_WRITE:
Alex Elderc47f9372013-02-26 14:23:07 -06001512 rbd_osd_write_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001513 break;
Alex Elderfbfab532013-02-08 09:55:48 -06001514 case CEPH_OSD_OP_STAT:
Alex Elderc47f9372013-02-26 14:23:07 -06001515 rbd_osd_stat_callback(obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001516 break;
Alex Elder36be9a72013-01-19 00:30:28 -06001517 case CEPH_OSD_OP_CALL:
Alex Elderb8d70032012-11-30 17:53:04 -06001518 case CEPH_OSD_OP_NOTIFY_ACK:
Alex Elder9969ebc2013-01-18 12:31:10 -06001519 case CEPH_OSD_OP_WATCH:
Alex Elderc47f9372013-02-26 14:23:07 -06001520 rbd_osd_trivial_callback(obj_request);
Alex Elder9969ebc2013-01-18 12:31:10 -06001521 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001522 default:
1523 rbd_warn(NULL, "%s: unsupported op %hu\n",
1524 obj_request->object_name, (unsigned short) opcode);
1525 break;
1526 }
1527
Alex Elder07741302013-02-05 23:41:50 -06001528 if (obj_request_done_test(obj_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001529 rbd_obj_request_complete(obj_request);
1530}
1531
Alex Elder9d4df012013-04-19 15:34:50 -05001532static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001533{
1534 struct rbd_img_request *img_request = obj_request->img_request;
Alex Elder8c042b02013-04-03 01:28:58 -05001535 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001536 u64 snap_id;
Alex Elder430c28c2013-04-03 21:32:51 -05001537
Alex Elder8c042b02013-04-03 01:28:58 -05001538 rbd_assert(osd_req != NULL);
Alex Elder430c28c2013-04-03 21:32:51 -05001539
Alex Elder9d4df012013-04-19 15:34:50 -05001540 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
Alex Elder8c042b02013-04-03 01:28:58 -05001541 ceph_osdc_build_request(osd_req, obj_request->offset,
Alex Elder9d4df012013-04-19 15:34:50 -05001542 NULL, snap_id, NULL);
1543}
1544
1545static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1546{
1547 struct rbd_img_request *img_request = obj_request->img_request;
1548 struct ceph_osd_request *osd_req = obj_request->osd_req;
1549 struct ceph_snap_context *snapc;
1550 struct timespec mtime = CURRENT_TIME;
1551
1552 rbd_assert(osd_req != NULL);
1553
1554 snapc = img_request ? img_request->snapc : NULL;
1555 ceph_osdc_build_request(osd_req, obj_request->offset,
1556 snapc, CEPH_NOSNAP, &mtime);
Alex Elder430c28c2013-04-03 21:32:51 -05001557}
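
/*
 * Editor's note: the two helpers above differ only in what they pass
 * to ceph_osdc_build_request().  A read is tagged with the snapshot
 * id being read (CEPH_NOSNAP for the head), while a write carries
 * the snapshot context and a modification time:
 *
 *	read:  ceph_osdc_build_request(osd_req, offset, NULL, snap_id, NULL);
 *	write: ceph_osdc_build_request(osd_req, offset, snapc, CEPH_NOSNAP, &mtime);
 */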
1558
Alex Elderbf0d5f502012-11-22 00:00:08 -06001559static struct ceph_osd_request *rbd_osd_req_create(
1560 struct rbd_device *rbd_dev,
1561 bool write_request,
Alex Elder430c28c2013-04-03 21:32:51 -05001562 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001563{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001564 struct ceph_snap_context *snapc = NULL;
1565 struct ceph_osd_client *osdc;
1566 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001567
Alex Elder6365d332013-02-11 12:33:24 -06001568 if (obj_request_img_data_test(obj_request)) {
1569 struct rbd_img_request *img_request = obj_request->img_request;
1570
Alex Elder0c425242013-02-08 09:55:49 -06001571 rbd_assert(write_request ==
1572 img_request_write_test(img_request));
1573 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001574 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001575 }
1576
1577 /* Allocate and initialize the request, for the single op */
1578
1579 osdc = &rbd_dev->rbd_client->client->osdc;
1580 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1581 if (!osd_req)
1582 return NULL; /* ENOMEM */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001583
Alex Elder430c28c2013-04-03 21:32:51 -05001584 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001585 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
Alex Elder430c28c2013-04-03 21:32:51 -05001586 else
Alex Elderbf0d5f502012-11-22 00:00:08 -06001587 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001588
1589 osd_req->r_callback = rbd_osd_req_callback;
1590 osd_req->r_priv = obj_request;
1591
1592 osd_req->r_oid_len = strlen(obj_request->object_name);
1593 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1594 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1595
1596 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1597
Alex Elderbf0d5f502012-11-22 00:00:08 -06001598 return osd_req;
1599}
1600
Alex Elder0eefd472013-04-19 15:34:50 -05001601/*
1602 * Create a copyup osd request based on the information in the
 1603	 * object request supplied.  A copyup request has two osd ops:
 1604	 * a copyup method call and a "normal" write request.
1605 */
1606static struct ceph_osd_request *
1607rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1608{
1609 struct rbd_img_request *img_request;
1610 struct ceph_snap_context *snapc;
1611 struct rbd_device *rbd_dev;
1612 struct ceph_osd_client *osdc;
1613 struct ceph_osd_request *osd_req;
1614
1615 rbd_assert(obj_request_img_data_test(obj_request));
1616 img_request = obj_request->img_request;
1617 rbd_assert(img_request);
1618 rbd_assert(img_request_write_test(img_request));
1619
1620 /* Allocate and initialize the request, for the two ops */
1621
1622 snapc = img_request->snapc;
1623 rbd_dev = img_request->rbd_dev;
1624 osdc = &rbd_dev->rbd_client->client->osdc;
1625 osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1626 if (!osd_req)
1627 return NULL; /* ENOMEM */
1628
1629 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1630 osd_req->r_callback = rbd_osd_req_callback;
1631 osd_req->r_priv = obj_request;
1632
1633 osd_req->r_oid_len = strlen(obj_request->object_name);
1634 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1635 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1636
1637 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1638
1639 return osd_req;
1640}
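
/*
 * Editor's sketch of the request built above: a copyup request
 * carries two ops, executed by the osd in order on the same object:
 *
 *	op 0: CEPH_OSD_OP_CALL "rbd.copyup"	(parent data payload)
 *	op 1: CEPH_OSD_OP_WRITE			(the original write)
 *
 * The ops themselves are filled in later, in
 * rbd_img_obj_parent_read_full_callback().
 */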
1641
1642
Alex Elderbf0d5f502012-11-22 00:00:08 -06001643static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1644{
1645 ceph_osdc_put_request(osd_req);
1646}
1647
1648/* object_name is assumed to be a non-null pointer and NUL-terminated */
1649
1650static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1651 u64 offset, u64 length,
1652 enum obj_request_type type)
1653{
1654 struct rbd_obj_request *obj_request;
1655 size_t size;
1656 char *name;
1657
1658 rbd_assert(obj_request_type_valid(type));
1659
1660 size = strlen(object_name) + 1;
1661 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1662 if (!obj_request)
1663 return NULL;
1664
1665 name = (char *)(obj_request + 1);
1666 obj_request->object_name = memcpy(name, object_name, size);
1667 obj_request->offset = offset;
1668 obj_request->length = length;
Alex Elder926f9b32013-02-11 12:33:24 -06001669 obj_request->flags = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001670 obj_request->which = BAD_WHICH;
1671 obj_request->type = type;
1672 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06001673 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001674 kref_init(&obj_request->kref);
1675
Alex Elder37206ee2013-02-20 17:32:08 -06001676 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1677 offset, length, (int)type, obj_request);
1678
Alex Elderbf0d5f502012-11-22 00:00:08 -06001679 return obj_request;
1680}
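
/*
 * Editor's note on the allocation above: the object name is stored
 * in the same kzalloc()'d block as the request, immediately after
 * the struct, so the single kfree() in rbd_obj_request_destroy()
 * releases both.  The layout is roughly:
 *
 *	[ struct rbd_obj_request | "<object name>\0" ]
 *	                           ^ (char *)(obj_request + 1)
 */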
1681
1682static void rbd_obj_request_destroy(struct kref *kref)
1683{
1684 struct rbd_obj_request *obj_request;
1685
1686 obj_request = container_of(kref, struct rbd_obj_request, kref);
1687
Alex Elder37206ee2013-02-20 17:32:08 -06001688 dout("%s: obj %p\n", __func__, obj_request);
1689
Alex Elderbf0d5f502012-11-22 00:00:08 -06001690 rbd_assert(obj_request->img_request == NULL);
1691 rbd_assert(obj_request->which == BAD_WHICH);
1692
1693 if (obj_request->osd_req)
1694 rbd_osd_req_destroy(obj_request->osd_req);
1695
1696 rbd_assert(obj_request_type_valid(obj_request->type));
1697 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001698 case OBJ_REQUEST_NODATA:
1699 break; /* Nothing to do */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001700 case OBJ_REQUEST_BIO:
1701 if (obj_request->bio_list)
1702 bio_chain_put(obj_request->bio_list);
1703 break;
Alex Elder788e2df2013-01-17 12:25:27 -06001704 case OBJ_REQUEST_PAGES:
1705 if (obj_request->pages)
1706 ceph_release_page_vector(obj_request->pages,
1707 obj_request->page_count);
1708 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001709 }
1710
1711 kfree(obj_request);
1712}
1713
1714/*
1715 * Caller is responsible for filling in the list of object requests
1716 * that comprises the image request, and the Linux request pointer
1717 * (if there is one).
1718 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001719static struct rbd_img_request *rbd_img_request_create(
1720 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001721 u64 offset, u64 length,
Alex Elder9849e982013-01-24 16:13:36 -06001722 bool write_request,
1723 bool child_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001724{
1725 struct rbd_img_request *img_request;
1726 struct ceph_snap_context *snapc = NULL;
1727
1728 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1729 if (!img_request)
1730 return NULL;
1731
1732 if (write_request) {
1733 down_read(&rbd_dev->header_rwsem);
1734 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1735 up_read(&rbd_dev->header_rwsem);
1736 if (WARN_ON(!snapc)) {
1737 kfree(img_request);
1738 return NULL; /* Shouldn't happen */
1739 }
Alex Elder0c425242013-02-08 09:55:49 -06001740
Alex Elderbf0d5f502012-11-22 00:00:08 -06001741 }
1742
1743 img_request->rq = NULL;
1744 img_request->rbd_dev = rbd_dev;
1745 img_request->offset = offset;
1746 img_request->length = length;
Alex Elder0c425242013-02-08 09:55:49 -06001747 img_request->flags = 0;
1748 if (write_request) {
1749 img_request_write_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001750 img_request->snapc = snapc;
Alex Elder0c425242013-02-08 09:55:49 -06001751 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06001752 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06001753 }
Alex Elder9849e982013-01-24 16:13:36 -06001754 if (child_request)
1755 img_request_child_set(img_request);
Alex Elderd0b2e942013-01-24 16:13:36 -06001756 if (rbd_dev->parent_spec)
1757 img_request_layered_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001758 spin_lock_init(&img_request->completion_lock);
1759 img_request->next_completion = 0;
1760 img_request->callback = NULL;
Alex Eldera5a337d2013-01-24 16:13:36 -06001761 img_request->result = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001762 img_request->obj_request_count = 0;
1763 INIT_LIST_HEAD(&img_request->obj_requests);
1764 kref_init(&img_request->kref);
1765
1766 rbd_img_request_get(img_request); /* Avoid a warning */
1767 rbd_img_request_put(img_request); /* TEMPORARY */
1768
Alex Elder37206ee2013-02-20 17:32:08 -06001769 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1770 write_request ? "write" : "read", offset, length,
1771 img_request);
1772
Alex Elderbf0d5f502012-11-22 00:00:08 -06001773 return img_request;
1774}
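
/*
 * Hypothetical usage sketch (editor's note): an image request is
 * created, filled with object requests, and submitted; completion
 * arrives through the callback, or the final reference is simply
 * dropped if no callback is set:
 *
 *	img_request = rbd_img_request_create(rbd_dev, offset, length,
 *						write_request, false);
 *	ret = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, bio_list);
 *	if (!ret)
 *		ret = rbd_img_request_submit(img_request);
 */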
1775
1776static void rbd_img_request_destroy(struct kref *kref)
1777{
1778 struct rbd_img_request *img_request;
1779 struct rbd_obj_request *obj_request;
1780 struct rbd_obj_request *next_obj_request;
1781
1782 img_request = container_of(kref, struct rbd_img_request, kref);
1783
Alex Elder37206ee2013-02-20 17:32:08 -06001784 dout("%s: img %p\n", __func__, img_request);
1785
Alex Elderbf0d5f502012-11-22 00:00:08 -06001786 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1787 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001788 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001789
Alex Elder0c425242013-02-08 09:55:49 -06001790 if (img_request_write_test(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001791 ceph_put_snap_context(img_request->snapc);
1792
Alex Elder8b3e1a52013-01-24 16:13:36 -06001793 if (img_request_child_test(img_request))
1794 rbd_obj_request_put(img_request->obj_request);
1795
Alex Elderbf0d5f502012-11-22 00:00:08 -06001796 kfree(img_request);
1797}
1798
Alex Elder12178572013-02-08 09:55:49 -06001799static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1800{
Alex Elder6365d332013-02-11 12:33:24 -06001801 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06001802 unsigned int xferred;
1803 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001804 bool more;
Alex Elder12178572013-02-08 09:55:49 -06001805
Alex Elder6365d332013-02-11 12:33:24 -06001806 rbd_assert(obj_request_img_data_test(obj_request));
1807 img_request = obj_request->img_request;
1808
Alex Elder12178572013-02-08 09:55:49 -06001809 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1810 xferred = (unsigned int)obj_request->xferred;
1811 result = obj_request->result;
1812 if (result) {
1813 struct rbd_device *rbd_dev = img_request->rbd_dev;
1814
1815 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1816 img_request_write_test(img_request) ? "write" : "read",
1817 obj_request->length, obj_request->img_offset,
1818 obj_request->offset);
1819 rbd_warn(rbd_dev, " result %d xferred %x\n",
1820 result, xferred);
1821 if (!img_request->result)
1822 img_request->result = result;
1823 }
1824
Alex Elderf1a47392013-04-19 15:34:50 -05001825 /* Image object requests don't own their page array */
1826
1827 if (obj_request->type == OBJ_REQUEST_PAGES) {
1828 obj_request->pages = NULL;
1829 obj_request->page_count = 0;
1830 }
1831
Alex Elder8b3e1a52013-01-24 16:13:36 -06001832 if (img_request_child_test(img_request)) {
1833 rbd_assert(img_request->obj_request != NULL);
1834 more = obj_request->which < img_request->obj_request_count - 1;
1835 } else {
1836 rbd_assert(img_request->rq != NULL);
1837 more = blk_end_request(img_request->rq, result, xferred);
1838 }
1839
1840 return more;
Alex Elder12178572013-02-08 09:55:49 -06001841}
1842
Alex Elder21692382013-04-05 01:27:12 -05001843static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1844{
1845 struct rbd_img_request *img_request;
1846 u32 which = obj_request->which;
1847 bool more = true;
1848
Alex Elder6365d332013-02-11 12:33:24 -06001849 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05001850 img_request = obj_request->img_request;
1851
1852 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1853 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05001854 rbd_assert(img_request->obj_request_count > 0);
1855 rbd_assert(which != BAD_WHICH);
1856 rbd_assert(which < img_request->obj_request_count);
1857 rbd_assert(which >= img_request->next_completion);
1858
1859 spin_lock_irq(&img_request->completion_lock);
1860 if (which != img_request->next_completion)
1861 goto out;
1862
1863 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05001864 rbd_assert(more);
1865 rbd_assert(which < img_request->obj_request_count);
1866
1867 if (!obj_request_done_test(obj_request))
1868 break;
Alex Elder12178572013-02-08 09:55:49 -06001869 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05001870 which++;
1871 }
1872
1873 rbd_assert(more ^ (which == img_request->obj_request_count));
1874 img_request->next_completion = which;
1875out:
1876 spin_unlock_irq(&img_request->completion_lock);
1877
1878 if (!more)
1879 rbd_img_request_complete(img_request);
1880}
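
/*
 * Worked example (editor's note): completions are reported to the
 * block layer in object order.  If object requests 0..2 complete in
 * the order 1, 2, 0, the callbacks for 1 and 2 each bail out early
 * because which != next_completion (still 0); when 0 finally
 * completes, the loop above ends all three requests in order and
 * advances next_completion to 3.
 */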
1881
Alex Elderf1a47392013-04-19 15:34:50 -05001882/*
1883 * Split up an image request into one or more object requests, each
1884 * to a different object. The "type" parameter indicates whether
1885 * "data_desc" is the pointer to the head of a list of bio
1886 * structures, or the base of a page array. In either case this
1887 * function assumes data_desc describes memory sufficient to hold
1888 * all data described by the image request.
1889 */
1890static int rbd_img_request_fill(struct rbd_img_request *img_request,
1891 enum obj_request_type type,
1892 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001893{
1894 struct rbd_device *rbd_dev = img_request->rbd_dev;
1895 struct rbd_obj_request *obj_request = NULL;
1896 struct rbd_obj_request *next_obj_request;
Alex Elder0c425242013-02-08 09:55:49 -06001897 bool write_request = img_request_write_test(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05001898 struct bio *bio_list;
1899 unsigned int bio_offset = 0;
1900 struct page **pages;
Alex Elder7da22d22013-01-24 16:13:36 -06001901 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001902 u64 resid;
1903 u16 opcode;
1904
Alex Elderf1a47392013-04-19 15:34:50 -05001905 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1906 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06001907
Alex Elder430c28c2013-04-03 21:32:51 -05001908 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
Alex Elder7da22d22013-01-24 16:13:36 -06001909 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001910 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06001911 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001912
1913 if (type == OBJ_REQUEST_BIO) {
1914 bio_list = data_desc;
1915 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1916 } else {
1917 rbd_assert(type == OBJ_REQUEST_PAGES);
1918 pages = data_desc;
1919 }
1920
Alex Elderbf0d5f502012-11-22 00:00:08 -06001921 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05001922 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001923 const char *object_name;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001924 u64 offset;
1925 u64 length;
1926
Alex Elder7da22d22013-01-24 16:13:36 -06001927 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001928 if (!object_name)
1929 goto out_unwind;
Alex Elder7da22d22013-01-24 16:13:36 -06001930 offset = rbd_segment_offset(rbd_dev, img_offset);
1931 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001932 obj_request = rbd_obj_request_create(object_name,
Alex Elderf1a47392013-04-19 15:34:50 -05001933 offset, length, type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001934 kfree(object_name); /* object request has its own copy */
1935 if (!obj_request)
1936 goto out_unwind;
1937
Alex Elderf1a47392013-04-19 15:34:50 -05001938 if (type == OBJ_REQUEST_BIO) {
1939 unsigned int clone_size;
1940
1941 rbd_assert(length <= (u64)UINT_MAX);
1942 clone_size = (unsigned int)length;
1943 obj_request->bio_list =
1944 bio_chain_clone_range(&bio_list,
1945 &bio_offset,
1946 clone_size,
1947 GFP_ATOMIC);
1948 if (!obj_request->bio_list)
1949 goto out_partial;
1950 } else {
1951 unsigned int page_count;
1952
1953 obj_request->pages = pages;
1954 page_count = (u32)calc_pages_for(offset, length);
1955 obj_request->page_count = page_count;
1956 if ((offset + length) & ~PAGE_MASK)
1957 page_count--; /* more on last page */
1958 pages += page_count;
1959 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001960
Alex Elder2fa12322013-04-05 01:27:12 -05001961 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1962 obj_request);
1963 if (!osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001964 goto out_partial;
Alex Elder2fa12322013-04-05 01:27:12 -05001965 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05001966 obj_request->callback = rbd_img_obj_callback;
Alex Elder430c28c2013-04-03 21:32:51 -05001967
Alex Elder2fa12322013-04-05 01:27:12 -05001968 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
1969 0, 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001970 if (type == OBJ_REQUEST_BIO)
1971 osd_req_op_extent_osd_data_bio(osd_req, 0,
1972 obj_request->bio_list, length);
1973 else
1974 osd_req_op_extent_osd_data_pages(osd_req, 0,
1975 obj_request->pages, length,
1976 offset & ~PAGE_MASK, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05001977
1978 if (write_request)
1979 rbd_osd_req_format_write(obj_request);
1980 else
1981 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05001982
Alex Elder7da22d22013-01-24 16:13:36 -06001983 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001984 rbd_img_obj_request_add(img_request, obj_request);
1985
Alex Elder7da22d22013-01-24 16:13:36 -06001986 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001987 resid -= length;
1988 }
1989
1990 return 0;
1991
1992out_partial:
1993 rbd_obj_request_put(obj_request);
1994out_unwind:
1995 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1996 rbd_obj_request_put(obj_request);
1997
1998 return -ENOMEM;
1999}
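
/*
 * Worked example (editor's note, assuming 4 MiB objects, that is,
 * obj_order 22): an image request at offset 6 MiB for 4 MiB of data
 * is split by the loop above into two object requests:
 *
 *	object A: offset 2 MiB, length 2 MiB	(img_offset 6 MiB)
 *	object B: offset 0,     length 2 MiB	(img_offset 8 MiB)
 *
 * rbd_segment_name()/_offset()/_length() supply the per-segment
 * values.
 */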
2000
Alex Elder3d7efd12013-04-19 15:34:50 -05002001static void
Alex Elder0eefd472013-04-19 15:34:50 -05002002rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2003{
2004 struct rbd_img_request *img_request;
2005 struct rbd_device *rbd_dev;
2006 u64 length;
2007 u32 page_count;
2008
2009 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2010 rbd_assert(obj_request_img_data_test(obj_request));
2011 img_request = obj_request->img_request;
2012 rbd_assert(img_request);
2013
2014 rbd_dev = img_request->rbd_dev;
2015 rbd_assert(rbd_dev);
2016 length = (u64)1 << rbd_dev->header.obj_order;
2017 page_count = (u32)calc_pages_for(0, length);
2018
2019 rbd_assert(obj_request->copyup_pages);
2020 ceph_release_page_vector(obj_request->copyup_pages, page_count);
2021 obj_request->copyup_pages = NULL;
2022
2023 /*
2024 * We want the transfer count to reflect the size of the
2025 * original write request. There is no such thing as a
2026 * successful short write, so if the request was successful
2027 * we can just set it to the originally-requested length.
2028 */
2029 if (!obj_request->result)
2030 obj_request->xferred = obj_request->length;
2031
2032 /* Finish up with the normal image object callback */
2033
2034 rbd_img_obj_callback(obj_request);
2035}
2036
2037static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002038rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2039{
2040 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002041 struct ceph_osd_request *osd_req;
2042 struct ceph_osd_client *osdc;
2043 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002044 struct page **pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002045 int result;
2046 u64 obj_size;
2047 u64 xferred;
2048
2049 rbd_assert(img_request_child_test(img_request));
2050
2051 /* First get what we need from the image request */
2052
2053 pages = img_request->copyup_pages;
2054 rbd_assert(pages != NULL);
2055 img_request->copyup_pages = NULL;
2056
2057 orig_request = img_request->obj_request;
2058 rbd_assert(orig_request != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002059 rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
Alex Elder3d7efd12013-04-19 15:34:50 -05002060 result = img_request->result;
2061 obj_size = img_request->length;
2062 xferred = img_request->xferred;
2063
Alex Elder0eefd472013-04-19 15:34:50 -05002064 rbd_dev = img_request->rbd_dev;
2065 rbd_assert(rbd_dev);
2066 rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2067
Alex Elder3d7efd12013-04-19 15:34:50 -05002068 rbd_img_request_put(img_request);
2069
Alex Elder0eefd472013-04-19 15:34:50 -05002070 if (result)
2071 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002072
Alex Elder0eefd472013-04-19 15:34:50 -05002073 /* Allocate the new copyup osd request for the original request */
Alex Elder3d7efd12013-04-19 15:34:50 -05002074
Alex Elder0eefd472013-04-19 15:34:50 -05002075 result = -ENOMEM;
2076 rbd_assert(!orig_request->osd_req);
2077 osd_req = rbd_osd_req_create_copyup(orig_request);
2078 if (!osd_req)
2079 goto out_err;
2080 orig_request->osd_req = osd_req;
2081 orig_request->copyup_pages = pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002082
Alex Elder0eefd472013-04-19 15:34:50 -05002083 /* Initialize the copyup op */
2084
2085 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2086 osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2087 false, false);
2088
2089 /* Then the original write request op */
2090
2091 osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2092 orig_request->offset,
2093 orig_request->length, 0, 0);
2094 osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2095 orig_request->length);
2096
2097 rbd_osd_req_format_write(orig_request);
2098
2099 /* All set, send it off. */
2100
2101 orig_request->callback = rbd_img_obj_copyup_callback;
2102 osdc = &rbd_dev->rbd_client->client->osdc;
2103 result = rbd_obj_request_submit(osdc, orig_request);
2104 if (!result)
2105 return;
2106out_err:
2107 /* Record the error code and complete the request */
2108
2109 orig_request->result = result;
2110 orig_request->xferred = 0;
2111 obj_request_done_set(orig_request);
2112 rbd_obj_request_complete(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002113}
2114
2115/*
2116 * Read from the parent image the range of data that covers the
2117 * entire target of the given object request. This is used for
2118 * satisfying a layered image write request when the target of an
2119 * object request from the image request does not exist.
2120 *
2121 * A page array big enough to hold the returned data is allocated
2122 * and supplied to rbd_img_request_fill() as the "data descriptor."
2123 * When the read completes, this page array will be transferred to
2124 * the original object request for the copyup operation.
2125 *
2126 * If an error occurs, record it as the result of the original
2127 * object request and mark it done so it gets completed.
2128 */
2129static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2130{
2131 struct rbd_img_request *img_request = NULL;
2132 struct rbd_img_request *parent_request = NULL;
2133 struct rbd_device *rbd_dev;
2134 u64 img_offset;
2135 u64 length;
2136 struct page **pages = NULL;
2137 u32 page_count;
2138 int result;
2139
2140 rbd_assert(obj_request_img_data_test(obj_request));
2141 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2142
2143 img_request = obj_request->img_request;
2144 rbd_assert(img_request != NULL);
2145 rbd_dev = img_request->rbd_dev;
2146 rbd_assert(rbd_dev->parent != NULL);
2147
2148 /*
Alex Elder0eefd472013-04-19 15:34:50 -05002149 * First things first. The original osd request is of no
 2150	 * use to us any more; we'll need a new one that can hold
2151 * the two ops in a copyup request. We'll get that later,
2152 * but for now we can release the old one.
2153 */
2154 rbd_osd_req_destroy(obj_request->osd_req);
2155 obj_request->osd_req = NULL;
2156
2157 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002158 * Determine the byte range covered by the object in the
2159 * child image to which the original request was to be sent.
2160 */
2161 img_offset = obj_request->img_offset - obj_request->offset;
2162 length = (u64)1 << rbd_dev->header.obj_order;
2163
2164 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002165 * There is no defined parent data beyond the parent
2166 * overlap, so limit what we read at that boundary if
2167 * necessary.
2168 */
2169 if (img_offset + length > rbd_dev->parent_overlap) {
2170 rbd_assert(img_offset < rbd_dev->parent_overlap);
2171 length = rbd_dev->parent_overlap - img_offset;
2172 }
2173
2174 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002175 * Allocate a page array big enough to receive the data read
2176 * from the parent.
2177 */
2178 page_count = (u32)calc_pages_for(0, length);
2179 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2180 if (IS_ERR(pages)) {
2181 result = PTR_ERR(pages);
2182 pages = NULL;
2183 goto out_err;
2184 }
2185
2186 result = -ENOMEM;
2187 parent_request = rbd_img_request_create(rbd_dev->parent,
2188 img_offset, length,
2189 false, true);
2190 if (!parent_request)
2191 goto out_err;
2192 rbd_obj_request_get(obj_request);
2193 parent_request->obj_request = obj_request;
2194
2195 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2196 if (result)
2197 goto out_err;
2198 parent_request->copyup_pages = pages;
2199
2200 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2201 result = rbd_img_request_submit(parent_request);
2202 if (!result)
2203 return 0;
2204
2205 parent_request->copyup_pages = NULL;
2206 parent_request->obj_request = NULL;
2207 rbd_obj_request_put(obj_request);
2208out_err:
2209 if (pages)
2210 ceph_release_page_vector(pages, page_count);
2211 if (parent_request)
2212 rbd_img_request_put(parent_request);
2213 obj_request->result = result;
2214 obj_request->xferred = 0;
2215 obj_request_done_set(obj_request);
2216
2217 return result;
2218}
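
/*
 * Editor's summary of the copyup path implemented above: a write to
 * an object missing from a layered image first reads the covering
 * range from the parent (rbd_img_obj_parent_read_full()); its
 * completion handler, rbd_img_obj_parent_read_full_callback(),
 * issues the two-op copyup+write request; and
 * rbd_img_obj_copyup_callback() finally releases the parent pages
 * and completes the original write.
 */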
2219
Alex Elderc5b5ef62013-02-11 12:33:24 -06002220static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2221{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002222 struct rbd_obj_request *orig_request;
2223 int result;
2224
2225 rbd_assert(!obj_request_img_data_test(obj_request));
2226
2227 /*
2228 * All we need from the object request is the original
2229 * request and the result of the STAT op. Grab those, then
2230 * we're done with the request.
2231 */
2232 orig_request = obj_request->obj_request;
2233 obj_request->obj_request = NULL;
2234 rbd_assert(orig_request);
2235 rbd_assert(orig_request->img_request);
2236
2237 result = obj_request->result;
2238 obj_request->result = 0;
2239
2240 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2241 obj_request, orig_request, result,
2242 obj_request->xferred, obj_request->length);
2243 rbd_obj_request_put(obj_request);
2244
Alex Elderc5b5ef62013-02-11 12:33:24 -06002247
2248 /*
2249 * Our only purpose here is to determine whether the object
2250 * exists, and we don't want to treat the non-existence as
2251 * an error. If something else comes back, transfer the
2252 * error to the original request and complete it now.
2253 */
2254 if (!result) {
2255 obj_request_existence_set(orig_request, true);
2256 } else if (result == -ENOENT) {
2257 obj_request_existence_set(orig_request, false);
 2258	} else {
2259 orig_request->result = result;
Alex Elder3d7efd12013-04-19 15:34:50 -05002260 goto out;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002261 }
2262
2263 /*
2264 * Resubmit the original request now that we have recorded
2265 * whether the target object exists.
2266 */
Alex Elderb454e362013-04-19 15:34:50 -05002267 orig_request->result = rbd_img_obj_request_submit(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002268out:
Alex Elderc5b5ef62013-02-11 12:33:24 -06002269 if (orig_request->result)
2270 rbd_obj_request_complete(orig_request);
2271 rbd_obj_request_put(orig_request);
2272}
2273
2274static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2275{
2276 struct rbd_obj_request *stat_request;
2277 struct rbd_device *rbd_dev;
2278 struct ceph_osd_client *osdc;
2279 struct page **pages = NULL;
2280 u32 page_count;
2281 size_t size;
2282 int ret;
2283
2284 /*
2285 * The response data for a STAT call consists of:
2286 * le64 length;
2287 * struct {
2288 * le32 tv_sec;
2289 * le32 tv_nsec;
2290 * } mtime;
2291 */
2292 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2293 page_count = (u32)calc_pages_for(0, size);
2294 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2295 if (IS_ERR(pages))
2296 return PTR_ERR(pages);
2297
2298 ret = -ENOMEM;
2299 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2300 OBJ_REQUEST_PAGES);
2301 if (!stat_request)
2302 goto out;
2303
2304 rbd_obj_request_get(obj_request);
2305 stat_request->obj_request = obj_request;
2306 stat_request->pages = pages;
2307 stat_request->page_count = page_count;
2308
2309 rbd_assert(obj_request->img_request);
2310 rbd_dev = obj_request->img_request->rbd_dev;
2311 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2312 stat_request);
2313 if (!stat_request->osd_req)
2314 goto out;
2315 stat_request->callback = rbd_img_obj_exists_callback;
2316
2317 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2318 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2319 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002320 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002321
2322 osdc = &rbd_dev->rbd_client->client->osdc;
2323 ret = rbd_obj_request_submit(osdc, stat_request);
2324out:
2325 if (ret)
2326 rbd_obj_request_put(obj_request);
2327
2328 return ret;
2329}
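
/*
 * Editor's note: the reply decoded above is 16 bytes in total
 * (sizeof (__le64) + 2 * sizeof (__le32) == 8 + 4 + 4), so
 * calc_pages_for(0, size) always yields a single page here.
 */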
2330
Alex Elderb454e362013-04-19 15:34:50 -05002331static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2332{
2333 struct rbd_img_request *img_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002334 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002335 bool known;
Alex Elderb454e362013-04-19 15:34:50 -05002336
2337 rbd_assert(obj_request_img_data_test(obj_request));
2338
2339 img_request = obj_request->img_request;
2340 rbd_assert(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002341 rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002342
Alex Elderb454e362013-04-19 15:34:50 -05002343 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002344 * Only writes to layered images need special handling.
2345 * Reads and non-layered writes are simple object requests.
2346 * Layered writes that start beyond the end of the overlap
2347 * with the parent have no parent data, so they too are
2348 * simple object requests. Finally, if the target object is
2349 * known to already exist, its parent data has already been
2350 * copied, so a write to the object can also be handled as a
2351 * simple object request.
Alex Elderb454e362013-04-19 15:34:50 -05002352 */
2353 if (!img_request_write_test(img_request) ||
2354 !img_request_layered_test(img_request) ||
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002355 rbd_dev->parent_overlap <= obj_request->img_offset ||
Alex Elder3d7efd12013-04-19 15:34:50 -05002356 ((known = obj_request_known_test(obj_request)) &&
2357 obj_request_exists_test(obj_request))) {
Alex Elderb454e362013-04-19 15:34:50 -05002358
 2359		struct ceph_osd_client *osdc;
 2360
 2361		osdc = &rbd_dev->rbd_client->client->osdc;
2364
2365 return rbd_obj_request_submit(osdc, obj_request);
2366 }
2367
2368 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002369 * It's a layered write. The target object might exist but
2370 * we may not know that yet. If we know it doesn't exist,
2371 * start by reading the data for the full target object from
2372 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002373 */
Alex Elder3d7efd12013-04-19 15:34:50 -05002374 if (known)
2375 return rbd_img_obj_parent_read_full(obj_request);
2376
2377 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002378
2379 return rbd_img_obj_exists_submit(obj_request);
2380}
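
/*
 * Editor's sketch of the routing above, as a decision table:
 *
 *	read				-> submit directly
 *	write, non-layered		-> submit directly
 *	write beyond parent overlap	-> submit directly
 *	write, target known to exist	-> submit directly
 *	write, target known missing	-> rbd_img_obj_parent_read_full()
 *	write, existence unknown	-> rbd_img_obj_exists_submit()
 */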
2381
Alex Elderbf0d5f502012-11-22 00:00:08 -06002382static int rbd_img_request_submit(struct rbd_img_request *img_request)
2383{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002384 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002385 struct rbd_obj_request *next_obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002386
Alex Elder37206ee2013-02-20 17:32:08 -06002387 dout("%s: img %p\n", __func__, img_request);
Alex Elder46faeed2013-04-10 17:47:46 -05002388 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002389 int ret;
2390
Alex Elderb454e362013-04-19 15:34:50 -05002391 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002392 if (ret)
2393 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002394 }
2395
2396 return 0;
2397}
2398
Alex Elder8b3e1a52013-01-24 16:13:36 -06002399static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2400{
2401 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002402 struct rbd_device *rbd_dev;
2403 u64 obj_end;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002404
2405 rbd_assert(img_request_child_test(img_request));
2406
2407 obj_request = img_request->obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002408 rbd_assert(obj_request);
2409 rbd_assert(obj_request->img_request);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002410
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002411 obj_request->result = img_request->result;
2412 if (obj_request->result)
2413 goto out;
2414
2415 /*
2416 * We need to zero anything beyond the parent overlap
2417 * boundary. Since rbd_img_obj_request_read_callback()
2418 * will zero anything beyond the end of a short read, an
2419 * easy way to do this is to pretend the data from the
2420 * parent came up short--ending at the overlap boundary.
2421 */
2422 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2423 obj_end = obj_request->img_offset + obj_request->length;
2424 rbd_dev = obj_request->img_request->rbd_dev;
2425 if (obj_end > rbd_dev->parent_overlap) {
2426 u64 xferred = 0;
2427
2428 if (obj_request->img_offset < rbd_dev->parent_overlap)
2429 xferred = rbd_dev->parent_overlap -
2430 obj_request->img_offset;
2431
2432 obj_request->xferred = min(img_request->xferred, xferred);
2433 } else {
2434 obj_request->xferred = img_request->xferred;
2435 }
2436out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06002437 rbd_img_obj_request_read_callback(obj_request);
2438 rbd_obj_request_complete(obj_request);
2439}
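
/*
 * Worked example (editor's note): with a parent overlap of 6 MiB, a
 * child read covering image offsets 5..7 MiB (img_offset 5 MiB,
 * length 2 MiB) has obj_end == 7 MiB, which exceeds the overlap, so
 * xferred is clamped to 6 MiB - 5 MiB = 1 MiB; the read callback
 * then zero-fills the second half as if the parent had returned a
 * short read.
 */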
2440
2441static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2442{
2443 struct rbd_device *rbd_dev;
2444 struct rbd_img_request *img_request;
2445 int result;
2446
2447 rbd_assert(obj_request_img_data_test(obj_request));
2448 rbd_assert(obj_request->img_request != NULL);
2449 rbd_assert(obj_request->result == (s32) -ENOENT);
2450 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2451
2452 rbd_dev = obj_request->img_request->rbd_dev;
2453 rbd_assert(rbd_dev->parent != NULL);
2454 /* rbd_read_finish(obj_request, obj_request->length); */
2455 img_request = rbd_img_request_create(rbd_dev->parent,
2456 obj_request->img_offset,
2457 obj_request->length,
2458 false, true);
2459 result = -ENOMEM;
2460 if (!img_request)
2461 goto out_err;
2462
2463 rbd_obj_request_get(obj_request);
2464 img_request->obj_request = obj_request;
2465
Alex Elderf1a47392013-04-19 15:34:50 -05002466 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2467 obj_request->bio_list);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002468 if (result)
2469 goto out_err;
2470
2471 img_request->callback = rbd_img_parent_read_callback;
2472 result = rbd_img_request_submit(img_request);
2473 if (result)
2474 goto out_err;
2475
2476 return;
2477out_err:
2478 if (img_request)
2479 rbd_img_request_put(img_request);
2480 obj_request->result = result;
2481 obj_request->xferred = 0;
2482 obj_request_done_set(obj_request);
2483}
2484
Alex Eldercf81b602013-01-17 12:18:46 -06002485static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
Alex Elderb8d70032012-11-30 17:53:04 -06002486 u64 ver, u64 notify_id)
2487{
2488 struct rbd_obj_request *obj_request;
Alex Elder21692382013-04-05 01:27:12 -05002489 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elderb8d70032012-11-30 17:53:04 -06002490 int ret;
2491
2492 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2493 OBJ_REQUEST_NODATA);
2494 if (!obj_request)
2495 return -ENOMEM;
2496
2497 ret = -ENOMEM;
Alex Elder430c28c2013-04-03 21:32:51 -05002498 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002499 if (!obj_request->osd_req)
2500 goto out;
Alex Elder21692382013-04-05 01:27:12 -05002501 obj_request->callback = rbd_obj_request_put;
Alex Elderb8d70032012-11-30 17:53:04 -06002502
Alex Elderc99d2d42013-04-05 01:27:11 -05002503 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2504 notify_id, ver, 0);
Alex Elder9d4df012013-04-19 15:34:50 -05002505 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002506
Alex Elderb8d70032012-11-30 17:53:04 -06002507 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002508out:
Alex Eldercf81b602013-01-17 12:18:46 -06002509 if (ret)
2510 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002511
2512 return ret;
2513}
2514
2515static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2516{
2517 struct rbd_device *rbd_dev = (struct rbd_device *)data;
2518 u64 hver;
Alex Elderb8d70032012-11-30 17:53:04 -06002519
2520 if (!rbd_dev)
2521 return;
2522
Alex Elder37206ee2013-02-20 17:32:08 -06002523 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Elderb8d70032012-11-30 17:53:04 -06002524 rbd_dev->header_name, (unsigned long long) notify_id,
2525 (unsigned int) opcode);
Alex Elder522a0cc2013-04-25 15:09:41 -05002526 (void)rbd_dev_refresh(rbd_dev, &hver);
Alex Elderb8d70032012-11-30 17:53:04 -06002527
Alex Eldercf81b602013-01-17 12:18:46 -06002528 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06002529}
2530
Alex Elder9969ebc2013-01-18 12:31:10 -06002531/*
2532 * Request sync osd watch/unwatch. The value of "start" determines
2533 * whether a watch request is being initiated or torn down.
2534 */
2535static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
2536{
2537 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2538 struct rbd_obj_request *obj_request;
Alex Elder9969ebc2013-01-18 12:31:10 -06002539 int ret;
2540
2541 rbd_assert(start ^ !!rbd_dev->watch_event);
2542 rbd_assert(start ^ !!rbd_dev->watch_request);
2543
2544 if (start) {
Alex Elder3c663bb2013-02-15 11:42:30 -06002545 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
Alex Elder9969ebc2013-01-18 12:31:10 -06002546 &rbd_dev->watch_event);
2547 if (ret < 0)
2548 return ret;
Alex Elder8eb87562013-01-25 17:08:55 -06002549 rbd_assert(rbd_dev->watch_event != NULL);
Alex Elder9969ebc2013-01-18 12:31:10 -06002550 }
2551
2552 ret = -ENOMEM;
2553 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2554 OBJ_REQUEST_NODATA);
2555 if (!obj_request)
2556 goto out_cancel;
2557
Alex Elder430c28c2013-04-03 21:32:51 -05002558 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2559 if (!obj_request->osd_req)
2560 goto out_cancel;
2561
Alex Elder8eb87562013-01-25 17:08:55 -06002562 if (start)
Alex Elder975241a2013-01-25 17:08:55 -06002563 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
Alex Elder8eb87562013-01-25 17:08:55 -06002564 else
Alex Elder6977c3f2013-01-25 17:08:55 -06002565 ceph_osdc_unregister_linger_request(osdc,
Alex Elder975241a2013-01-25 17:08:55 -06002566 rbd_dev->watch_request->osd_req);
Alex Elder21692382013-04-05 01:27:12 -05002567
2568 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2569 rbd_dev->watch_event->cookie,
2570 rbd_dev->header.obj_version, start);
Alex Elder9d4df012013-04-19 15:34:50 -05002571 rbd_osd_req_format_write(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05002572
Alex Elder9969ebc2013-01-18 12:31:10 -06002573 ret = rbd_obj_request_submit(osdc, obj_request);
2574 if (ret)
2575 goto out_cancel;
2576 ret = rbd_obj_request_wait(obj_request);
2577 if (ret)
2578 goto out_cancel;
Alex Elder9969ebc2013-01-18 12:31:10 -06002579 ret = obj_request->result;
2580 if (ret)
2581 goto out_cancel;
2582
Alex Elder8eb87562013-01-25 17:08:55 -06002583 /*
2584 * A watch request is set to linger, so the underlying osd
2585 * request won't go away until we unregister it. We retain
2586 * a pointer to the object request during that time (in
2587 * rbd_dev->watch_request), so we'll keep a reference to
2588 * it. We'll drop that reference (below) after we've
2589 * unregistered it.
2590 */
2591 if (start) {
2592 rbd_dev->watch_request = obj_request;
2593
2594 return 0;
2595 }
2596
2597 /* We have successfully torn down the watch request */
2598
2599 rbd_obj_request_put(rbd_dev->watch_request);
2600 rbd_dev->watch_request = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002601out_cancel:
2602 /* Cancel the event if we're tearing down, or on error */
2603 ceph_osdc_cancel_event(rbd_dev->watch_event);
2604 rbd_dev->watch_event = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002605 if (obj_request)
2606 rbd_obj_request_put(obj_request);
2607
2608 return ret;
2609}
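
/*
 * Hypothetical usage (editor's note): a caller passes nonzero
 * "start" to register the header-object watch and zero to tear it
 * down again:
 *
 *	ret = rbd_dev_header_watch_sync(rbd_dev, 1);	(set up)
 *	...
 *	ret = rbd_dev_header_watch_sync(rbd_dev, 0);	(tear down)
 */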
2610
Alex Elder36be9a72013-01-19 00:30:28 -06002611/*
Alex Elderf40eb342013-04-25 15:09:42 -05002612 * Synchronous osd object method call. Returns the number of bytes
 2613	 * returned in the inbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06002614 */
2615static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2616 const char *object_name,
2617 const char *class_name,
2618 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05002619 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06002620 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05002621 void *inbound,
Alex Elder36be9a72013-01-19 00:30:28 -06002622 size_t inbound_size,
2623 u64 *version)
2624{
Alex Elder21692382013-04-05 01:27:12 -05002625 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder36be9a72013-01-19 00:30:28 -06002626 struct rbd_obj_request *obj_request;
Alex Elder36be9a72013-01-19 00:30:28 -06002627 struct page **pages;
2628 u32 page_count;
2629 int ret;
2630
2631 /*
Alex Elder6010a452013-04-05 01:27:11 -05002632 * Method calls are ultimately read operations. The result
 2633	 * should be placed into the inbound buffer provided.  They
2634 * also supply outbound data--parameters for the object
2635 * method. Currently if this is present it will be a
2636 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06002637 */
Alex Elder57385b52013-04-21 12:14:45 -05002638 page_count = (u32)calc_pages_for(0, inbound_size);
Alex Elder36be9a72013-01-19 00:30:28 -06002639 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2640 if (IS_ERR(pages))
2641 return PTR_ERR(pages);
2642
2643 ret = -ENOMEM;
Alex Elder6010a452013-04-05 01:27:11 -05002644 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
Alex Elder36be9a72013-01-19 00:30:28 -06002645 OBJ_REQUEST_PAGES);
2646 if (!obj_request)
2647 goto out;
2648
2649 obj_request->pages = pages;
2650 obj_request->page_count = page_count;
2651
Alex Elder430c28c2013-04-03 21:32:51 -05002652 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder36be9a72013-01-19 00:30:28 -06002653 if (!obj_request->osd_req)
2654 goto out;
2655
Alex Elderc99d2d42013-04-05 01:27:11 -05002656 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
Alex Elder04017e22013-04-05 14:46:02 -05002657 class_name, method_name);
2658 if (outbound_size) {
2659 struct ceph_pagelist *pagelist;
2660
2661 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2662 if (!pagelist)
2663 goto out;
2664
2665 ceph_pagelist_init(pagelist);
2666 ceph_pagelist_append(pagelist, outbound, outbound_size);
2667 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2668 pagelist);
2669 }
Alex Eldera4ce40a2013-04-05 01:27:12 -05002670 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2671 obj_request->pages, inbound_size,
Alex Elder44cd1882013-04-05 01:27:12 -05002672 0, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002673 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002674
Alex Elder36be9a72013-01-19 00:30:28 -06002675 ret = rbd_obj_request_submit(osdc, obj_request);
2676 if (ret)
2677 goto out;
2678 ret = rbd_obj_request_wait(obj_request);
2679 if (ret)
2680 goto out;
2681
2682 ret = obj_request->result;
2683 if (ret < 0)
2684 goto out;
Alex Elder57385b52013-04-21 12:14:45 -05002685
2686 rbd_assert(obj_request->xferred < (u64)INT_MAX);
2687 ret = (int)obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002688 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
Alex Elder36be9a72013-01-19 00:30:28 -06002689 if (version)
2690 *version = obj_request->version;
2691out:
2692 if (obj_request)
2693 rbd_obj_request_put(obj_request);
2694 else
2695 ceph_release_page_vector(pages, page_count);
2696
2697 return ret;
2698}
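
/*
 * A sketch of typical usage (illustrative only; it mirrors what
 * _rbd_dev_v2_snap_size() below does): parameters are passed as
 * packed little-endian values, and the reply is decoded from the
 * inbound buffer once the returned byte count has been checked.
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) size_buf = { 0 };
 *	int ret;
 *
 *	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 *				"rbd", "get_size",
 *				&snapid, sizeof (snapid),
 *				&size_buf, sizeof (size_buf), NULL);
 *	if (ret < 0)
 *		return ret;
 *	if (ret < sizeof (size_buf))
 *		return -ERANGE;
 */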
2699
Alex Elderbf0d5f502012-11-22 00:00:08 -06002700static void rbd_request_fn(struct request_queue *q)
Alex Eldercc344fa2013-02-19 12:25:56 -06002701 __releases(q->queue_lock) __acquires(q->queue_lock)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002702{
2703 struct rbd_device *rbd_dev = q->queuedata;
2704 bool read_only = rbd_dev->mapping.read_only;
2705 struct request *rq;
2706 int result;
2707
2708 while ((rq = blk_fetch_request(q))) {
2709 bool write_request = rq_data_dir(rq) == WRITE;
2710 struct rbd_img_request *img_request;
2711 u64 offset;
2712 u64 length;
2713
2714 /* Ignore any non-FS requests that filter through. */
2715
2716 if (rq->cmd_type != REQ_TYPE_FS) {
Alex Elder4dda41d2013-02-20 21:59:33 -06002717 dout("%s: non-fs request type %d\n", __func__,
2718 (int) rq->cmd_type);
2719 __blk_end_request_all(rq, 0);
2720 continue;
2721 }
2722
2723 /* Ignore/skip any zero-length requests */
2724
2725 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2726 length = (u64) blk_rq_bytes(rq);
2727
2728 if (!length) {
2729 dout("%s: zero-length request\n", __func__);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002730 __blk_end_request_all(rq, 0);
2731 continue;
2732 }
2733
2734 spin_unlock_irq(q->queue_lock);
2735
2736 /* Disallow writes to a read-only device */
2737
2738 if (write_request) {
2739 result = -EROFS;
2740 if (read_only)
2741 goto end_request;
2742 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2743 }
2744
Alex Elder6d292902013-01-14 12:43:31 -06002745 /*
2746 * Quit early if the mapped snapshot no longer
2747 * exists. It's still possible the snapshot will
2748 * have disappeared by the time our request arrives
2749 * at the osd, but there's no sense in sending it if
2750 * we already know.
2751 */
2752 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002753 dout("request for non-existent snapshot");
2754 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2755 result = -ENXIO;
2756 goto end_request;
2757 }
2758
Alex Elderbf0d5f502012-11-22 00:00:08 -06002759 result = -EINVAL;
2760 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2761 goto end_request; /* Shouldn't happen */
2762
2763 result = -ENOMEM;
2764 img_request = rbd_img_request_create(rbd_dev, offset, length,
Alex Elder9849e982013-01-24 16:13:36 -06002765 write_request, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002766 if (!img_request)
2767 goto end_request;
2768
2769 img_request->rq = rq;
2770
Alex Elderf1a47392013-04-19 15:34:50 -05002771 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2772 rq->bio);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002773 if (!result)
2774 result = rbd_img_request_submit(img_request);
2775 if (result)
2776 rbd_img_request_put(img_request);
2777end_request:
2778 spin_lock_irq(q->queue_lock);
2779 if (result < 0) {
Alex Elder7da22d22013-01-24 16:13:36 -06002780 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2781 write_request ? "write" : "read",
2782 length, offset, result);
2783
Alex Elderbf0d5f502012-11-22 00:00:08 -06002784 __blk_end_request_all(rq, result);
2785 }
2786 }
2787}
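
/*
 * Worked example (illustrative numbers): a filesystem request starting
 * at sector 2048 for 16 sectors gives offset = 2048 << SECTOR_SHIFT =
 * 1048576 bytes and length = 8192 bytes; rbd_img_request_fill() then
 * splits that extent along object boundaries.
 */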
2788
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002789/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002790 * A queue callback. Makes sure that we don't create a bio that spans across
2791 * multiple osd objects. One exception would be a single-page bio,
Alex Elderf7760da2012-10-20 22:17:27 -05002792 * which we handle later at bio_chain_clone_range().
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002793 */
2794static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2795 struct bio_vec *bvec)
2796{
2797 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05002798 sector_t sector_offset;
2799 sector_t sectors_per_obj;
2800 sector_t obj_sector_offset;
2801 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002802
Alex Eldere5cfeed22012-10-20 22:17:27 -05002803 /*
2804 * Convert the partition-relative bio start sector to an offset
2805 * relative to the enclosing device, then find how far into its
2806 * rbd object that offset falls.
2807 */
2808 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2809 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2810 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06002811
Alex Eldere5cfeed22012-10-20 22:17:27 -05002812 /*
2813 * Compute the number of bytes from that offset to the end
2814 * of the object. Account for what's already used by the bio.
2815 */
2816 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2817 if (ret > bmd->bi_size)
2818 ret -= bmd->bi_size;
2819 else
2820 ret = 0;
2821
2822 /*
2823 * Don't send back more than was asked for. And if the bio
2824 * was empty, let the whole thing through because: "Note
2825 * that a block device *must* allow a single page to be
2826 * added to an empty bio."
2827 */
2828 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2829 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2830 ret = (int) bvec->bv_len;
2831
2832 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002833}
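
/*
 * Worked example (illustrative, assuming obj_order is 22, i.e. 4 MiB
 * objects): sectors_per_obj = 1 << (22 - SECTOR_SHIFT) = 8192.  For a
 * bio whose device-relative start sector is 8190, obj_sector_offset is
 * 8190, leaving 2 sectors (1024 bytes) to the object boundary.  An
 * empty bio may therefore grow by up to 1024 bytes here; one already
 * carrying 512 bytes may grow by at most 512 more.
 */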
2834
2835static void rbd_free_disk(struct rbd_device *rbd_dev)
2836{
2837 struct gendisk *disk = rbd_dev->disk;
2838
2839 if (!disk)
2840 return;
2841
Alex Eldera0cab922013-04-25 23:15:08 -05002842 rbd_dev->disk = NULL;
2843 if (disk->flags & GENHD_FL_UP) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002844 del_gendisk(disk);
Alex Eldera0cab922013-04-25 23:15:08 -05002845 if (disk->queue)
2846 blk_cleanup_queue(disk->queue);
2847 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002848 put_disk(disk);
2849}
2850
Alex Elder788e2df2013-01-17 12:25:27 -06002851static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2852 const char *object_name,
2853 u64 offset, u64 length,
Alex Elder80ef15b2013-04-21 12:14:45 -05002854 void *buf, u64 *version)
2856{
Alex Elder21692382013-04-05 01:27:12 -05002857 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06002858 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06002859 struct page **pages = NULL;
2860 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002861 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002862 int ret;
2863
2864 page_count = (u32) calc_pages_for(offset, length);
2865 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2866 if (IS_ERR(pages))
2867 return PTR_ERR(pages);
2868
2869 ret = -ENOMEM;
2870 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002871 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002872 if (!obj_request)
2873 goto out;
2874
2875 obj_request->pages = pages;
2876 obj_request->page_count = page_count;
2877
Alex Elder430c28c2013-04-03 21:32:51 -05002878 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06002879 if (!obj_request->osd_req)
2880 goto out;
2881
Alex Elderc99d2d42013-04-05 01:27:11 -05002882 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2883 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05002884 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05002885 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05002886 obj_request->length,
2887 obj_request->offset & ~PAGE_MASK,
2888 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002889 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002890
Alex Elder788e2df2013-01-17 12:25:27 -06002891 ret = rbd_obj_request_submit(osdc, obj_request);
2892 if (ret)
2893 goto out;
2894 ret = rbd_obj_request_wait(obj_request);
2895 if (ret)
2896 goto out;
2897
2898 ret = obj_request->result;
2899 if (ret < 0)
2900 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002901
2902 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2903 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002904 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder23ed6e12013-02-06 13:11:38 -06002905 rbd_assert(size <= (size_t) INT_MAX);
2906 ret = (int) size;
Alex Elder788e2df2013-01-17 12:25:27 -06002907 if (version)
2908 *version = obj_request->version;
2909out:
2910 if (obj_request)
2911 rbd_obj_request_put(obj_request);
2912 else
2913 ceph_release_page_vector(pages, page_count);
2914
2915 return ret;
2916}
2917
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002918/*
Alex Elder4156d9982012-08-02 11:29:46 -05002919 * Read the complete header for the given rbd device.
2920 *
2921 * Returns a pointer to a dynamically-allocated buffer containing
2922 * the complete and validated header. Caller can pass the address
2923 * of a variable that will be filled in with the version of the
2924 * header object at the time it was read.
2925 *
2926 * Returns a pointer-coded errno if a failure occurs.
2927 */
2928static struct rbd_image_header_ondisk *
2929rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2930{
2931 struct rbd_image_header_ondisk *ondisk = NULL;
2932 u32 snap_count = 0;
2933 u64 names_size = 0;
2934 u32 want_count;
2935 int ret;
2936
2937 /*
2938 * The complete header will include an array of its 64-bit
2939 * snapshot ids, followed by the names of those snapshots as
2940 * a contiguous block of NUL-terminated strings. Note that
2941 * the number of snapshots could change by the time we read
2942 * it in, in which case we re-read it.
2943 */
2944 do {
2945 size_t size;
2946
2947 kfree(ondisk);
2948
2949 size = sizeof (*ondisk);
2950 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2951 size += names_size;
2952 ondisk = kmalloc(size, GFP_KERNEL);
2953 if (!ondisk)
2954 return ERR_PTR(-ENOMEM);
2955
Alex Elder788e2df2013-01-17 12:25:27 -06002956 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder80ef15b2013-04-21 12:14:45 -05002957 0, size, ondisk, version);
Alex Elder4156d9982012-08-02 11:29:46 -05002958 if (ret < 0)
2959 goto out_err;
2960 if (WARN_ON((size_t) ret < size)) {
2961 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002962 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2963 size, ret);
Alex Elder4156d9982012-08-02 11:29:46 -05002964 goto out_err;
2965 }
2966 if (!rbd_dev_ondisk_valid(ondisk)) {
2967 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002968 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d9982012-08-02 11:29:46 -05002969 goto out_err;
2970 }
2971
2972 names_size = le64_to_cpu(ondisk->snap_names_len);
2973 want_count = snap_count;
2974 snap_count = le32_to_cpu(ondisk->snap_count);
2975 } while (snap_count != want_count);
2976
2977 return ondisk;
2978
2979out_err:
2980 kfree(ondisk);
2981
2982 return ERR_PTR(ret);
2983}
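
/*
 * For illustration (assumed counts): the first pass through the loop
 * above reads only sizeof (*ondisk) bytes.  If that header reports,
 * say, 2 snapshots and 16 bytes of snapshot names, the next pass
 * re-reads with size = sizeof (*ondisk) +
 * 2 * sizeof (struct rbd_image_snap_ondisk) + 16, repeating until the
 * snapshot count read back matches the count the buffer was sized for.
 */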
2984
2985/*
2986 * Reload the on-disk header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002987 */
2988static int rbd_read_header(struct rbd_device *rbd_dev,
2989 struct rbd_image_header *header)
2990{
Alex Elder4156d9982012-08-02 11:29:46 -05002991 struct rbd_image_header_ondisk *ondisk;
2992 u64 ver = 0;
2993 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002994
Alex Elder4156d9982012-08-02 11:29:46 -05002995 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2996 if (IS_ERR(ondisk))
2997 return PTR_ERR(ondisk);
2998 ret = rbd_header_from_disk(header, ondisk);
2999 if (ret >= 0)
3000 header->obj_version = ver;
3001 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003002
Alex Elder4156d9982012-08-02 11:29:46 -05003003 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003004}
3005
Alex Elder41f38c22012-10-25 23:34:40 -05003006static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003007{
3008 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05003009 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003010
Alex Elder6087b512013-04-25 15:09:41 -05003011 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) {
3012 list_del(&snap->node);
3013 rbd_snap_destroy(snap);
3014 }
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003015}
3016
Alex Elder94785542012-10-09 13:50:17 -07003017static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
3018{
3019 sector_t size;
3020
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003021 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07003022 return;
3023
3024 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
3025 dout("setting size to %llu sectors", (unsigned long long) size);
3026 rbd_dev->mapping.size = (u64) size;
3027 set_capacity(rbd_dev->disk, size);
3028}
3029
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003030/*
3031 * Only read the first part of the on-disk header, without the snaps info.
3032 */
Alex Elder117973f2012-08-31 17:29:55 -05003033static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003034{
3035 int ret;
3036 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003037
3038 ret = rbd_read_header(rbd_dev, &h);
3039 if (ret < 0)
3040 return ret;
3041
Josh Durgina51aa0c2011-12-05 10:35:04 -08003042 down_write(&rbd_dev->header_rwsem);
3043
Alex Elder94785542012-10-09 13:50:17 -07003044 /* Update image size, and check for resize of mapped image */
3045 rbd_dev->header.image_size = h.image_size;
3046 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07003047
Alex Elder849b4262012-07-09 21:04:24 -05003048 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003049 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05003050 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08003051 /* osd requests may still refer to snapc */
3052 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003053
Alex Elderb8136232012-07-25 09:32:41 -05003054 if (hver)
3055 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08003056 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08003057 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003058 rbd_dev->header.snapc = h.snapc;
3059 rbd_dev->header.snap_names = h.snap_names;
3060 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05003061 /* Free the extra copy of the object prefix */
3062 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
3063 kfree(h.object_prefix);
3064
Alex Elder304f6802012-08-31 17:29:52 -05003065 ret = rbd_dev_snaps_update(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003066
Josh Durginc6666012011-11-21 17:11:12 -08003067 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003068
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003069 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003070}
3071
Alex Elder117973f2012-08-31 17:29:55 -05003072static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05003073{
3074 int ret;
3075
Alex Elder117973f2012-08-31 17:29:55 -05003076 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05003077 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05003078 if (rbd_dev->image_format == 1)
3079 ret = rbd_dev_v1_refresh(rbd_dev, hver);
3080 else
3081 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05003082 mutex_unlock(&ctl_mutex);
Laurent Barbed98df632013-04-10 17:47:46 -05003083 revalidate_disk(rbd_dev->disk);
Alex Elder522a0cc2013-04-25 15:09:41 -05003084 if (ret)
3085 rbd_warn(rbd_dev, "got notification but failed to "
3086 " update snaps: %d\n", ret);
Alex Elder1fe5e992012-07-25 09:32:41 -05003087
3088 return ret;
3089}
3090
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003091static int rbd_init_disk(struct rbd_device *rbd_dev)
3092{
3093 struct gendisk *disk;
3094 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003095 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003096
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003097 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003098 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3099 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003100 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003101
Alex Elderf0f8cef2012-01-29 13:57:44 -06003102 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003103 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003104 disk->major = rbd_dev->major;
3105 disk->first_minor = 0;
3106 disk->fops = &rbd_bd_ops;
3107 disk->private_data = rbd_dev;
3108
Alex Elderbf0d5f502012-11-22 00:00:08 -06003109 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003110 if (!q)
3111 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003112
Alex Elder593a9e72012-02-07 12:03:37 -06003113 /* We use the default size, but let's be explicit about it. */
3114 blk_queue_physical_block_size(q, SECTOR_SIZE);
3115
Josh Durgin029bcbd2011-07-22 11:35:23 -07003116 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003117 segment_size = rbd_obj_bytes(&rbd_dev->header);
3118 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3119 blk_queue_max_segment_size(q, segment_size);
3120 blk_queue_io_min(q, segment_size);
3121 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003122
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003123 blk_queue_merge_bvec(q, rbd_merge_bvec);
3124 disk->queue = q;
3125
3126 q->queuedata = rbd_dev;
3127
3128 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003129
Alex Elder12f02942012-08-29 17:11:07 -05003130 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
3131
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003132 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003133out_disk:
3134 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003135
3136 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003137}
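
/*
 * For illustration (assuming an object order of 22, i.e. 4 MiB
 * objects): segment_size is 4194304, so the queue advertises
 * max_hw_sectors = 4194304 / SECTOR_SIZE = 8192 and uses 4 MiB for the
 * maximum segment size and the io_min/io_opt hints.
 */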
3138
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003139/*
3140 * sysfs
3141 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003142
Alex Elder593a9e72012-02-07 12:03:37 -06003143static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3144{
3145 return container_of(dev, struct rbd_device, dev);
3146}
3147
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003148static ssize_t rbd_size_show(struct device *dev,
3149 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003150{
Alex Elder593a9e72012-02-07 12:03:37 -06003151 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08003152 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003153
Josh Durgina51aa0c2011-12-05 10:35:04 -08003154 down_read(&rbd_dev->header_rwsem);
3155 size = get_capacity(rbd_dev->disk);
3156 up_read(&rbd_dev->header_rwsem);
3157
3158 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003159}
3160
Alex Elder34b13182012-07-13 20:35:12 -05003161/*
3162 * Note this shows the features for whatever's mapped, which is not
3163 * necessarily the base image.
3164 */
3165static ssize_t rbd_features_show(struct device *dev,
3166 struct device_attribute *attr, char *buf)
3167{
3168 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3169
3170 return sprintf(buf, "0x%016llx\n",
3171 (unsigned long long) rbd_dev->mapping.features);
3172}
3173
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003174static ssize_t rbd_major_show(struct device *dev,
3175 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003176{
Alex Elder593a9e72012-02-07 12:03:37 -06003177 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003178
3179 return sprintf(buf, "%d\n", rbd_dev->major);
3180}
3181
3182static ssize_t rbd_client_id_show(struct device *dev,
3183 struct device_attribute *attr, char *buf)
3184{
Alex Elder593a9e72012-02-07 12:03:37 -06003185 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003186
Alex Elder1dbb4392012-01-24 10:08:37 -06003187 return sprintf(buf, "client%lld\n",
3188 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003189}
3190
3191static ssize_t rbd_pool_show(struct device *dev,
3192 struct device_attribute *attr, char *buf)
3193{
Alex Elder593a9e72012-02-07 12:03:37 -06003194 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003195
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003196 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003197}
3198
Alex Elder9bb2f332012-07-12 10:46:35 -05003199static ssize_t rbd_pool_id_show(struct device *dev,
3200 struct device_attribute *attr, char *buf)
3201{
3202 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3203
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003204 return sprintf(buf, "%llu\n",
3205 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003206}
3207
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003208static ssize_t rbd_name_show(struct device *dev,
3209 struct device_attribute *attr, char *buf)
3210{
Alex Elder593a9e72012-02-07 12:03:37 -06003211 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003212
Alex Eldera92ffdf2012-10-30 19:40:33 -05003213 if (rbd_dev->spec->image_name)
3214 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3215
3216 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003217}
3218
Alex Elder589d30e2012-07-10 20:30:11 -05003219static ssize_t rbd_image_id_show(struct device *dev,
3220 struct device_attribute *attr, char *buf)
3221{
3222 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3223
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003224 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003225}
3226
Alex Elder34b13182012-07-13 20:35:12 -05003227/*
3228 * Shows the name of the currently-mapped snapshot (or
3229 * RBD_SNAP_HEAD_NAME for the base image).
3230 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003231static ssize_t rbd_snap_show(struct device *dev,
3232 struct device_attribute *attr,
3233 char *buf)
3234{
Alex Elder593a9e72012-02-07 12:03:37 -06003235 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003236
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003237 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003238}
3239
Alex Elder86b00e02012-10-25 23:34:42 -05003240/*
3241 * For an rbd v2 image, shows the pool id, image id, and snapshot id
3242 * for the parent image. If there is no parent, simply shows
3243 * "(no parent image)".
3244 */
3245static ssize_t rbd_parent_show(struct device *dev,
3246 struct device_attribute *attr,
3247 char *buf)
3248{
3249 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3250 struct rbd_spec *spec = rbd_dev->parent_spec;
3251 int count;
3252 char *bufp = buf;
3253
3254 if (!spec)
3255 return sprintf(buf, "(no parent image)\n");
3256
3257 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3258 (unsigned long long) spec->pool_id, spec->pool_name);
3259 if (count < 0)
3260 return count;
3261 bufp += count;
3262
3263 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3264 spec->image_name ? spec->image_name : "(unknown)");
3265 if (count < 0)
3266 return count;
3267 bufp += count;
3268
3269 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3270 (unsigned long long) spec->snap_id, spec->snap_name);
3271 if (count < 0)
3272 return count;
3273 bufp += count;
3274
3275 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3276 if (count < 0)
3277 return count;
3278 bufp += count;
3279
3280 return (ssize_t) (bufp - buf);
3281}
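
/*
 * Example of the resulting output for a mapped clone (values are
 * illustrative only):
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 1f2e3d4c5b6a
 *	image_name parent-image
 *	snap_id 4
 *	snap_name base
 *	overlap 1073741824
 */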
3282
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003283static ssize_t rbd_image_refresh(struct device *dev,
3284 struct device_attribute *attr,
3285 const char *buf,
3286 size_t size)
3287{
Alex Elder593a9e72012-02-07 12:03:37 -06003288 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003289 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003290
Alex Elder117973f2012-08-31 17:29:55 -05003291 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05003292
3293 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003294}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003295
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003296static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05003297static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003298static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3299static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3300static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05003301static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003302static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05003303static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003304static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3305static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05003306static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003307
3308static struct attribute *rbd_attrs[] = {
3309 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05003310 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003311 &dev_attr_major.attr,
3312 &dev_attr_client_id.attr,
3313 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05003314 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003315 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05003316 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003317 &dev_attr_current_snap.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05003318 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003319 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003320 NULL
3321};
3322
3323static struct attribute_group rbd_attr_group = {
3324 .attrs = rbd_attrs,
3325};
3326
3327static const struct attribute_group *rbd_attr_groups[] = {
3328 &rbd_attr_group,
3329 NULL
3330};
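
/*
 * Each attribute above appears as a file in the device's sysfs
 * directory (typically /sys/bus/rbd/devices/<dev-id>/); reading
 * "size", for example, returns the mapped image size in bytes.
 */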
3331
3332static void rbd_sysfs_dev_release(struct device *dev)
3333{
3334}
3335
3336static struct device_type rbd_device_type = {
3337 .name = "rbd",
3338 .groups = rbd_attr_groups,
3339 .release = rbd_sysfs_dev_release,
3340};
3341
Alex Elder8b8fb992012-10-26 17:25:24 -05003342static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3343{
3344 kref_get(&spec->kref);
3345
3346 return spec;
3347}
3348
3349static void rbd_spec_free(struct kref *kref);
3350static void rbd_spec_put(struct rbd_spec *spec)
3351{
3352 if (spec)
3353 kref_put(&spec->kref, rbd_spec_free);
3354}
3355
3356static struct rbd_spec *rbd_spec_alloc(void)
3357{
3358 struct rbd_spec *spec;
3359
3360 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3361 if (!spec)
3362 return NULL;
3363 kref_init(&spec->kref);
3364
Alex Elder8b8fb992012-10-26 17:25:24 -05003365 return spec;
3366}
3367
3368static void rbd_spec_free(struct kref *kref)
3369{
3370 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3371
3372 kfree(spec->pool_name);
3373 kfree(spec->image_id);
3374 kfree(spec->image_name);
3375 kfree(spec->snap_name);
3376 kfree(spec);
3377}
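
/*
 * Reference counting sketch for rbd_spec (illustrative): the allocator
 * returns a spec holding one reference; each rbd_spec_get() adds one
 * and each rbd_spec_put() drops one, with rbd_spec_free() invoked by
 * the kref machinery when the last reference goes away.
 *
 *	struct rbd_spec *spec = rbd_spec_alloc();
 *	rbd_spec_get(spec);
 *	rbd_spec_put(spec);
 *	rbd_spec_put(spec);	[last put frees the spec]
 */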
3378
Alex Eldercc344fa2013-02-19 12:25:56 -06003379static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
Alex Elderc53d5892012-10-25 23:34:42 -05003380 struct rbd_spec *spec)
3381{
3382 struct rbd_device *rbd_dev;
3383
3384 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3385 if (!rbd_dev)
3386 return NULL;
3387
3388 spin_lock_init(&rbd_dev->lock);
Alex Elder6d292902013-01-14 12:43:31 -06003389 rbd_dev->flags = 0;
Alex Elderc53d5892012-10-25 23:34:42 -05003390 INIT_LIST_HEAD(&rbd_dev->node);
3391 INIT_LIST_HEAD(&rbd_dev->snaps);
3392 init_rwsem(&rbd_dev->header_rwsem);
3393
3394 rbd_dev->spec = spec;
3395 rbd_dev->rbd_client = rbdc;
3396
Alex Elder0903e872012-11-14 12:25:19 -06003397 /* Initialize the layout used for all rbd requests */
3398
3399 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3400 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3401 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3402 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3403
Alex Elderc53d5892012-10-25 23:34:42 -05003404 return rbd_dev;
3405}
3406
3407static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3408{
Alex Elder86b00e02012-10-25 23:34:42 -05003409 rbd_spec_put(rbd_dev->parent_spec);
Alex Elderc53d5892012-10-25 23:34:42 -05003410 kfree(rbd_dev->header_name);
3411 rbd_put_client(rbd_dev->rbd_client);
3412 rbd_spec_put(rbd_dev->spec);
3413 kfree(rbd_dev);
3414}
3415
Alex Elder6087b512013-04-25 15:09:41 -05003416static void rbd_snap_destroy(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003417{
Alex Elder3e83b652013-04-23 13:52:53 -05003418 kfree(snap->name);
3419 kfree(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003420}
3421
Alex Elder6087b512013-04-25 15:09:41 -05003422static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05003423 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05003424 u64 snap_id, u64 snap_size,
3425 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003426{
Alex Elder4e891e02012-07-10 20:30:10 -05003427 struct rbd_snap *snap;
Alex Elder4e891e02012-07-10 20:30:10 -05003428
3429 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003430 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05003431 return ERR_PTR(-ENOMEM);
3432
Alex Elder6e584f52013-04-25 15:09:42 -05003433 snap->name = snap_name;
Alex Elderc8d18422012-07-10 20:30:11 -05003434 snap->id = snap_id;
3435 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05003436 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05003437
3438 return snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003439}
3440
Alex Elder6e584f52013-04-25 15:09:42 -05003441/*
3442 * Returns a dynamically-allocated snapshot name if successful, or a
3443 * pointer-coded error otherwise.
3444 */
Alex Eldercd892122012-07-03 16:01:19 -05003445static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3446 u64 *snap_size, u64 *snap_features)
3447{
3448 char *snap_name;
Alex Elder6e584f52013-04-25 15:09:42 -05003449 int i;
Alex Eldercd892122012-07-03 16:01:19 -05003450
3451 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3452
Alex Eldercd892122012-07-03 16:01:19 -05003453 /* Skip over names until we find the one we are looking for */
3454
3455 snap_name = rbd_dev->header.snap_names;
Alex Elder6e584f52013-04-25 15:09:42 -05003456 for (i = 0; i < which; i++)
Alex Eldercd892122012-07-03 16:01:19 -05003457 snap_name += strlen(snap_name) + 1;
3458
Alex Elder6e584f52013-04-25 15:09:42 -05003459 snap_name = kstrdup(snap_name, GFP_KERNEL);
3460 if (!snap_name)
3461 return ERR_PTR(-ENOMEM);
3462
3463 *snap_size = rbd_dev->header.snap_sizes[which];
3464 *snap_features = 0; /* No features for v1 */
3465
Alex Eldercd892122012-07-03 16:01:19 -05003466 return snap_name;
3467}
3468
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003469/*
Alex Elder9d475de2012-07-03 16:01:19 -05003470 * Get the size and object order for an image snapshot, or if
3471 * snap_id is CEPH_NOSNAP, gets this information for the base
3472 * image.
3473 */
3474static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3475 u8 *order, u64 *snap_size)
3476{
3477 __le64 snapid = cpu_to_le64(snap_id);
3478 int ret;
3479 struct {
3480 u8 order;
3481 __le64 size;
3482 } __attribute__ ((packed)) size_buf = { 0 };
3483
Alex Elder36be9a72013-01-19 00:30:28 -06003484 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder9d475de2012-07-03 16:01:19 -05003485 "rbd", "get_size",
Alex Elder41579762013-04-21 12:14:45 -05003486 &snapid, sizeof (snapid),
3487 &size_buf, sizeof (size_buf), NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003488 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05003489 if (ret < 0)
3490 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003491 if (ret < sizeof (size_buf))
3492 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05003493
Alex Elderc86f86e2013-04-25 15:09:41 -05003494 if (order)
3495 *order = size_buf.order;
Alex Elder9d475de2012-07-03 16:01:19 -05003496 *snap_size = le64_to_cpu(size_buf.size);
3497
3498 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
Alex Elder57385b52013-04-21 12:14:45 -05003499 (unsigned long long)snap_id, (unsigned int)*order,
3500 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05003501
3502 return 0;
3503}
3504
3505static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3506{
3507 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3508 &rbd_dev->header.obj_order,
3509 &rbd_dev->header.image_size);
3510}
3511
Alex Elder1e130192012-07-03 16:01:19 -05003512static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3513{
3514 void *reply_buf;
3515 int ret;
3516 void *p;
3517
3518 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3519 if (!reply_buf)
3520 return -ENOMEM;
3521
Alex Elder36be9a72013-01-19 00:30:28 -06003522 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003523 "rbd", "get_object_prefix", NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003524 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003525 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05003526 if (ret < 0)
3527 goto out;
3528
3529 p = reply_buf;
3530 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05003531 p + ret, NULL, GFP_NOIO);
3532 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05003533
3534 if (IS_ERR(rbd_dev->header.object_prefix)) {
3535 ret = PTR_ERR(rbd_dev->header.object_prefix);
3536 rbd_dev->header.object_prefix = NULL;
3537 } else {
3538 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
3539 }
Alex Elder1e130192012-07-03 16:01:19 -05003540out:
3541 kfree(reply_buf);
3542
3543 return ret;
3544}
3545
Alex Elderb1b54022012-07-03 16:01:19 -05003546static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3547 u64 *snap_features)
3548{
3549 __le64 snapid = cpu_to_le64(snap_id);
3550 struct {
3551 __le64 features;
3552 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05003553 } __attribute__ ((packed)) features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07003554 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05003555 int ret;
3556
Alex Elder36be9a72013-01-19 00:30:28 -06003557 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb1b54022012-07-03 16:01:19 -05003558 "rbd", "get_features",
Alex Elder41579762013-04-21 12:14:45 -05003559 &snapid, sizeof (snapid),
3560 &features_buf, sizeof (features_buf), NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003561 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05003562 if (ret < 0)
3563 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003564 if (ret < sizeof (features_buf))
3565 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07003566
3567 incompat = le64_to_cpu(features_buf.incompat);
Alex Elder5cbf6f122013-04-11 09:29:48 -05003568 if (incompat & ~RBD_FEATURES_SUPPORTED)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05003569 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07003570
Alex Elderb1b54022012-07-03 16:01:19 -05003571 *snap_features = le64_to_cpu(features_buf.features);
3572
3573 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05003574 (unsigned long long)snap_id,
3575 (unsigned long long)*snap_features,
3576 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05003577
3578 return 0;
3579}
3580
3581static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3582{
3583 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3584 &rbd_dev->header.features);
3585}
3586
Alex Elder86b00e02012-10-25 23:34:42 -05003587static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3588{
3589 struct rbd_spec *parent_spec;
3590 size_t size;
3591 void *reply_buf = NULL;
3592 __le64 snapid;
3593 void *p;
3594 void *end;
3595 char *image_id;
3596 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05003597 int ret;
3598
3599 parent_spec = rbd_spec_alloc();
3600 if (!parent_spec)
3601 return -ENOMEM;
3602
3603 size = sizeof (__le64) + /* pool_id */
3604 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
3605 sizeof (__le64) + /* snap_id */
3606 sizeof (__le64); /* overlap */
3607 reply_buf = kmalloc(size, GFP_KERNEL);
3608 if (!reply_buf) {
3609 ret = -ENOMEM;
3610 goto out_err;
3611 }
3612
3613 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06003614 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05003615 "rbd", "get_parent",
Alex Elder41579762013-04-21 12:14:45 -05003616 &snapid, sizeof (snapid),
3617 reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003618 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05003619 if (ret < 0)
3620 goto out_err;
3621
Alex Elder86b00e02012-10-25 23:34:42 -05003622 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003623 end = reply_buf + ret;
3624 ret = -ERANGE;
Alex Elder86b00e02012-10-25 23:34:42 -05003625 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3626 if (parent_spec->pool_id == CEPH_NOPOOL)
3627 goto out; /* No parent? No problem. */
3628
Alex Elder0903e872012-11-14 12:25:19 -06003629 /* The ceph file layout needs to fit pool id in 32 bits */
3630
3631 ret = -EIO;
Alex Elder57385b52013-04-21 12:14:45 -05003632 if (WARN_ON(parent_spec->pool_id > (u64)U32_MAX))
3633 goto out_err;
Alex Elder0903e872012-11-14 12:25:19 -06003634
Alex Elder979ed482012-11-01 08:39:26 -05003635 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05003636 if (IS_ERR(image_id)) {
3637 ret = PTR_ERR(image_id);
3638 goto out_err;
3639 }
3640 parent_spec->image_id = image_id;
3641 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3642 ceph_decode_64_safe(&p, end, overlap, out_err);
3643
3644 rbd_dev->parent_overlap = overlap;
3645 rbd_dev->parent_spec = parent_spec;
3646 parent_spec = NULL; /* rbd_dev now owns this */
3647out:
3648 ret = 0;
3649out_err:
3650 kfree(reply_buf);
3651 rbd_spec_put(parent_spec);
3652
3653 return ret;
3654}
3655
Alex Eldercc070d52013-04-21 12:14:45 -05003656static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3657{
3658 struct {
3659 __le64 stripe_unit;
3660 __le64 stripe_count;
3661 } __attribute__ ((packed)) striping_info_buf = { 0 };
3662 size_t size = sizeof (striping_info_buf);
3663 void *p;
3664 u64 obj_size;
3665 u64 stripe_unit;
3666 u64 stripe_count;
3667 int ret;
3668
3669 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3670 "rbd", "get_stripe_unit_count", NULL, 0,
3671 (char *)&striping_info_buf, size, NULL);
3672 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3673 if (ret < 0)
3674 return ret;
3675 if (ret < size)
3676 return -ERANGE;
3677
3678 /*
3679 * We don't actually support the "fancy striping" feature
3680 * (STRIPINGV2) yet, but if the striping sizes are the
3681 * defaults the behavior is the same as before. So find
3682 * out, and only fail if the image has non-default values.
3683 */
3684 ret = -EINVAL;
3685 obj_size = (u64)1 << rbd_dev->header.obj_order;
3686 p = &striping_info_buf;
3687 stripe_unit = ceph_decode_64(&p);
3688 if (stripe_unit != obj_size) {
3689 rbd_warn(rbd_dev, "unsupported stripe unit "
3690 "(got %llu want %llu)",
3691 stripe_unit, obj_size);
3692 return -EINVAL;
3693 }
3694 stripe_count = ceph_decode_64(&p);
3695 if (stripe_count != 1) {
3696 rbd_warn(rbd_dev, "unsupported stripe count "
3697 "(got %llu want 1)", stripe_count);
3698 return -EINVAL;
3699 }
3700 rbd_dev->stripe_unit = stripe_unit;
3701 rbd_dev->stripe_count = stripe_count;
3702
3703 return 0;
3704}
3705
Alex Elder9e15b772012-10-30 19:40:33 -05003706static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3707{
3708 size_t image_id_size;
3709 char *image_id;
3710 void *p;
3711 void *end;
3712 size_t size;
3713 void *reply_buf = NULL;
3714 size_t len = 0;
3715 char *image_name = NULL;
3716 int ret;
3717
3718 rbd_assert(!rbd_dev->spec->image_name);
3719
Alex Elder69e7a022012-11-01 08:39:26 -05003720 len = strlen(rbd_dev->spec->image_id);
3721 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05003722 image_id = kmalloc(image_id_size, GFP_KERNEL);
3723 if (!image_id)
3724 return NULL;
3725
3726 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05003727 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05003728 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05003729
3730 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3731 reply_buf = kmalloc(size, GFP_KERNEL);
3732 if (!reply_buf)
3733 goto out;
3734
Alex Elder36be9a72013-01-19 00:30:28 -06003735 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
Alex Elder9e15b772012-10-30 19:40:33 -05003736 "rbd", "dir_get_name",
3737 image_id, image_id_size,
Alex Elder41579762013-04-21 12:14:45 -05003738 reply_buf, size, NULL);
Alex Elder9e15b772012-10-30 19:40:33 -05003739 if (ret < 0)
3740 goto out;
3741 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05003742 end = reply_buf + ret;
3743
Alex Elder9e15b772012-10-30 19:40:33 -05003744 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3745 if (IS_ERR(image_name))
3746 image_name = NULL;
3747 else
3748 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3749out:
3750 kfree(reply_buf);
3751 kfree(image_id);
3752
3753 return image_name;
3754}
3755
3756/*
3757 * When a parent image gets probed, we only have the pool, image,
3758 * and snapshot ids but not the names of any of them. This call
3759 * is made later to fill in those names. It has to be done after
3760 * rbd_dev_snaps_update() has completed because some of the
3761 * information (in particular, snapshot name) is not available
3762 * until then.
3763 */
3764static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3765{
3766 struct ceph_osd_client *osdc;
3767 const char *name;
3768 void *reply_buf = NULL;
3769 int ret;
3770
3771 if (rbd_dev->spec->pool_name)
3772 return 0; /* Already have the names */
3773
3774 /* Look up the pool name */
3775
3776 osdc = &rbd_dev->rbd_client->client->osdc;
3777 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003778 if (!name) {
3779 rbd_warn(rbd_dev, "there is no pool with id %llu",
3780 rbd_dev->spec->pool_id); /* Really a BUG() */
3781 return -EIO;
3782 }
Alex Elder9e15b772012-10-30 19:40:33 -05003783
3784 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3785 if (!rbd_dev->spec->pool_name)
3786 return -ENOMEM;
3787
3788 /* Fetch the image name; tolerate failure here */
3789
3790 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003791 if (name)
Alex Elder41579762013-04-21 12:14:45 -05003792 rbd_dev->spec->image_name = (char *)name;
Alex Elder69e7a022012-11-01 08:39:26 -05003793 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003794 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003795
3796 /* Look up the snapshot name. */
3797
3798 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3799 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003800 rbd_warn(rbd_dev, "no snapshot with id %llu",
3801 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003802 ret = -EIO;
3803 goto out_err;
3804 }
3805 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3806 if (!rbd_dev->spec->snap_name) {
 ret = -ENOMEM;
3807 goto out_err;
 }
3808
3809 return 0;
3810out_err:
3811 kfree(reply_buf);
3812 kfree(rbd_dev->spec->pool_name);
3813 rbd_dev->spec->pool_name = NULL;
3814
3815 return ret;
3816}
3817
Alex Elder6e14b1a2012-07-03 16:01:19 -05003818static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003819{
3820 size_t size;
3821 int ret;
3822 void *reply_buf;
3823 void *p;
3824 void *end;
3825 u64 seq;
3826 u32 snap_count;
3827 struct ceph_snap_context *snapc;
3828 u32 i;
3829
3830 /*
3831 * We'll need room for the seq value (maximum snapshot id),
3832 * snapshot count, and array of that many snapshot ids.
3833 * For now we have a fixed upper limit on the number we're
3834 * prepared to receive.
3835 */
3836 size = sizeof (__le64) + sizeof (__le32) +
3837 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3838 reply_buf = kzalloc(size, GFP_KERNEL);
3839 if (!reply_buf)
3840 return -ENOMEM;
3841
Alex Elder36be9a72013-01-19 00:30:28 -06003842 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003843 "rbd", "get_snapcontext", NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003844 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06003845 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003846 if (ret < 0)
3847 goto out;
3848
Alex Elder35d489f2012-07-03 16:01:19 -05003849 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003850 end = reply_buf + ret;
3851 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05003852 ceph_decode_64_safe(&p, end, seq, out);
3853 ceph_decode_32_safe(&p, end, snap_count, out);
3854
3855 /*
3856 * Make sure the reported number of snapshot ids wouldn't go
3857 * beyond the end of our buffer. But before checking that,
3858 * make sure the computed size of the snapshot context we
3859 * allocate is representable in a size_t.
3860 */
3861 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3862 / sizeof (u64)) {
3863 ret = -EINVAL;
3864 goto out;
3865 }
3866 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3867 goto out;
3868
3869 size = sizeof (struct ceph_snap_context) +
3870 snap_count * sizeof (snapc->snaps[0]);
3871 snapc = kmalloc(size, GFP_KERNEL);
3872 if (!snapc) {
3873 ret = -ENOMEM;
3874 goto out;
3875 }
Alex Elder57385b52013-04-21 12:14:45 -05003876 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05003877
3878 atomic_set(&snapc->nref, 1);
3879 snapc->seq = seq;
3880 snapc->num_snaps = snap_count;
3881 for (i = 0; i < snap_count; i++)
3882 snapc->snaps[i] = ceph_decode_64(&p);
3883
3884 rbd_dev->header.snapc = snapc;
3885
3886 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05003887 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05003888out:
3889 kfree(reply_buf);
3890
Alex Elder57385b52013-04-21 12:14:45 -05003891 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05003892}
3893
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003894static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3895{
3896 size_t size;
3897 void *reply_buf;
3898 __le64 snap_id;
3899 int ret;
3900 void *p;
3901 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003902 char *snap_name;
3903
3904 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3905 reply_buf = kmalloc(size, GFP_KERNEL);
3906 if (!reply_buf)
3907 return ERR_PTR(-ENOMEM);
3908
Alex Elderacb1b6c2013-04-25 15:09:41 -05003909 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003910 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
Alex Elder36be9a72013-01-19 00:30:28 -06003911 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003912 "rbd", "get_snapshot_name",
Alex Elder41579762013-04-21 12:14:45 -05003913 &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06003914 reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003915 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05003916 if (ret < 0) {
3917 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003918 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05003919 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003920
3921 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05003922 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05003923 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05003924 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003925 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003926
Alex Elderf40eb342013-04-25 15:09:42 -05003927 dout(" snap_id 0x%016llx snap_name = %s\n",
3928 (unsigned long long)le64_to_cpu(snap_id), snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003929out:
3930 kfree(reply_buf);
3931
Alex Elderf40eb342013-04-25 15:09:42 -05003932 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003933}
3934
3935static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3936 u64 *snap_size, u64 *snap_features)
3937{
Alex Eldere0b49862013-01-09 14:44:18 -06003938 u64 snap_id;
Alex Elderacb1b6c2013-04-25 15:09:41 -05003939 u64 size;
3940 u64 features;
3941 char *snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003942 int ret;
3943
Alex Elderacb1b6c2013-04-25 15:09:41 -05003944 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003945 snap_id = rbd_dev->header.snapc->snaps[which];
Alex Elderacb1b6c2013-04-25 15:09:41 -05003946 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003947 if (ret)
Alex Elderacb1b6c2013-04-25 15:09:41 -05003948 goto out_err;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003949
Alex Elderacb1b6c2013-04-25 15:09:41 -05003950 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
3951 if (ret)
3952 goto out_err;
3953
3954 snap_name = rbd_dev_v2_snap_name(rbd_dev, which);
3955 if (!IS_ERR(snap_name)) {
3956 *snap_size = size;
3957 *snap_features = features;
3958 }
3959
3960 return snap_name;
3961out_err:
3962 return ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003963}
3964
3965static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3966 u64 *snap_size, u64 *snap_features)
3967{
3968 if (rbd_dev->image_format == 1)
3969 return rbd_dev_v1_snap_info(rbd_dev, which,
3970 snap_size, snap_features);
3971 if (rbd_dev->image_format == 2)
3972 return rbd_dev_v2_snap_info(rbd_dev, which,
3973 snap_size, snap_features);
3974 return ERR_PTR(-EINVAL);
3975}
3976
Alex Elder117973f2012-08-31 17:29:55 -05003977static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3978{
3979 int ret;
3980 __u8 obj_order;
3981
3982 down_write(&rbd_dev->header_rwsem);
3983
3984 /* Grab old order first, to see if it changes */
3985
3986 obj_order = rbd_dev->header.obj_order;
3987 ret = rbd_dev_v2_image_size(rbd_dev);
3988 if (ret)
3989 goto out;
3990 if (rbd_dev->header.obj_order != obj_order) {
3991 ret = -EIO;
3992 goto out;
3993 }
3994 rbd_update_mapping_size(rbd_dev);
3995
3996 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3997 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3998 if (ret)
3999 goto out;
4000 ret = rbd_dev_snaps_update(rbd_dev);
4001 dout("rbd_dev_snaps_update returned %d\n", ret);
4002 if (ret)
4003 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004004out:
4005 up_write(&rbd_dev->header_rwsem);
4006
4007 return ret;
4008}
4009
Alex Elder9d475de2012-07-03 16:01:19 -05004010/*
Alex Elder35938152012-08-02 11:29:46 -05004011 * Scan the rbd device's current snapshot list and compare it to the
4012 * newly-received snapshot context. Remove any existing snapshots
4013 * not present in the new snapshot context. Add a new snapshot for
4014 * any snapshots in the snapshot context not in the current list.
4015 * And verify there are no changes to snapshots we already know
4016 * about.
4017 *
4018 * Assumes the snapshots in the snapshot context are sorted by
4019 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
4020 * are also maintained in that order.)
Alex Elder522a0cc2013-04-25 15:09:41 -05004021 *
4022 * Note that any error that occurs while updating the snapshot list
4023 * aborts the update, and the entire list is cleared. The snapshot
4024 * list becomes inconsistent at that point anyway, so it might as
4025 * well be empty.
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004026 */
Alex Elder304f6802012-08-31 17:29:52 -05004027static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004028{
Alex Elder35938152012-08-02 11:29:46 -05004029 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4030 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05004031 struct list_head *head = &rbd_dev->snaps;
4032 struct list_head *links = head->next;
4033 u32 index = 0;
Alex Elder522a0cc2013-04-25 15:09:41 -05004034 int ret = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004035
Alex Elder522a0cc2013-04-25 15:09:41 -05004036 dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count);
Alex Elder35938152012-08-02 11:29:46 -05004037 while (index < snap_count || links != head) {
4038 u64 snap_id;
4039 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05004040 char *snap_name;
4041 u64 snap_size = 0;
4042 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004043
Alex Elder35938152012-08-02 11:29:46 -05004044 snap_id = index < snap_count ? snapc->snaps[index]
4045 : CEPH_NOSNAP;
4046 snap = links != head ? list_entry(links, struct rbd_snap, node)
4047 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05004048 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004049
Alex Elder35938152012-08-02 11:29:46 -05004050 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
4051 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004052
Alex Elder6d292902013-01-14 12:43:31 -06004053 /*
4054 * A previously-existing snapshot is not in
4055 * the new snap context.
4056 *
Alex Elder522a0cc2013-04-25 15:09:41 -05004057 * If the now-missing snapshot is the one
4058 * the image represents, clear its existence
4059 * flag so we can avoid sending any more
4060 * requests to it.
Alex Elder6d292902013-01-14 12:43:31 -06004061 */
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004062 if (rbd_dev->spec->snap_id == snap->id)
Alex Elder6d292902013-01-14 12:43:31 -06004063 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder3e83b652013-04-23 13:52:53 -05004064 dout("removing %ssnap id %llu\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004065 rbd_dev->spec->snap_id == snap->id ?
4066 "mapped " : "",
Alex Elder522a0cc2013-04-25 15:09:41 -05004067 (unsigned long long)snap->id);
Alex Elder6087b512013-04-25 15:09:41 -05004068
4069 list_del(&snap->node);
4070 rbd_snap_destroy(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004071
Alex Elder35938152012-08-02 11:29:46 -05004072 /* Done with this list entry; advance */
4073
4074 links = next;
4075 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004076 }
Alex Elder35938152012-08-02 11:29:46 -05004077
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004078 snap_name = rbd_dev_snap_info(rbd_dev, index,
4079 &snap_size, &snap_features);
Alex Elder522a0cc2013-04-25 15:09:41 -05004080 if (IS_ERR(snap_name)) {
4081 ret = PTR_ERR(snap_name);
4082 dout("failed to get snap info, error %d\n", ret);
4083 goto out_err;
4084 }
Alex Eldercd892122012-07-03 16:01:19 -05004085
Alex Elder522a0cc2013-04-25 15:09:41 -05004086	dout("entry %u: snap_id = %llu\n", (unsigned int)index,
4087 (unsigned long long)snap_id);
Alex Elder35938152012-08-02 11:29:46 -05004088 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
4089 struct rbd_snap *new_snap;
4090
4091 /* We haven't seen this snapshot before */
4092
Alex Elder6087b512013-04-25 15:09:41 -05004093 new_snap = rbd_snap_create(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05004094 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05004095 if (IS_ERR(new_snap)) {
Alex Elder522a0cc2013-04-25 15:09:41 -05004096 ret = PTR_ERR(new_snap);
4097 dout(" failed to add dev, error %d\n", ret);
4098 goto out_err;
Alex Elder9fcbb802012-08-23 23:48:49 -05004099 }
Alex Elder35938152012-08-02 11:29:46 -05004100
4101 /* New goes before existing, or at end of list */
4102
Alex Elder9fcbb802012-08-23 23:48:49 -05004103	dout(" added dev%s\n", snap ? "" : " at end");
Alex Elder35938152012-08-02 11:29:46 -05004104 if (snap)
4105 list_add_tail(&new_snap->node, &snap->node);
4106 else
Alex Elder523f3252012-08-30 00:16:37 -05004107 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05004108 } else {
4109 /* Already have this one */
4110
Alex Elder9fcbb802012-08-23 23:48:49 -05004111 dout(" already present\n");
4112
Alex Eldercd892122012-07-03 16:01:19 -05004113 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05004114 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05004115 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05004116
4117 /* Done with this list entry; advance */
4118
4119 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004120 }
Alex Elder35938152012-08-02 11:29:46 -05004121
4122 /* Advance to the next entry in the snapshot context */
4123
4124 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004125 }
Alex Elder9fcbb802012-08-23 23:48:49 -05004126 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004127
4128 return 0;
Alex Elder522a0cc2013-04-25 15:09:41 -05004129out_err:
4130 rbd_remove_all_snaps(rbd_dev);
4131
4132 return ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004133}
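/*
 * Illustrative walk-through of the merge in rbd_dev_snaps_update()
 * above (a sketch with made-up snapshot ids, not driver code).
 * Suppose the device's list holds ids {12, 7, 3} and the new
 * snapshot context holds {12, 9, 3}, both ordered highest id first:
 *
 *   list 12 vs context 12:  already present; verify, advance both
 *   list  7 vs context  9:  7 < 9, so 9 is new; insert 9 before 7
 *   list  7 vs context  3:  7 > 3, so 7 was deleted; remove it
 *   list  3 vs context  3:  already present; verify, advance both
 *
 * The resulting list is {12, 9, 3}, matching the new context.
 */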
4134
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004135static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4136{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004137 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05004138 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004139
4140 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004141
Alex Eldercd789ab2012-08-30 00:16:38 -05004142 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004143 dev->bus = &rbd_bus_type;
4144 dev->type = &rbd_device_type;
4145 dev->parent = &rbd_root_dev;
4146 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05004147 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004148 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004149
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004150 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05004151
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004152 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004153}
4154
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004155static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4156{
4157 device_unregister(&rbd_dev->dev);
4158}
4159
Alex Eldere2839302012-08-29 17:11:06 -05004160static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06004161
4162/*
Alex Elder499afd52012-02-02 08:13:29 -06004163 * Get a unique rbd identifier for the given new rbd_dev, and add
4164 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06004165 */
Alex Eldere2839302012-08-29 17:11:06 -05004166static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06004167{
Alex Eldere2839302012-08-29 17:11:06 -05004168 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06004169
4170 spin_lock(&rbd_dev_list_lock);
4171 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4172 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05004173 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4174 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06004175}
Alex Elderb7f23c32012-01-29 13:57:43 -06004176
Alex Elder1ddbe942012-01-29 13:57:44 -06004177/*
Alex Elder499afd52012-02-02 08:13:29 -06004178 * Remove an rbd_dev from the global list, and record that its
4179 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06004180 */
Alex Eldere2839302012-08-29 17:11:06 -05004181static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06004182{
Alex Elderd184f6b2012-01-29 13:57:44 -06004183 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05004184 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004185 int max_id;
4186
Alex Elderaafb2302012-09-06 16:00:54 -05004187 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06004188
Alex Eldere2839302012-08-29 17:11:06 -05004189 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4190 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06004191 spin_lock(&rbd_dev_list_lock);
4192 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06004193
4194 /*
4195 * If the id being "put" is not the current maximum, there
4196 * is nothing special we need to do.
4197 */
Alex Eldere2839302012-08-29 17:11:06 -05004198 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06004199 spin_unlock(&rbd_dev_list_lock);
4200 return;
4201 }
4202
4203 /*
4204 * We need to update the current maximum id. Search the
4205 * list to find out what it is. We're more likely to find
4206 * the maximum at the end, so search the list backward.
4207 */
4208 max_id = 0;
4209 list_for_each_prev(tmp, &rbd_dev_list) {
4210 struct rbd_device *rbd_dev;
4211
4212 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07004213 if (rbd_dev->dev_id > max_id)
4214 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004215 }
Alex Elder499afd52012-02-02 08:13:29 -06004216 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06004217
Alex Elder1ddbe942012-01-29 13:57:44 -06004218 /*
Alex Eldere2839302012-08-29 17:11:06 -05004219 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06004220 * which case it now accurately reflects the new maximum.
4221 * Be careful not to overwrite the maximum value in that
4222 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06004223 */
Alex Eldere2839302012-08-29 17:11:06 -05004224 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4225 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06004226}
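/*
 * Sketch of the id allocation behaviour above (hypothetical
 * sequence, not driver code):
 *
 *   map three images   ->  dev ids 1, 2, 3  (rbd_dev_id_max == 3)
 *   unmap id 2         ->  2 is not the maximum; nothing to update
 *   unmap id 3         ->  list maximum is now 1; cmpxchg(3 -> 1)
 *   map another image  ->  atomic64_inc_return() hands out id 2
 *
 * If a concurrent rbd_dev_id_get() bumps the maximum past 3 before
 * the cmpxchg runs, the cmpxchg simply fails and the newer (larger)
 * value is preserved.
 */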
4227
Alex Eldera725f65e2012-02-02 08:13:30 -06004228/*
Alex Eldere28fff262012-02-02 08:13:30 -06004229 * Skips over white space at *buf, and updates *buf to point to the
4230 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06004231 * the token (string of non-white space characters) found. Note
4232 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06004233 */
4234static inline size_t next_token(const char **buf)
4235{
4236 /*
4237 * These are the characters that produce nonzero for
4238 * isspace() in the "C" and "POSIX" locales.
4239 */
4240 const char *spaces = " \f\n\r\t\v";
4241
4242 *buf += strspn(*buf, spaces); /* Find start of token */
4243
4244 return strcspn(*buf, spaces); /* Return token length */
4245}
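/*
 * Minimal sketch of next_token() semantics (hypothetical buffer,
 * compiled out): the pointer is advanced to the start of the token,
 * but the token itself is not consumed.
 */
#if 0
static void next_token_example(void)
{
	const char *buf = "  pool image";
	size_t len;

	len = next_token(&buf);	/* returns 4; buf now at "pool image" */
	len = next_token(&buf);	/* returns 4 again; "pool" not consumed */
	(void)len;
}
#endif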
4246
4247/*
4248 * Finds the next token in *buf, and if the provided token buffer is
4249 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06004250 * copied, is guaranteed to be terminated with '\0'. Note that *buf
4251 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06004252 *
4253 * Returns the length of the token found (not including the '\0').
4254 * Return value will be 0 if no token is found, and it will be >=
4255 * token_size if the token would not fit.
4256 *
Alex Elder593a9e72012-02-07 12:03:37 -06004257 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06004258 * found token. Note that this occurs even if the token buffer is
4259 * too small to hold it.
4260 */
4261static inline size_t copy_token(const char **buf,
4262 char *token,
4263 size_t token_size)
4264{
4265 size_t len;
4266
4267 len = next_token(buf);
4268 if (len < token_size) {
4269 memcpy(token, *buf, len);
4270 *(token + len) = '\0';
4271 }
4272 *buf += len;
4273
4274 return len;
4275}
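/*
 * Sketch of the copy_token() overflow rule (hypothetical values,
 * compiled out): a too-small token buffer is left untouched, but
 * *buf is still advanced past the oversized token.
 */
#if 0
static void copy_token_example(void)
{
	const char *buf = "pool image";
	char token[5];
	size_t len;

	len = copy_token(&buf, token, sizeof (token));
	/* len == 4; token == "pool"; buf now at " image" */
	len = copy_token(&buf, token, sizeof (token));
	/* len == 5 >= sizeof (token): "image" did not fit, token is
	 * still "pool", yet buf was advanced to the end of the string */
	(void)len;
}
#endif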
4276
4277/*
Alex Elderea3352f2012-07-09 21:04:23 -05004278 * Finds the next token in *buf, dynamically allocates a buffer big
4279 * enough to hold a copy of it, and copies the token into the new
4280 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4281 * that a duplicate buffer is created even for a zero-length token.
4282 *
4283 * Returns a pointer to the newly-allocated duplicate, or a null
4284 * pointer if memory for the duplicate was not available. If
4285 * the lenp argument is a non-null pointer, the length of the token
4286 * (not including the '\0') is returned in *lenp.
4287 *
4288 * If successful, the *buf pointer will be updated to point beyond
4289 * the end of the found token.
4290 *
4291 * Note: uses GFP_KERNEL for allocation.
4292 */
4293static inline char *dup_token(const char **buf, size_t *lenp)
4294{
4295 char *dup;
4296 size_t len;
4297
4298 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05004299 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05004300 if (!dup)
4301 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05004302 *(dup + len) = '\0';
4303 *buf += len;
4304
4305 if (lenp)
4306 *lenp = len;
4307
4308 return dup;
4309}
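/*
 * Sketch of dup_token() (hypothetical values, compiled out).  Note
 * that even a zero-length token yields a freshly allocated empty
 * string rather than a null pointer.
 */
#if 0
static void dup_token_example(void)
{
	const char *buf = "pool image";
	size_t len = 0;
	char *dup;

	dup = dup_token(&buf, &len);	/* dup == "pool", len == 4 */
	kfree(dup);
	dup = dup_token(&buf, NULL);	/* dup == "image"; buf now at "" */
	kfree(dup);
	dup = dup_token(&buf, NULL);	/* dup == "" (still allocated) */
	kfree(dup);
}
#endif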
4310
4311/*
Alex Elder859c31d2012-10-25 23:34:42 -05004312 * Parse the options provided for an "rbd add" (i.e., rbd image
4313 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
4314 * and the data written is passed here via a NUL-terminated buffer.
4315 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05004316 *
Alex Elder859c31d2012-10-25 23:34:42 -05004317 * The information extracted from these options is recorded in
4318 * the other parameters which return dynamically-allocated
4319 * structures:
4320 * ceph_opts
4321 * The address of a pointer that will refer to a ceph options
4322 * structure. Caller must release the returned pointer using
4323 * ceph_destroy_options() when it is no longer needed.
4324 * rbd_opts
4325 * Address of an rbd options pointer. Fully initialized by
4326 * this function; caller must release with kfree().
4327 * spec
4328 * Address of an rbd image specification pointer. Fully
4329 * initialized by this function based on parsed options.
4330 * Caller must release with rbd_spec_put().
4331 *
4332 * The options passed take this form:
4333 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
4334 * where:
4335 * <mon_addrs>
4336 * A comma-separated list of one or more monitor addresses.
4337 * A monitor address is an ip address, optionally followed
4338 * by a port number (separated by a colon).
4339 * I.e.: ip1[:port1][,ip2[:port2]...]
4340 * <options>
4341 * A comma-separated list of ceph and/or rbd options.
4342 * <pool_name>
4343 * The name of the rados pool containing the rbd image.
4344 * <image_name>
4345 * The name of the image in that pool to map.
4346 * <snap_name>
4347 * An optional snapshot name. If provided, the mapping will
4348 * present data from the image at the time that snapshot was
4349 * created. The image head is used if no snapshot name is
4350 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06004351 */
Alex Elder859c31d2012-10-25 23:34:42 -05004352static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05004353 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05004354 struct rbd_options **opts,
4355 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06004356{
Alex Elderd22f76e2012-07-12 10:46:35 -05004357 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05004358 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05004359 const char *mon_addrs;
4360 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05004361 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004362 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004363 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05004364 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06004365
4366 /* The first four tokens are required */
4367
Alex Elder7ef32142012-02-02 08:13:30 -06004368 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05004369 if (!len) {
4370 rbd_warn(NULL, "no monitor address(es) provided");
4371 return -EINVAL;
4372 }
Alex Elder0ddebc02012-10-25 23:34:41 -05004373 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05004374 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06004375 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06004376
Alex Elderdc79b112012-10-25 23:34:41 -05004377 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05004378 options = dup_token(&buf, NULL);
4379 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05004380 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004381 if (!*options) {
4382 rbd_warn(NULL, "no options provided");
4383 goto out_err;
4384 }
Alex Eldera725f65e2012-02-02 08:13:30 -06004385
Alex Elder859c31d2012-10-25 23:34:42 -05004386 spec = rbd_spec_alloc();
4387 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05004388 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004389
4390 spec->pool_name = dup_token(&buf, NULL);
4391 if (!spec->pool_name)
4392 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004393 if (!*spec->pool_name) {
4394 rbd_warn(NULL, "no pool name provided");
4395 goto out_err;
4396 }
Alex Eldere28fff262012-02-02 08:13:30 -06004397
Alex Elder69e7a022012-11-01 08:39:26 -05004398 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05004399 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004400 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004401 if (!*spec->image_name) {
4402 rbd_warn(NULL, "no image name provided");
4403 goto out_err;
4404 }
Alex Eldere28fff262012-02-02 08:13:30 -06004405
Alex Elderf28e5652012-10-25 23:34:41 -05004406 /*
4407 * Snapshot name is optional; default is to use "-"
4408 * (indicating the head/no snapshot).
4409 */
Alex Elder3feeb8942012-08-31 17:29:52 -05004410 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05004411 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05004412 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4413 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05004414 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05004415 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05004416 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05004417 }
Alex Elder4caf35f2012-11-01 08:39:27 -05004418 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
Alex Elder859c31d2012-10-25 23:34:42 -05004419 if (!spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004420 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004421 *(spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05004422
Alex Elder0ddebc02012-10-25 23:34:41 -05004423 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06004424
Alex Elder4e9afeb2012-10-25 23:34:41 -05004425 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4426 if (!rbd_opts)
4427 goto out_mem;
4428
4429 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05004430
Alex Elder859c31d2012-10-25 23:34:42 -05004431 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05004432 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05004433 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004434 if (IS_ERR(copts)) {
4435 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05004436 goto out_err;
4437 }
Alex Elder859c31d2012-10-25 23:34:42 -05004438 kfree(options);
4439
4440 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004441 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05004442 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05004443
Alex Elderdc79b112012-10-25 23:34:41 -05004444 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05004445out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05004446 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05004447out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05004448 kfree(rbd_opts);
4449 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05004450 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05004451
Alex Elderdc79b112012-10-25 23:34:41 -05004452 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06004453}
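/*
 * A hedged example of the buffer rbd_add_parse_args() expects (all
 * values hypothetical).  Writing
 *
 *   "1.2.3.4:6789,1.2.3.5:6789 name=admin,read_only rbd myimage snap1"
 *
 * to /sys/bus/rbd/add would use the two listed monitors, parse the
 * ceph/rbd options, and map snapshot "snap1" of image "myimage" in
 * pool "rbd".  Dropping the trailing "snap1" maps the image head
 * read/write instead.
 */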
4454
Alex Elder589d30e2012-07-10 20:30:11 -05004455/*
4456 * An rbd format 2 image has a unique identifier, distinct from the
4457 * name given to it by the user. Internally, that identifier is
4458 * what's used to specify the names of objects related to the image.
4459 *
4460 * A special "rbd id" object is used to map an rbd image name to its
4461 * id. If that object doesn't exist, then there is no v2 rbd image
4462 * with the supplied name.
4463 *
4464 * This function will record the given rbd_dev's image_id field if
4465 * it can be determined, and in that case will return 0. If any
4466 * errors occur a negative errno will be returned and the rbd_dev's
4467 * image_id field will be unchanged (and should be NULL).
4468 */
4469static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4470{
4471 int ret;
4472 size_t size;
4473 char *object_name;
4474 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05004475 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05004476
Alex Elder589d30e2012-07-10 20:30:11 -05004477 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05004478 * When probing a parent image, the image id is already
4479 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05004480 * need to fetch the image id again in this case. We
4481 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05004482 */
Alex Elderc0fba362013-04-25 23:15:08 -05004483 if (rbd_dev->spec->image_id) {
4484 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4485
Alex Elder2c0d0a12012-10-30 19:40:33 -05004486 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05004487 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05004488
4489 /*
Alex Elder589d30e2012-07-10 20:30:11 -05004490 * First, see if the format 2 image id file exists, and if
4491 * so, get the image's persistent id from it.
4492 */
Alex Elder69e7a022012-11-01 08:39:26 -05004493 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004494 object_name = kmalloc(size, GFP_NOIO);
4495 if (!object_name)
4496 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004497 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004498 dout("rbd id object name is %s\n", object_name);
4499
4500 /* Response will be an encoded string, which includes a length */
4501
4502 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4503 response = kzalloc(size, GFP_NOIO);
4504 if (!response) {
4505 ret = -ENOMEM;
4506 goto out;
4507 }
4508
Alex Elderc0fba362013-04-25 23:15:08 -05004509 /* If it doesn't exist we'll assume it's a format 1 image */
4510
Alex Elder36be9a72013-01-19 00:30:28 -06004511 ret = rbd_obj_method_sync(rbd_dev, object_name,
Alex Elder41579762013-04-21 12:14:45 -05004512 "rbd", "get_id", NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06004513 response, RBD_IMAGE_ID_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06004514 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05004515 if (ret == -ENOENT) {
4516 image_id = kstrdup("", GFP_KERNEL);
4517 ret = image_id ? 0 : -ENOMEM;
4518 if (!ret)
4519 rbd_dev->image_format = 1;
4520 } else if (ret > sizeof (__le32)) {
4521 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05004522
Alex Elderc0fba362013-04-25 23:15:08 -05004523 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05004524 NULL, GFP_NOIO);
Alex Elderc0fba362013-04-25 23:15:08 -05004525 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4526 if (!ret)
4527 rbd_dev->image_format = 2;
Alex Elder589d30e2012-07-10 20:30:11 -05004528 } else {
Alex Elderc0fba362013-04-25 23:15:08 -05004529 ret = -EINVAL;
4530 }
4531
4532 if (!ret) {
4533 rbd_dev->spec->image_id = image_id;
4534 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004535 }
4536out:
4537 kfree(response);
4538 kfree(object_name);
4539
4540 return ret;
4541}
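/*
 * Sketch of the id lookup above (image name hypothetical): for a
 * format 2 image "foo", the id object is named RBD_ID_PREFIX "foo"
 * (i.e. "rbd_id.foo"), and its "get_id" class method returns the
 * encoded image id that becomes rbd_dev->spec->image_id.  A format 1
 * image has no such object, the lookup returns -ENOENT, and an empty
 * image id is recorded instead.
 */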
4542
Alex Eldera30b71b2012-07-10 20:30:11 -05004543static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4544{
4545 int ret;
4546 size_t size;
4547
Alex Eldera30b71b2012-07-10 20:30:11 -05004548 /* Record the header object name for this rbd image. */
4549
Alex Elder69e7a022012-11-01 08:39:26 -05004550 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05004551 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4552 if (!rbd_dev->header_name) {
4553 ret = -ENOMEM;
4554 goto out_err;
4555 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004556 sprintf(rbd_dev->header_name, "%s%s",
4557 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05004558
4559 /* Populate rbd image metadata */
4560
4561 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4562 if (ret < 0)
4563 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05004564
4565 /* Version 1 images have no parent (no layering) */
4566
4567 rbd_dev->parent_spec = NULL;
4568 rbd_dev->parent_overlap = 0;
4569
Alex Eldera30b71b2012-07-10 20:30:11 -05004570 dout("discovered version 1 image, header name is %s\n",
4571 rbd_dev->header_name);
4572
4573 return 0;
4574
4575out_err:
4576 kfree(rbd_dev->header_name);
4577 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004578 kfree(rbd_dev->spec->image_id);
4579 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05004580
4581 return ret;
4582}
4583
4584static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4585{
4586 size_t size;
Alex Elder9d475de2012-07-03 16:01:19 -05004587 int ret;
Alex Elder6e14b1a2012-07-03 16:01:19 -05004588 u64 ver = 0;
Alex Eldera30b71b2012-07-10 20:30:11 -05004589
4590 /*
4591 * Image id was filled in by the caller. Record the header
4592 * object name for this rbd image.
4593 */
Alex Elder979ed482012-11-01 08:39:26 -05004594 size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
Alex Eldera30b71b2012-07-10 20:30:11 -05004595 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4596 if (!rbd_dev->header_name)
4597 return -ENOMEM;
4598 sprintf(rbd_dev->header_name, "%s%s",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004599 RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
Alex Elder9d475de2012-07-03 16:01:19 -05004600
4601 /* Get the size and object order for the image */
Alex Elder9d475de2012-07-03 16:01:19 -05004602 ret = rbd_dev_v2_image_size(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004603 if (ret)
Alex Elder9d475de2012-07-03 16:01:19 -05004604 goto out_err;
Alex Elder1e130192012-07-03 16:01:19 -05004605
4606 /* Get the object prefix (a.k.a. block_name) for the image */
4607
4608 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004609 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05004610 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05004611
Alex Elderd8891402012-10-09 13:50:17 -07004612	/* Get and check the features for the image */
Alex Elderb1b54022012-07-03 16:01:19 -05004613
4614 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004615 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05004616 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05004617
Alex Elder86b00e02012-10-25 23:34:42 -05004618 /* If the image supports layering, get the parent info */
4619
4620 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
4621 ret = rbd_dev_v2_parent_info(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004622 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05004623 goto out_err;
Alex Elder770eba62012-10-25 23:34:40 -05004624 rbd_warn(rbd_dev, "WARNING: kernel support for "
4625 "layered rbd images is EXPERIMENTAL!");
Alex Elder86b00e02012-10-25 23:34:42 -05004626 }
4627
Alex Eldercc070d52013-04-21 12:14:45 -05004628 /* If the image supports fancy striping, get its parameters */
4629
4630 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4631 ret = rbd_dev_v2_striping_info(rbd_dev);
4632 if (ret < 0)
4633 goto out_err;
4634 }
4635
Alex Elder6e14b1a2012-07-03 16:01:19 -05004636 /* crypto and compression type aren't (yet) supported for v2 images */
Alex Elder35d489f2012-07-03 16:01:19 -05004637
Alex Elder6e14b1a2012-07-03 16:01:19 -05004638 rbd_dev->header.crypt_type = 0;
4639 rbd_dev->header.comp_type = 0;
4640
4641 /* Get the snapshot context, plus the header version */
4642
4643 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
Alex Elder35d489f2012-07-03 16:01:19 -05004644 if (ret)
4645 goto out_err;
Alex Elder6e14b1a2012-07-03 16:01:19 -05004646 rbd_dev->header.obj_version = ver;
4647
Alex Eldera30b71b2012-07-10 20:30:11 -05004648 dout("discovered version 2 image, header name is %s\n",
4649 rbd_dev->header_name);
4650
Alex Elder35152972012-08-31 17:29:55 -05004651 return 0;
Alex Elder9d475de2012-07-03 16:01:19 -05004652out_err:
Alex Elder86b00e02012-10-25 23:34:42 -05004653 rbd_dev->parent_overlap = 0;
4654 rbd_spec_put(rbd_dev->parent_spec);
4655 rbd_dev->parent_spec = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004656 kfree(rbd_dev->header_name);
4657 rbd_dev->header_name = NULL;
Alex Elder1e130192012-07-03 16:01:19 -05004658 kfree(rbd_dev->header.object_prefix);
4659 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004660
4661 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004662}
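/*
 * Sketch of the header object naming used by the two probe paths
 * above (image name and id hypothetical):
 *
 *   format 1:  image "foo"        ->  header object "foo.rbd"
 *   format 2:  image id "101abc"  ->  header object "rbd_header.101abc"
 */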
4663
Alex Elder83a06262012-10-30 15:47:17 -05004664static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
4665{
Alex Elder2f82ee52012-10-30 19:40:33 -05004666 struct rbd_device *parent = NULL;
4667 struct rbd_spec *parent_spec = NULL;
4668 struct rbd_client *rbdc = NULL;
Alex Elder83a06262012-10-30 15:47:17 -05004669 int ret;
4670
4671 /* no need to lock here, as rbd_dev is not registered yet */
4672 ret = rbd_dev_snaps_update(rbd_dev);
4673 if (ret)
4674 return ret;
4675
Alex Elder9e15b772012-10-30 19:40:33 -05004676 ret = rbd_dev_probe_update_spec(rbd_dev);
4677 if (ret)
4678 goto err_out_snaps;
4679
Alex Elder83a06262012-10-30 15:47:17 -05004680 ret = rbd_dev_set_mapping(rbd_dev);
4681 if (ret)
4682 goto err_out_snaps;
4683
4684 /* generate unique id: find highest unique id, add one */
4685 rbd_dev_id_get(rbd_dev);
4686
4687 /* Fill in the device name, now that we have its id. */
4688 BUILD_BUG_ON(DEV_NAME_LEN
4689 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4690 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4691
4692 /* Get our block major device number. */
4693
4694 ret = register_blkdev(0, rbd_dev->name);
4695 if (ret < 0)
4696 goto err_out_id;
4697 rbd_dev->major = ret;
4698
4699 /* Set up the blkdev mapping. */
4700
4701 ret = rbd_init_disk(rbd_dev);
4702 if (ret)
4703 goto err_out_blkdev;
4704
4705 ret = rbd_bus_add_dev(rbd_dev);
4706 if (ret)
4707 goto err_out_disk;
4708
4709 /*
4710 * At this point cleanup in the event of an error is the job
4711 * of the sysfs code (initiated by rbd_bus_del_dev()).
4712 */
Alex Elder2f82ee52012-10-30 19:40:33 -05004713 /* Probe the parent if there is one */
4714
4715 if (rbd_dev->parent_spec) {
4716 /*
4717 * We need to pass a reference to the client and the
4718 * parent spec when creating the parent rbd_dev.
4719 * Images related by parent/child relationships
4720 * always share both.
4721 */
4722 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4723 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4724
4725 parent = rbd_dev_create(rbdc, parent_spec);
4726 if (!parent) {
4727 ret = -ENOMEM;
4728 goto err_out_spec;
4729 }
4730 rbdc = NULL; /* parent now owns reference */
4731 parent_spec = NULL; /* parent now owns reference */
4732 ret = rbd_dev_probe(parent);
4733 if (ret < 0)
4734 goto err_out_parent;
4735 rbd_dev->parent = parent;
4736 }
4737
Alex Elder9969ebc2013-01-18 12:31:10 -06004738 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
Alex Elder83a06262012-10-30 15:47:17 -05004739 if (ret)
4740 goto err_out_bus;
4741
4742 /* Everything's ready. Announce the disk to the world. */
4743
4744 add_disk(rbd_dev->disk);
4745
4746 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4747 (unsigned long long) rbd_dev->mapping.size);
4748
4749 return ret;
Alex Elder2f82ee52012-10-30 19:40:33 -05004750
4751err_out_parent:
4752 rbd_dev_destroy(parent);
4753err_out_spec:
4754 rbd_spec_put(parent_spec);
4755 rbd_put_client(rbdc);
Alex Elder83a06262012-10-30 15:47:17 -05004756err_out_bus:
4757	/* this will also clean up the rest of the rbd_dev state */
4758
4759 rbd_bus_del_dev(rbd_dev);
4760
4761 return ret;
4762err_out_disk:
4763 rbd_free_disk(rbd_dev);
4764err_out_blkdev:
4765 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4766err_out_id:
4767 rbd_dev_id_put(rbd_dev);
4768err_out_snaps:
4769 rbd_remove_all_snaps(rbd_dev);
4770
4771 return ret;
4772}
4773
Alex Eldera30b71b2012-07-10 20:30:11 -05004774/*
4775 * Probe for the existence of the header object for the given rbd
4776 * device. For format 2 images this includes determining the image
4777 * id.
4778 */
4779static int rbd_dev_probe(struct rbd_device *rbd_dev)
4780{
4781 int ret;
4782
4783 /*
4784 * Get the id from the image id object. If it's not a
4785 * format 2 image, we'll get ENOENT back, and we'll assume
4786 * it's a format 1 image.
4787 */
4788 ret = rbd_dev_image_id(rbd_dev);
4789 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05004790 return ret;
4791 rbd_assert(rbd_dev->spec->image_id);
4792 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4793
4794 if (rbd_dev->image_format == 1)
Alex Eldera30b71b2012-07-10 20:30:11 -05004795 ret = rbd_dev_v1_probe(rbd_dev);
4796 else
4797 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05004798 if (ret)
4799 goto out_err;
Alex Elder83a06262012-10-30 15:47:17 -05004800
4801 ret = rbd_dev_probe_finish(rbd_dev);
4802 if (ret)
4803 rbd_header_free(&rbd_dev->header);
4804
Alex Eldera30b71b2012-07-10 20:30:11 -05004805 return ret;
Alex Elder5655c4d2013-04-25 23:15:08 -05004806out_err:
4807 kfree(rbd_dev->spec->image_id);
4808 rbd_dev->spec->image_id = NULL;
4809
4810 dout("probe failed, returning %d\n", ret);
4811
4812 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004813}
4814
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004815static ssize_t rbd_add(struct bus_type *bus,
4816 const char *buf,
4817 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004818{
Alex Eldercb8627c2012-07-09 21:04:23 -05004819 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05004820 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004821 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004822 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05004823 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06004824 struct ceph_osd_client *osdc;
4825 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004826
4827 if (!try_module_get(THIS_MODULE))
4828 return -ENODEV;
4829
Alex Eldera725f65e2012-02-02 08:13:30 -06004830 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05004831 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05004832 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05004833 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06004834
Alex Elder9d3997f2012-10-25 23:34:42 -05004835 rbdc = rbd_get_client(ceph_opts);
4836 if (IS_ERR(rbdc)) {
4837 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004838 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05004839 }
Alex Elderc53d5892012-10-25 23:34:42 -05004840 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004841
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004842 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05004843 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05004844 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004845 if (rc < 0)
4846 goto err_out_client;
Alex Elder859c31d2012-10-25 23:34:42 -05004847 spec->pool_id = (u64) rc;
4848
Alex Elder0903e872012-11-14 12:25:19 -06004849 /* The ceph file layout needs to fit pool id in 32 bits */
4850
4851 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4852 rc = -EIO;
4853 goto err_out_client;
4854 }
4855
Alex Elderc53d5892012-10-25 23:34:42 -05004856 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004857 if (!rbd_dev)
4858 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05004859 rbdc = NULL; /* rbd_dev now owns this */
4860 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004861
Alex Elderbd4ba652012-10-25 23:34:42 -05004862 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05004863 kfree(rbd_opts);
4864 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05004865
Alex Eldera30b71b2012-07-10 20:30:11 -05004866 rc = rbd_dev_probe(rbd_dev);
4867 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05004868 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05004869
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004870 return count;
Alex Elderc53d5892012-10-25 23:34:42 -05004871err_out_rbd_dev:
4872 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05004873err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05004874 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004875err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05004876 if (ceph_opts)
4877 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05004878 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004879 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004880err_out_module:
4881 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06004882
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004883 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06004884
4885 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004886}
4887
Alex Elderde71a292012-07-03 16:01:19 -05004888static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004889{
4890 struct list_head *tmp;
4891 struct rbd_device *rbd_dev;
4892
Alex Eldere124a82f2012-01-29 13:57:44 -06004893 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004894 list_for_each(tmp, &rbd_dev_list) {
4895 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004896 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06004897 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004898 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06004899 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004900 }
Alex Eldere124a82f2012-01-29 13:57:44 -06004901 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004902 return NULL;
4903}
4904
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004905static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004906{
Alex Elder593a9e72012-02-07 12:03:37 -06004907 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004908
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004909 if (rbd_dev->watch_event)
Alex Elder9969ebc2013-01-18 12:31:10 -06004910 rbd_dev_header_watch_sync(rbd_dev, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004911
4912 /* clean up and free blkdev */
4913 rbd_free_disk(rbd_dev);
4914 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06004915
Alex Elder2ac4e752012-07-10 20:30:10 -05004916 /* release allocated disk header fields */
4917 rbd_header_free(&rbd_dev->header);
4918
Alex Elder32eec682012-02-08 16:11:14 -06004919 /* done with the id, and with the rbd_dev */
Alex Eldere2839302012-08-29 17:11:06 -05004920 rbd_dev_id_put(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004921 rbd_assert(rbd_dev->rbd_client != NULL);
4922 rbd_dev_destroy(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004923
4924 /* release module ref */
4925 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004926}
4927
Alex Elder2f82ee52012-10-30 19:40:33 -05004928static void __rbd_remove(struct rbd_device *rbd_dev)
4929{
4930 rbd_remove_all_snaps(rbd_dev);
4931 rbd_bus_del_dev(rbd_dev);
4932}
4933
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004934static ssize_t rbd_remove(struct bus_type *bus,
4935 const char *buf,
4936 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004937{
4938 struct rbd_device *rbd_dev = NULL;
4939 int target_id, rc;
4940 unsigned long ul;
4941 int ret = count;
4942
4943 rc = strict_strtoul(buf, 10, &ul);
4944 if (rc)
4945 return rc;
4946
4947 /* convert to int; abort if we lost anything in the conversion */
4948 target_id = (int) ul;
4949 if (target_id != ul)
4950 return -EINVAL;
4951
4952 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4953
4954 rbd_dev = __rbd_get_dev(target_id);
4955 if (!rbd_dev) {
4956 ret = -ENOENT;
4957 goto done;
4958 }
4959
Alex Eldera14ea262013-02-05 13:23:12 -06004960 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06004961 if (rbd_dev->open_count)
Alex Elder42382b72012-11-16 09:29:16 -06004962 ret = -EBUSY;
Alex Elderb82d1672013-01-14 12:43:31 -06004963 else
4964 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
Alex Eldera14ea262013-02-05 13:23:12 -06004965 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06004966 if (ret < 0)
Alex Elder42382b72012-11-16 09:29:16 -06004967 goto done;
Alex Elder42382b72012-11-16 09:29:16 -06004968
Alex Elder2f82ee52012-10-30 19:40:33 -05004969 while (rbd_dev->parent_spec) {
4970 struct rbd_device *first = rbd_dev;
4971 struct rbd_device *second = first->parent;
4972 struct rbd_device *third;
4973
4974 /*
4975 * Follow to the parent with no grandparent and
4976 * remove it.
4977 */
4978 while (second && (third = second->parent)) {
4979 first = second;
4980 second = third;
4981 }
4982 __rbd_remove(second);
4983 rbd_spec_put(first->parent_spec);
4984 first->parent_spec = NULL;
4985 first->parent_overlap = 0;
4986 first->parent = NULL;
4987 }
4988 __rbd_remove(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004989
4990done:
4991 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05004992
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004993 return ret;
4994}
4995
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004996/*
4997 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004998 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004999 */
5000static int rbd_sysfs_init(void)
5001{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005002 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005003
Alex Elderfed4c142012-02-07 12:03:36 -06005004 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06005005 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005006 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005007
Alex Elderfed4c142012-02-07 12:03:36 -06005008 ret = bus_register(&rbd_bus_type);
5009 if (ret < 0)
5010 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005011
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005012 return ret;
5013}
5014
5015static void rbd_sysfs_cleanup(void)
5016{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005017 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06005018 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005019}
5020
Alex Eldercc344fa2013-02-19 12:25:56 -06005021static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005022{
5023 int rc;
5024
Alex Elder1e32d342013-01-30 11:13:33 -06005025 if (!libceph_compatible(NULL)) {
5026 rbd_warn(NULL, "libceph incompatibility (quitting)");
5027
5028 return -EINVAL;
5029 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005030 rc = rbd_sysfs_init();
5031 if (rc)
5032 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06005033 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005034 return 0;
5035}
5036
Alex Eldercc344fa2013-02-19 12:25:56 -06005037static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005038{
5039 rbd_sysfs_cleanup();
5040}
5041
5042module_init(rbd_init);
5043module_exit(rbd_exit);
5044
5045MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5046MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5047MODULE_DESCRIPTION("rados block device");
5048
5049/* following authorship retained from original osdblk.c */
5050MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5051
5052MODULE_LICENSE("GPL");