/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

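/*
 * Example (illustrative): gendisk capacities are expressed in these
 * 512-byte units, so a 1 GB image is presented to the block layer
 * as (1 << 30) >> SECTOR_SHIFT = 2097152 sectors.
 */
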
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

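/*
 * The ((5 * sizeof (int)) / 2 + 1) expression over-approximates the
 * printed width of an int: each byte contributes at most
 * log10(256) ~= 2.41 decimal digits, which 5/2 = 2.5 bounds from
 * above, and the +1 leaves room for a minus sign.  For a 4-byte int
 * this yields 11, enough for "-2147483648".
 */
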
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 obj_version;
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;

	u64		snap_id;
	char		*snap_name;

	struct kref	kref;
};

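/*
 * Example (illustrative): mapping pool "rbd", image "foo", snapshot
 * "snap1" produces a spec whose pool_id, image_id and snap_id are
 * looked up from those names; a parent spec works the other way
 * around, starting from the id tuple recorded in the child's header.
 */
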
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};

struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;	/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)

struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};

struct rbd_mapping {
	u64			size;
	u64			features;
	bool			read_only;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event	*watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};

static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static int rbd_dev_probe(struct rbd_device *rbd_dev);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

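/*
 * Typical use (an illustrative call, not one taken from this file):
 *
 *	rbd_warn(rbd_dev, "failed to refresh header: %d", ret);
 *
 * A null rbd_dev is allowed; the message is then tagged with the
 * bare driver name.
 */
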
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);

static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}

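/*
 * A sketch of the open/remove interlock, based on the rbd_dev_flags
 * comment above: the removal path sets RBD_DEV_FLAG_REMOVING under
 * rbd_dev->lock and refuses while open_count is nonzero, and
 * rbd_open() fails with -ENOENT once the flag is set, so a mapping
 * cannot disappear while it is in use.
 */
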
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);
	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false

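/*
 * parse_rbd_opts_token() below is handed each option token that the
 * generic ceph option parser does not itself recognize; e.g. a
 * token of "ro" or "read_only" marks the mapping read-only.
 */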
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}
	return 0;
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}

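/*
 * Note that on success rbd_get_client() leaves the caller owning a
 * reference on the returned client, and *ceph_opts has been
 * consumed either way: destroyed here when an existing client is
 * reused, or handed off to the newly created ceph_client.
 */
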
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to remove the client from the
 * list, so the caller must not already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node.  If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX) {
			/* Free the prefix allocated above; don't leak it */
			kfree(header->object_prefix);
			header->object_prefix = NULL;
			return -EIO;
		}
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees the
		 * ondisk buffer we're working with has snap_names_len
		 * bytes beyond the end of the snapshot id array, so
		 * this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		WARN_ON(ondisk->snap_names_len);
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->features = 0;	/* No features support in v1 images */
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	header->image_size = le64_to_cpu(ondisk->image_size);
	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] =
			le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}

static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{
	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}

static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);

done:
	return ret;
}

static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}

static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}

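/*
 * Worked example (illustrative values): with object_prefix
 * "rb.0.1234" and obj_order 22 (4 MB objects), image offset
 * 0x123456789 falls in segment 0x123456789 >> 22 = 0x48d, giving
 * object name "rb.0.1234.00000000048d"; rbd_segment_offset() yields
 * 0x123456789 & 0x3fffff = 0x56789 within that object.
 */
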
/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}

/*
 * similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		page_offset = (size_t)(offset & ~PAGE_MASK);
		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}

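/*
 * Example use (a sketch; the image request code later in this file
 * does the equivalent): splitting a block request at object
 * boundaries means calling this once per object, e.g.
 *
 *	bio = bio_chain_clone_range(&bio_list, &bio_offset,
 *				    segment_length, GFP_ATOMIC);
 *
 * after which the bio_list/bio_offset cursor already refers to the
 * start of the next object's data.
 */
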
/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
			obj_request);
	}
}

static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case we ignore the later-arriving
 * response.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}

static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}

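/*
 * Taken together, the helpers above implement the existence-check
 * protocol used for layered images: obj_request_known_test() says
 * whether a STAT of the target object has completed, and only once
 * it returns true is obj_request_exists_test() meaningful.  EXISTS
 * is set before KNOWN so a caller polling KNOWN finds EXISTS
 * already valid.
 */
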
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}

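/*
 * Note the LIFO discipline the assertions above encode: an object
 * request may only be deleted when it is the most recently added
 * one, so on removal "which" must equal the decremented
 * obj_request_count.  This is also why for_each_obj_request_safe()
 * walks the list in reverse.
 */
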
static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}

static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}

1311static void rbd_img_request_complete(struct rbd_img_request *img_request)
1312{
Alex Elder55f27e02013-04-10 12:34:25 -05001313
Alex Elder37206ee2013-02-20 17:32:08 -06001314 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001315
	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request. We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; it is not clear offhand which way is better.
	 */
	if (!img_request->result) {
		struct rbd_obj_request *obj_request;
		u64 xferred = 0;

		for_each_obj_request(img_request, obj_request)
			xferred += obj_request->xferred;
		img_request->xferred = xferred;
	}

	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}

/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}
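
/*
 * Illustrative sketch, not driver code: the synchronous call pattern
 * built from the two helpers above, as used by the sync paths later
 * in this file (submit the request, wait for its completion, then
 * pick up the result recorded by the completion callback):
 *
 *	ret = rbd_obj_request_submit(osdc, obj_request);
 *	if (!ret)
 *		ret = rbd_obj_request_wait(obj_request);
 *	if (!ret)
 *		ret = obj_request->result;
 */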

/*
 * The default/initial value for all image request flags is 0. Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}

static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}

static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}

static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
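
/*
 * Illustrative note (sketch): each set_bit()/test_bit() pair above is
 * bracketed by smp_mb() so that a flag set while the image request is
 * being initialized is visible to any CPU that later tests it in the
 * I/O paths, e.g.:
 *
 *	if (img_request_write_test(img_request))
 *		snapc = img_request->snapc;
 */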

static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	u64 xferred = obj_request->xferred;
	u64 length = obj_request->length;

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		xferred, length);
	/*
	 * ENOENT means a hole in the image. We zero-fill the
	 * entire length of the request. A short read also implies
	 * zero-fill to the end of the request. Either way we
	 * update the xferred count to indicate the whole request
	 * was satisfied.
	 */
	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
	if (obj_request->result == -ENOENT) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, 0);
		else
			zero_pages(obj_request->pages, 0, length);
		obj_request->result = 0;
		obj_request->xferred = length;
	} else if (xferred < length && !obj_request->result) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, xferred);
		else
			zero_pages(obj_request->pages, xferred, length);
		obj_request->xferred = length;
	}
	obj_request_done_set(obj_request);
}

static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}

static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_device *rbd_dev = NULL;
	bool layered = false;

	if (obj_request_img_data_test(obj_request)) {
		img_request = obj_request->img_request;
		layered = img_request && img_request_layered_test(img_request);
		rbd_dev = img_request->rbd_dev;
	}

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	if (layered && obj_request->result == -ENOENT &&
			obj_request->img_offset < rbd_dev->parent_overlap)
		rbd_img_parent_read(obj_request);
	else if (img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}

static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write. Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}

/*
 * For a simple stat call there's nothing to do. We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	if (obj_request_img_data_test(obj_request)) {
		rbd_assert(obj_request->img_request);
		rbd_assert(obj_request->which != BAD_WHICH);
	} else {
		rbd_assert(obj_request->which == BAD_WHICH);
	}

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	BUG_ON(osd_req->r_num_ops > 2);

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}

static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	u64 snap_id;

	rbd_assert(osd_req != NULL);

	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
	ceph_osdc_build_request(osd_req, obj_request->offset,
			NULL, snap_id, NULL);
}

static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	struct ceph_snap_context *snapc;
	struct timespec mtime = CURRENT_TIME;

	rbd_assert(osd_req != NULL);

	snapc = img_request ? img_request->snapc : NULL;
	ceph_osdc_build_request(osd_req, obj_request->offset,
			snapc, CEPH_NOSNAP, &mtime);
}

static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request)
{
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	if (obj_request_img_data_test(obj_request)) {
		struct rbd_img_request *img_request = obj_request->img_request;

		rbd_assert(write_request ==
				img_request_write_test(img_request));
		if (write_request)
			snapc = img_request->snapc;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	if (write_request)
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	else
		osd_req->r_flags = CEPH_OSD_FLAG_READ;

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	return osd_req;
}
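
/*
 * Illustrative sketch, not driver code: the usual life cycle of a
 * single-op osd request as built by rbd_img_request_fill() below
 * (create the request, describe the extent op, then format it for a
 * read or a write before it is submitted):
 *
 *	osd_req = rbd_osd_req_create(rbd_dev, write_request, obj_request);
 *	osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 0, 0);
 *	if (write_request)
 *		rbd_osd_req_format_write(obj_request);
 *	else
 *		rbd_osd_req_format_read(obj_request);
 */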

/*
 * Create a copyup osd request based on the information in the
 * object request supplied. A copyup request has two osd ops:
 * a copyup method call and a "normal" write request.
 */
static struct ceph_osd_request *
rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc;
	struct rbd_device *rbd_dev;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;
	rbd_assert(img_request);
	rbd_assert(img_request_write_test(img_request));

	/* Allocate and initialize the request, for the two ops */

	snapc = img_request->snapc;
	rbd_dev = img_request->rbd_dev;
	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	return osd_req;
}
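
/*
 * Illustrative sketch of the two-op layout built on such a request by
 * rbd_img_obj_parent_read_full_callback() below (not driver code):
 *
 *	op 0: CEPH_OSD_OP_CALL	"rbd" class "copyup" method, with the
 *				parent data pages as request data
 *	op 1: CEPH_OSD_OP_WRITE	the original write's offset, length,
 *				and bio data
 */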

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}

/* object_name is assumed to be a non-null pointer and NUL-terminated */

static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->flags = 0;
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
		offset, length, (int)type, obj_request);

	return obj_request;
}
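
/*
 * Illustrative note (sketch, not driver code): the object name is
 * stored in the same allocation as the request structure itself,
 * immediately following it, which is why the single kfree() in
 * rbd_obj_request_destroy() releases both:
 *
 *	[ struct rbd_obj_request | object name (NUL-terminated) ]
 */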

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request,
					bool child_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->flags = 0;
	if (write_request) {
		img_request_write_set(img_request);
		img_request->snapc = snapc;
	} else {
		img_request->snap_id = rbd_dev->spec->snap_id;
	}
	if (child_request)
		img_request_child_set(img_request);
	if (rbd_dev->parent_spec)
		img_request_layered_set(img_request);
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->result = 0;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		write_request ? "write" : "read", offset, length,
		img_request);

	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request_write_test(img_request))
		ceph_put_snap_context(img_request->snapc);

	if (img_request_child_test(img_request))
		rbd_obj_request_put(img_request->obj_request);

	kfree(img_request);
}

static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	unsigned int xferred;
	int result;
	bool more;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
	xferred = (unsigned int)obj_request->xferred;
	result = obj_request->result;
	if (result) {
		struct rbd_device *rbd_dev = img_request->rbd_dev;

		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
			img_request_write_test(img_request) ? "write" : "read",
			obj_request->length, obj_request->img_offset,
			obj_request->offset);
		rbd_warn(rbd_dev, "  result %d xferred %x\n",
			result, xferred);
		if (!img_request->result)
			img_request->result = result;
	}

	/* Image object requests don't own their page array */

	if (obj_request->type == OBJ_REQUEST_PAGES) {
		obj_request->pages = NULL;
		obj_request->page_count = 0;
	}

	if (img_request_child_test(img_request)) {
		rbd_assert(img_request->obj_request != NULL);
		more = obj_request->which < img_request->obj_request_count - 1;
	} else {
		rbd_assert(img_request->rq != NULL);
		more = blk_end_request(img_request->rq, result, xferred);
	}

	return more;
}

static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;
		more = rbd_img_obj_end_request(obj_request);
		which++;
	}

	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
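
/*
 * Illustrative note (sketch): object requests may complete in any
 * order, but the loop above only retires them in "which" order. A
 * completion that arrives out of order just marks its request done
 * and returns; when the request at next_completion finally
 * completes, the loop sweeps forward through every request that is
 * already done.
 */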

/*
 * Split up an image request into one or more object requests, each
 * to a different object. The "type" parameter indicates whether
 * "data_desc" is the pointer to the head of a list of bio
 * structures, or the base of a page array. In either case this
 * function assumes data_desc describes memory sufficient to hold
 * all data described by the image request.
 */
static int rbd_img_request_fill(struct rbd_img_request *img_request,
					enum obj_request_type type,
					void *data_desc)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	bool write_request = img_request_write_test(img_request);
	struct bio *bio_list;
	unsigned int bio_offset = 0;
	struct page **pages;
	u64 img_offset;
	u64 resid;
	u16 opcode;

	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
		(int)type, data_desc);

	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
	img_offset = img_request->offset;
	resid = img_request->length;
	rbd_assert(resid > 0);

	if (type == OBJ_REQUEST_BIO) {
		bio_list = data_desc;
		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
	} else {
		rbd_assert(type == OBJ_REQUEST_PAGES);
		pages = data_desc;
	}

	while (resid) {
		struct ceph_osd_request *osd_req;
		const char *object_name;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, img_offset);
		if (!object_name)
			goto out_unwind;
		offset = rbd_segment_offset(rbd_dev, img_offset);
		length = rbd_segment_length(rbd_dev, img_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length, type);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		if (type == OBJ_REQUEST_BIO) {
			unsigned int clone_size;

			rbd_assert(length <= (u64)UINT_MAX);
			clone_size = (unsigned int)length;
			obj_request->bio_list =
					bio_chain_clone_range(&bio_list,
								&bio_offset,
								clone_size,
								GFP_ATOMIC);
			if (!obj_request->bio_list)
				goto out_partial;
		} else {
			unsigned int page_count;

			obj_request->pages = pages;
			page_count = (u32)calc_pages_for(offset, length);
			obj_request->page_count = page_count;
			if ((offset + length) & ~PAGE_MASK)
				page_count--;	/* more on last page */
			pages += page_count;
		}

		osd_req = rbd_osd_req_create(rbd_dev, write_request,
						obj_request);
		if (!osd_req)
			goto out_partial;
		obj_request->osd_req = osd_req;
		obj_request->callback = rbd_img_obj_callback;

		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
						0, 0);
		if (type == OBJ_REQUEST_BIO)
			osd_req_op_extent_osd_data_bio(osd_req, 0,
					obj_request->bio_list, length);
		else
			osd_req_op_extent_osd_data_pages(osd_req, 0,
					obj_request->pages, length,
					offset & ~PAGE_MASK, false, false);

		if (write_request)
			rbd_osd_req_format_write(obj_request);
		else
			rbd_osd_req_format_read(obj_request);

		obj_request->img_offset = img_offset;
		rbd_img_obj_request_add(img_request, obj_request);

		img_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
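
/*
 * Illustrative sketch, not driver code: how an image request is
 * typically built and issued (the same create/fill/submit sequence
 * appears in rbd_img_parent_read() and in the block request handler
 * below):
 *
 *	img_request = rbd_img_request_create(rbd_dev, offset, length,
 *						write_request, false);
 *	if (img_request &&
 *	    !rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, bio_list))
 *		(void) rbd_img_request_submit(img_request);
 */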

static void
rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	struct rbd_device *rbd_dev;
	u64 length;
	u32 page_count;

	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;
	rbd_assert(img_request);

	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev);
	length = (u64)1 << rbd_dev->header.obj_order;
	page_count = (u32)calc_pages_for(0, length);

	rbd_assert(obj_request->copyup_pages);
	ceph_release_page_vector(obj_request->copyup_pages, page_count);
	obj_request->copyup_pages = NULL;

	/*
	 * We want the transfer count to reflect the size of the
	 * original write request. There is no such thing as a
	 * successful short write, so if the request was successful
	 * we can just set it to the originally-requested length.
	 */
	if (!obj_request->result)
		obj_request->xferred = obj_request->length;

	/* Finish up with the normal image object callback */

	rbd_img_obj_callback(obj_request);
}

static void
rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *orig_request;
	struct ceph_osd_request *osd_req;
	struct ceph_osd_client *osdc;
	struct rbd_device *rbd_dev;
	struct page **pages;
	int result;
	u64 obj_size;
	u64 xferred;

	rbd_assert(img_request_child_test(img_request));

	/* First get what we need from the image request */

	pages = img_request->copyup_pages;
	rbd_assert(pages != NULL);
	img_request->copyup_pages = NULL;

	orig_request = img_request->obj_request;
	rbd_assert(orig_request != NULL);
	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
	result = img_request->result;
	obj_size = img_request->length;
	xferred = img_request->xferred;

	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev);
	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);

	rbd_img_request_put(img_request);

	if (result)
		goto out_err;

	/* Allocate the new copyup osd request for the original request */

	result = -ENOMEM;
	rbd_assert(!orig_request->osd_req);
	osd_req = rbd_osd_req_create_copyup(orig_request);
	if (!osd_req)
		goto out_err;
	orig_request->osd_req = osd_req;
	orig_request->copyup_pages = pages;

	/* Initialize the copyup op */

	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
						false, false);

	/* Then the original write request op */

	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
				orig_request->offset,
				orig_request->length, 0, 0);
	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
					orig_request->length);

	rbd_osd_req_format_write(orig_request);

	/* All set, send it off. */

	orig_request->callback = rbd_img_obj_copyup_callback;
	osdc = &rbd_dev->rbd_client->client->osdc;
	result = rbd_obj_request_submit(osdc, orig_request);
	if (!result)
		return;
out_err:
	/* Record the error code and complete the request */

	orig_request->result = result;
	orig_request->xferred = 0;
	obj_request_done_set(orig_request);
	rbd_obj_request_complete(orig_request);
}

/*
 * Read from the parent image the range of data that covers the
 * entire target of the given object request. This is used for
 * satisfying a layered image write request when the target of an
 * object request from the image request does not exist.
 *
 * A page array big enough to hold the returned data is allocated
 * and supplied to rbd_img_request_fill() as the "data descriptor."
 * When the read completes, this page array will be transferred to
 * the original object request for the copyup operation.
 *
 * If an error occurs, record it as the result of the original
 * object request and mark it done so it gets completed.
 */
static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_img_request *parent_request = NULL;
	struct rbd_device *rbd_dev;
	u64 img_offset;
	u64 length;
	struct page **pages = NULL;
	u32 page_count;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev->parent != NULL);

	/*
	 * First things first. The original osd request is of no
	 * use to us any more; we'll need a new one that can hold
	 * the two ops in a copyup request. We'll get that later,
	 * but for now we can release the old one.
	 */
2158 rbd_osd_req_destroy(obj_request->osd_req);
2159 obj_request->osd_req = NULL;
2160
2161 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002162 * Determine the byte range covered by the object in the
2163 * child image to which the original request was to be sent.
2164 */
2165 img_offset = obj_request->img_offset - obj_request->offset;
2166 length = (u64)1 << rbd_dev->header.obj_order;
2167
2168 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002169 * There is no defined parent data beyond the parent
2170 * overlap, so limit what we read at that boundary if
2171 * necessary.
2172 */
2173 if (img_offset + length > rbd_dev->parent_overlap) {
2174 rbd_assert(img_offset < rbd_dev->parent_overlap);
2175 length = rbd_dev->parent_overlap - img_offset;
2176 }
2177
2178 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002179 * Allocate a page array big enough to receive the data read
2180 * from the parent.
2181 */
2182 page_count = (u32)calc_pages_for(0, length);
2183 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2184 if (IS_ERR(pages)) {
2185 result = PTR_ERR(pages);
2186 pages = NULL;
2187 goto out_err;
2188 }
2189
2190 result = -ENOMEM;
2191 parent_request = rbd_img_request_create(rbd_dev->parent,
2192 img_offset, length,
2193 false, true);
2194 if (!parent_request)
2195 goto out_err;
2196 rbd_obj_request_get(obj_request);
2197 parent_request->obj_request = obj_request;
2198
2199 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2200 if (result)
2201 goto out_err;
2202 parent_request->copyup_pages = pages;
2203
2204 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2205 result = rbd_img_request_submit(parent_request);
2206 if (!result)
2207 return 0;
2208
2209 parent_request->copyup_pages = NULL;
2210 parent_request->obj_request = NULL;
2211 rbd_obj_request_put(obj_request);
2212out_err:
2213 if (pages)
2214 ceph_release_page_vector(pages, page_count);
2215 if (parent_request)
2216 rbd_img_request_put(parent_request);
2217 obj_request->result = result;
2218 obj_request->xferred = 0;
2219 obj_request_done_set(obj_request);
2220
2221 return result;
2222}
2223
Alex Elderc5b5ef62013-02-11 12:33:24 -06002224static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2225{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002226 struct rbd_obj_request *orig_request;
2227 int result;
2228
2229 rbd_assert(!obj_request_img_data_test(obj_request));
2230
2231 /*
2232 * All we need from the object request is the original
2233 * request and the result of the STAT op. Grab those, then
2234 * we're done with the request.
2235 */
2236 orig_request = obj_request->obj_request;
2237 obj_request->obj_request = NULL;
2238 rbd_assert(orig_request);
2239 rbd_assert(orig_request->img_request);
2240
2241 result = obj_request->result;
2242 obj_request->result = 0;
2243
2244 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2245 obj_request, orig_request, result,
2246 obj_request->xferred, obj_request->length);
2247 rbd_obj_request_put(obj_request);
2248
2249 rbd_assert(orig_request);
2250 rbd_assert(orig_request->img_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002251
2252 /*
2253 * Our only purpose here is to determine whether the object
2254 * exists, and we don't want to treat the non-existence as
2255 * an error. If something else comes back, transfer the
2256 * error to the original request and complete it now.
2257 */
2258 if (!result) {
2259 obj_request_existence_set(orig_request, true);
2260 } else if (result == -ENOENT) {
2261 obj_request_existence_set(orig_request, false);
2262 } else if (result) {
2263 orig_request->result = result;
Alex Elder3d7efd12013-04-19 15:34:50 -05002264 goto out;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002265 }
2266
2267 /*
2268 * Resubmit the original request now that we have recorded
2269 * whether the target object exists.
2270 */
Alex Elderb454e362013-04-19 15:34:50 -05002271 orig_request->result = rbd_img_obj_request_submit(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002272out:
Alex Elderc5b5ef62013-02-11 12:33:24 -06002273 if (orig_request->result)
2274 rbd_obj_request_complete(orig_request);
2275 rbd_obj_request_put(orig_request);
2276}
2277
2278static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2279{
2280 struct rbd_obj_request *stat_request;
2281 struct rbd_device *rbd_dev;
2282 struct ceph_osd_client *osdc;
2283 struct page **pages = NULL;
2284 u32 page_count;
2285 size_t size;
2286 int ret;
2287
2288 /*
2289 * The response data for a STAT call consists of:
2290 * le64 length;
2291 * struct {
2292 * le32 tv_sec;
2293 * le32 tv_nsec;
2294 * } mtime;
2295 */
2296 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2297 page_count = (u32)calc_pages_for(0, size);
2298 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2299 if (IS_ERR(pages))
2300 return PTR_ERR(pages);
2301
2302 ret = -ENOMEM;
2303 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2304 OBJ_REQUEST_PAGES);
2305 if (!stat_request)
2306 goto out;
2307
2308 rbd_obj_request_get(obj_request);
2309 stat_request->obj_request = obj_request;
2310 stat_request->pages = pages;
2311 stat_request->page_count = page_count;
2312
2313 rbd_assert(obj_request->img_request);
2314 rbd_dev = obj_request->img_request->rbd_dev;
2315 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2316 stat_request);
2317 if (!stat_request->osd_req)
2318 goto out;
2319 stat_request->callback = rbd_img_obj_exists_callback;
2320
2321 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2322 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2323 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002324 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002325
2326 osdc = &rbd_dev->rbd_client->client->osdc;
2327 ret = rbd_obj_request_submit(osdc, stat_request);
2328out:
2329 if (ret)
2330 rbd_obj_request_put(obj_request);
2331
2332 return ret;
2333}
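
/*
 * Illustrative sketch, not driver code: the STAT reply layout
 * described in rbd_img_obj_exists_submit() above, expressed as a
 * packed structure. The type and field names are assumptions made
 * for illustration; only the wire layout comes from the comment:
 *
 *	struct stat_reply {
 *		__le64 length;
 *		struct {
 *			__le32 tv_sec;
 *			__le32 tv_nsec;
 *		} mtime;
 *	} __attribute__ ((packed));
 */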

static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	struct rbd_device *rbd_dev;
	bool known;

	rbd_assert(obj_request_img_data_test(obj_request));

	img_request = obj_request->img_request;
	rbd_assert(img_request);
	rbd_dev = img_request->rbd_dev;

	/*
	 * Only writes to layered images need special handling.
	 * Reads and non-layered writes are simple object requests.
	 * Layered writes that start beyond the end of the overlap
	 * with the parent have no parent data, so they too are
	 * simple object requests. Finally, if the target object is
	 * known to already exist, its parent data has already been
	 * copied, so a write to the object can also be handled as a
	 * simple object request.
	 */
	if (!img_request_write_test(img_request) ||
		!img_request_layered_test(img_request) ||
		rbd_dev->parent_overlap <= obj_request->img_offset ||
		((known = obj_request_known_test(obj_request)) &&
			obj_request_exists_test(obj_request))) {

		struct rbd_device *rbd_dev;
		struct ceph_osd_client *osdc;

		rbd_dev = obj_request->img_request->rbd_dev;
		osdc = &rbd_dev->rbd_client->client->osdc;

		return rbd_obj_request_submit(osdc, obj_request);
	}

	/*
	 * It's a layered write. The target object might exist but
	 * we may not know that yet. If we know it doesn't exist,
	 * start by reading the data for the full target object from
	 * the parent so we can use it for a copyup to the target.
	 */
	if (known)
		return rbd_img_obj_parent_read_full(obj_request);

	/* We don't know whether the target exists. Go find out. */

	return rbd_img_obj_exists_submit(obj_request);
}
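
/*
 * Decision summary for the submit path above (illustrative note,
 * not driver code):
 *
 *	read, non-layered write, or write beyond the parent overlap
 *		-> submit the object request directly
 *	layered write, target known to exist
 *		-> submit directly (parent data was already copied up)
 *	layered write, target known not to exist
 *		-> rbd_img_obj_parent_read_full() (copyup path)
 *	layered write, target existence unknown
 *		-> rbd_img_obj_exists_submit() (STAT, then resubmit)
 */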

static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);
	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
		int ret;

		ret = rbd_img_obj_request_submit(obj_request);
		if (ret)
			return ret;
	}

	return 0;
}

static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_device *rbd_dev;
	u64 obj_end;

	rbd_assert(img_request_child_test(img_request));

	obj_request = img_request->obj_request;
	rbd_assert(obj_request);
	rbd_assert(obj_request->img_request);

	obj_request->result = img_request->result;
	if (obj_request->result)
		goto out;

	/*
	 * We need to zero anything beyond the parent overlap
	 * boundary. Since rbd_img_obj_request_read_callback()
	 * will zero anything beyond the end of a short read, an
	 * easy way to do this is to pretend the data from the
	 * parent came up short--ending at the overlap boundary.
	 */
	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
	obj_end = obj_request->img_offset + obj_request->length;
	rbd_dev = obj_request->img_request->rbd_dev;
	if (obj_end > rbd_dev->parent_overlap) {
		u64 xferred = 0;

		if (obj_request->img_offset < rbd_dev->parent_overlap)
			xferred = rbd_dev->parent_overlap -
					obj_request->img_offset;

		obj_request->xferred = min(img_request->xferred, xferred);
	} else {
		obj_request->xferred = img_request->xferred;
	}
out:
	rbd_img_obj_request_read_callback(obj_request);
	rbd_obj_request_complete(obj_request);
}

static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
{
	struct rbd_device *rbd_dev;
	struct rbd_img_request *img_request;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request != NULL);
	rbd_assert(obj_request->result == (s32) -ENOENT);
	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);

	rbd_dev = obj_request->img_request->rbd_dev;
	rbd_assert(rbd_dev->parent != NULL);
	/* rbd_read_finish(obj_request, obj_request->length); */
	img_request = rbd_img_request_create(rbd_dev->parent,
						obj_request->img_offset,
						obj_request->length,
						false, true);
	result = -ENOMEM;
	if (!img_request)
		goto out_err;

	rbd_obj_request_get(obj_request);
	img_request->obj_request = obj_request;

	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
					obj_request->bio_list);
	if (result)
		goto out_err;

	img_request->callback = rbd_img_parent_read_callback;
	result = rbd_img_request_submit(img_request);
	if (result)
		goto out_err;

	return;
out_err:
	if (img_request)
		rbd_img_request_put(img_request);
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);
}

static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
				u64 ver, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;
	obj_request->callback = rbd_obj_request_put;

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
					notify_id, ver, 0);
	rbd_osd_req_format_read(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}

static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
2533 rbd_warn(rbd_dev, "got notification but failed to "
2534 " update snaps: %d\n", rc);

	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
}

/*
 * Request sync osd watch/unwatch. The value of "start" determines
 * whether a watch request is being initiated or torn down.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;

	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
						OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	rbd_osd_req_format_write(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it. We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it. We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
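
/*
 * Illustrative usage (sketch): per the comment above, callers
 * register the header watch with rbd_dev_header_watch_sync(rbd_dev, 1)
 * and tear it down again with rbd_dev_header_watch_sync(rbd_dev, 0).
 */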

/*
 * Synchronous osd object method call
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			const char *object_name,
			const char *class_name,
			const char *method_name,
			const void *outbound,
			size_t outbound_size,
			void *inbound,
			size_t inbound_size,
			u64 *version)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations. The result
	 * should be placed into the inbound buffer provided. They
	 * also supply outbound data--parameters for the object
	 * method. Currently if this is present it will be a
	 * snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
					class_name, method_name);
	if (outbound_size) {
		struct ceph_pagelist *pagelist;

		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
		if (!pagelist)
			goto out;

		ceph_pagelist_init(pagelist);
		ceph_pagelist_append(pagelist, outbound, outbound_size);
		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
						pagelist);
	}
	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
					obj_request->pages, inbound_size,
					0, false, false);
	rbd_osd_req_format_read(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	ret = 0;
	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
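
/*
 * Editor's sketch (not in the original driver): the typical calling
 * pattern for rbd_obj_method_sync().  The class method name used here
 * ("get_example") is hypothetical; real callers below use methods such
 * as "get_size" and "get_object_prefix" from the "rbd" class.
 */
static int __maybe_unused rbd_example_method_call(struct rbd_device *rbd_dev)
{
	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);	/* outbound parameter */
	__le64 result;					/* inbound reply */
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_example",
				&snapid, sizeof (snapid),
				&result, sizeof (result), NULL);
	if (ret < 0)
		return ret;

	dout("get_example returned %llu\n",
		(unsigned long long) le64_to_cpu(result));

	return 0;
}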

static void rbd_request_fn(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			dout("%s: non-fs request type %d\n", __func__,
				(int) rq->cmd_type);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Ignore/skip any zero-length requests */

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		if (!length) {
			dout("%s: zero-length request\n", __func__);
			__blk_end_request_all(rq, 0);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/*
		 * Quit early if the mapped snapshot no longer
		 * exists.  It's still possible the snapshot will
		 * have disappeared by the time our request arrives
		 * at the osd, but there's no sense in sending it if
		 * we already know.
		 */
		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request, false);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
						rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
				write_request ? "write" : "read",
				length, offset, result);

			__blk_end_request_all(rq, result);
		}
	}
}
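
/*
 * Editor's illustration (not in the original driver): the sector-to-byte
 * conversion used at the top of rbd_request_fn().  A request starting at
 * device sector 2048 carrying 8 KiB of data becomes offset 1 MiB,
 * length 8192.
 */
static inline u64 __maybe_unused rbd_example_rq_offset(struct request *rq)
{
	/* blk_rq_pos() is in 512-byte sectors; shift to bytes */
	return (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
}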

/*
 * A queue callback.  Makes sure that we don't create a bio that spans
 * multiple osd objects.  One exception would be single-page bios,
 * which we handle later in bio_chain_clone_range().
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is, with the offset taken relative to
	 * the enclosing device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
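
/*
 * Editor's illustration (not in the original driver): the arithmetic in
 * rbd_merge_bvec() for the default 4 MiB object size (obj_order == 22).
 * sectors_per_obj is then 8192; a bio starting at device sector 8190
 * with an empty bi_size may grow by at most 2 sectors (1024 bytes)
 * before it would cross into the next object.
 */
static inline int __maybe_unused rbd_example_bytes_to_obj_end(u8 obj_order,
							sector_t sector)
{
	sector_t sectors_per_obj = 1 << (obj_order - SECTOR_SHIFT);

	return (int) (sectors_per_obj - (sector & (sectors_per_obj - 1)))
							<< SECTOR_SHIFT;
}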

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
				const char *object_name,
				u64 offset, u64 length,
				void *buf, u64 *version)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct page **pages = NULL;
	u32 page_count;
	size_t size;
	int ret;

	page_count = (u32) calc_pages_for(offset, length);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, offset, length,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
					offset, length, 0, 0);
	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
					obj_request->pages,
					obj_request->length,
					obj_request->offset & ~PAGE_MASK,
					false, false);
	rbd_osd_req_format_read(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;

	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
	size = (size_t) obj_request->xferred;
	ceph_copy_from_page_vector(pages, buf, 0, size);
	rbd_assert(size <= (size_t) INT_MAX);
	ret = (int) size;
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}

/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
					0, size, ondisk, version);
		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			ret = -ENXIO;
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
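
/*
 * Editor's sketch (not in the original driver): the buffer size the
 * loop above computes for a given snapshot count and name-block size.
 * With no snapshots it is just sizeof(struct rbd_image_header_ondisk).
 */
static inline size_t __maybe_unused
rbd_example_v1_header_size(u32 snap_count, u64 names_size)
{
	return sizeof (struct rbd_image_header_ondisk) +
		snap_count * sizeof (struct rbd_image_snap_ondisk) +
		names_size;
}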

/*
 * Reload the on-disk header.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);

	return ret;
}

static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		rbd_remove_snap_dev(snap);
}

static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		return;

	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
	dout("setting size to %llu sectors", (unsigned long long) size);
	rbd_dev->mapping.size = (u64) size;
	set_capacity(rbd_dev->disk, size);
}

/*
 * Only read the first part of the on-disk header, without the snaps info.
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}

static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_refresh(rbd_dev, hver);
	else
		ret = rbd_dev_v2_refresh(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);
	revalidate_disk(rbd_dev->disk);

	return ret;
}

static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
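
/*
 * Editor's illustration (not in the original driver): with the default
 * object order of 22, rbd_obj_bytes() yields 4 MiB, so the queue above
 * advertises max_hw_sectors = 4 MiB / 512 = 8192 and 4 MiB minimum and
 * optimal I/O sizes.  These figures are an example, not a guarantee.
 */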

/*
   sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	return (ssize_t) (bufp - buf);
}
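
/*
 * Editor's illustration (values made up): reading the "parent" attribute
 * of a cloned image produces output of the form
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 1028b2ae8944a
 *	image_name parent-img
 *	snap_id 4
 *	snap_name base
 *	overlap 10737418240
 */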

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev, NULL);

	return ret < 0 ? ret : size;
}

static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};


/*
  sysfs - snapshots
*/

static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}

static ssize_t rbd_snap_features_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}

static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
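
/*
 * Editor's sketch (not in the original driver): the intended rbd_spec
 * lifecycle.  The kref means a spec can be shared, for example between
 * a mapped image and its probed parent; rbd_spec_put() frees the names
 * when the last reference is dropped.
 */
static int __maybe_unused rbd_example_spec_use(void)
{
	struct rbd_spec *spec;

	spec = rbd_spec_alloc();
	if (!spec)
		return -ENOMEM;

	spec->pool_name = kstrdup("rbd", GFP_KERNEL);	/* example values */
	spec->image_name = kstrdup("foo", GFP_KERNEL);
	/* ... hand the spec to rbd_dev_create(), which takes ownership ... */
	rbd_spec_put(spec);		/* drop our reference */

	return 0;
}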

static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	rbd_dev->flags = 0;
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}

static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}

static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}

static int rbd_register_snap_dev(struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}

static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
						const char *snap_name,
						u64 snap_id, u64 snap_size,
						u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}

static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
		u64 *snap_size, u64 *snap_features)
{
	char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	*snap_size = rbd_dev->header.snap_sizes[which];
	*snap_features = 0;	/* No features for v1 */

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return snap_name;
}
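
/*
 * Editor's illustration (not in the original source): the name walk in
 * rbd_dev_v1_snap_info().  With snap_names = "one\0two\0three\0" and
 * which == 2, the loop skips past "one" and "two" and returns a pointer
 * to "three".
 */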

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				&snapid, sizeof (snapid),
				&size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix", NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}

static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				&snapid, sizeof (snapid),
				&features_buf, sizeof (features_buf), NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_SUPPORTED)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
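
/*
 * Editor's illustration (not in the original source): how the incompat
 * check above behaves.  With RBD_FEATURES_SUPPORTED defined as 0, any
 * incompatible feature bit reported by the server (for example
 * layering, bit 0) leaves the mask nonzero, and the image is refused
 * with -ENXIO.
 */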

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}

static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				&snapid, sizeof (snapid),
				reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;
	p = reply_buf;
	end = reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}

static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
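
/*
 * Editor's illustration (not in the original source): the string
 * encoding produced by ceph_encode_string() above -- a little-endian
 * 32-bit length followed by the bytes, with no terminating NUL.  For
 * an image id of "abc" the buffer holds:
 */
static const u8 __maybe_unused rbd_example_encoded_id[] = {
	0x03, 0x00, 0x00, 0x00, 'a', 'b', 'c',
};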

/*
 * When a parent image gets probed, we only have the pool, image,
 * and snapshot ids but not the names of any of them.  This call
 * is made later to fill in those names.  It has to be done after
 * rbd_dev_snaps_update() has completed because some of the
 * information (in particular, snapshot name) is not available
 * until then.
 */
static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc;
	const char *name;
	void *reply_buf = NULL;
	int ret;

	if (rbd_dev->spec->pool_name)
		return 0;	/* Already have the names */

	/* Look up the pool name */

	osdc = &rbd_dev->rbd_client->client->osdc;
	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
	if (!name) {
		rbd_warn(rbd_dev, "there is no pool with id %llu",
			rbd_dev->spec->pool_id);	/* Really a BUG() */
		return -EIO;
	}

	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
	if (!rbd_dev->spec->pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	name = rbd_dev_image_name(rbd_dev);
	if (name)
		rbd_dev->spec->image_name = (char *)name;
	else
		rbd_warn(rbd_dev, "unable to get image name");

	/* Look up the snapshot name. */

	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
	if (!name) {
		rbd_warn(rbd_dev, "no snapshot with id %llu",
			rbd_dev->spec->snap_id);	/* Really a BUG() */
		ret = -EIO;
		goto out_err;
	}
	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
	if (!rbd_dev->spec->snap_name) {
		ret = -ENOMEM;
		goto out_err;
	}

	return 0;
out_err:
	kfree(reply_buf);
	kfree(rbd_dev->spec->pool_name);
	rbd_dev->spec->pool_name = NULL;

	return ret;
}

static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapcontext", NULL, 0,
				reply_buf, size, ver);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	ret = -ERANGE;
	p = reply_buf;
	end = reply_buf + size;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				/ sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;

	size = sizeof (struct ceph_snap_context) +
		snap_count * sizeof (snapc->snaps[0]);
	snapc = kmalloc(size, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}

	atomic_set(&snapc->nref, 1);
	snapc->seq = seq;
	snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	rbd_dev->header.snapc = snapc;

	dout(" snap context seq = %llu, snap_count = %u\n",
		(unsigned long long) seq, (unsigned int) snap_count);
	ret = 0;
out:
	kfree(reply_buf);

	return ret;
}
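
/*
 * Editor's note on the buffer sizing above (not in the original source):
 * with RBD_MAX_SNAP_COUNT snapshots the largest possible reply is
 * 8 + 4 + 510 * 8 = 4092 bytes, so the whole snapshot context always
 * fits within a single 4 KiB page.
 */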

static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				&snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout("  snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}

static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
		u64 *snap_size, u64 *snap_features)
{
	u64 snap_id;
	u8 order;
	int ret;

	snap_id = rbd_dev->header.snapc->snaps[which];
	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
	if (ret)
		return ERR_PTR(ret);
	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
	if (ret)
		return ERR_PTR(ret);

	return rbd_dev_v2_snap_name(rbd_dev, which);
}

static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
		u64 *snap_size, u64 *snap_features)
{
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_info(rbd_dev, which,
					snap_size, snap_features);
	if (rbd_dev->image_format == 2)
		return rbd_dev_v2_snap_info(rbd_dev, which,
					snap_size, snap_features);
	return ERR_PTR(-EINVAL);
}

static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	__u8 obj_order;

	down_write(&rbd_dev->header_rwsem);

	/* Grab old order first, to see if it changes */

	obj_order = rbd_dev->header.obj_order;
	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		goto out;
	if (rbd_dev->header.obj_order != obj_order) {
		ret = -EIO;
		goto out;
	}
	rbd_update_mapping_size(rbd_dev);

	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
	dout("rbd_dev_v2_snap_context returned %d\n", ret);
	if (ret)
		goto out;
	ret = rbd_dev_snaps_update(rbd_dev);
	dout("rbd_dev_snaps_update returned %d\n", ret);
	if (ret)
		goto out;
	ret = rbd_dev_snaps_register(rbd_dev);
	dout("rbd_dev_snaps_register returned %d\n", ret);
out:
	up_write(&rbd_dev->header_rwsem);

	return ret;
}

/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/*
			 * A previously-existing snapshot is not in
			 * the new snap context.
			 *
			 * If the now missing snapshot is the one the
			 * image is mapped to, clear its exists flag
			 * so we can avoid sending any more requests
			 * to it.
			 */
			if (rbd_dev->spec->snap_id == snap->id)
				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);
			rbd_remove_snap_dev(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) index,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout("  failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}

/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s:\n", __func__);
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}

static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}

static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}

static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}

/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
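
/*
 * Concrete instance of the race guarded against above (hypothetical
 * ids): devices { 1, 2, 3 } exist and id 3 is being put.  After the
 * list scan computes max_id == 2, another thread calls
 * rbd_dev_id_get() and bumps rbd_dev_id_max to 4.  The cmpxchg then
 * finds 4 rather than the expected 3, fails, and leaves 4 in place.
 */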

/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
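
/*
 * A minimal sketch of next_token() behavior (hypothetical input):
 * with *buf == "  pool image", the call advances *buf to point at
 * "pool image" and returns 4, the length of the token "pool".
 */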

/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len;

	len = next_token(buf);
	if (len < token_size) {
		memcpy(token, *buf, len);
		*(token + len) = '\0';
	}
	*buf += len;

	return len;
}
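
/*
 * Sketch of copy_token() truncation handling (hypothetical input):
 * copying the token "image" into a 4-byte buffer returns 5, which is
 * >= token_size, so the caller can tell the token did not fit; *buf
 * is advanced past "image" either way.
 */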

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}
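
/*
 * Sketch of dup_token() (hypothetical input): with *buf == "rbd foo",
 * the call returns a freshly-allocated "rbd" (NUL-terminated), stores
 * 3 in *lenp if lenp is non-null, and leaves *buf pointing at " foo".
 */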

/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *	The address of a pointer that will refer to a ceph options
 *	structure.  Caller must release the returned pointer using
 *	ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *	A comma-separated list of one or more monitor addresses.
 *	A monitor address is an ip address, optionally followed
 *	by a port number (separated by a colon).
 *	I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *	A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *	The name of the rados pool containing the rbd image.
 *  <image_name>
 *	The name of the image in that pool to map.
 *  <snap_name>
 *	An optional snapshot name.  If provided, the mapping will
 *	present data from the image at the time that snapshot was
 *	created.  The image head is used if no snapshot name is
 *	provided.  Snapshot mappings are always read-only.
 */
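/*
 * For example, a write like the following (hypothetical monitor
 * address, option values, and names; the secret is elided) maps
 * snapshot "mysnap" of image "myimage" in pool "mypool":
 *
 *   1.2.3.4:6789 name=admin,secret=<elided> mypool myimage mysnap
 */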
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
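/*
 * For example (hypothetical image name, and assuming RBD_ID_PREFIX
 * is "rbd_id." as defined in rbd_types.h): an image named "myimage"
 * has its id stored in the object "rbd_id.myimage", and the id read
 * from there is used to name the image's other metadata objects.
 */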
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * If we already have it we don't need to look it up.
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not); there's no
	 * need to fetch the image id again in that case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id", NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}

static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}

static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}

static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	struct rbd_device *parent = NULL;
	struct rbd_spec *parent_spec = NULL;
	struct rbd_client *rbdc = NULL;
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	/* Probe the parent if there is one */

	if (rbd_dev->parent_spec) {
		/*
		 * We need to pass a reference to the client and the
		 * parent spec when creating the parent rbd_dev.
		 * Images related by parent/child relationships
		 * always share both.
		 */
		parent_spec = rbd_spec_get(rbd_dev->parent_spec);
		rbdc = __rbd_get_client(rbd_dev->rbd_client);

		parent = rbd_dev_create(rbdc, parent_spec);
		if (!parent) {
			ret = -ENOMEM;
			goto err_out_spec;
		}
		rbdc = NULL;		/* parent now owns reference */
		parent_spec = NULL;	/* parent now owns reference */
		ret = rbd_dev_probe(parent);
		if (ret < 0)
			goto err_out_parent;
		rbd_dev->parent = parent;
	}

	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;

err_out_parent:
	rbd_dev_destroy(parent);
err_out_spec:
	rbd_spec_put(parent_spec);
	rbd_put_client(rbdc);
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret) {
		dout("probe failed, returning %d\n", ret);

		return ret;
	}

	ret = rbd_dev_probe_finish(rbd_dev);
	if (ret)
		rbd_header_free(&rbd_dev->header);

	return ret;
}

static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}

static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{
	struct list_head *tmp;
	struct rbd_device *rbd_dev;

	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			spin_unlock(&rbd_dev_list_lock);
			return rbd_dev;
		}
	}
	spin_unlock(&rbd_dev_list_lock);
	return NULL;
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}

static void __rbd_remove(struct rbd_device *rbd_dev)
{
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);
}

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	spin_lock_irq(&rbd_dev->lock);
	if (rbd_dev->open_count)
		ret = -EBUSY;
	else
		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	spin_unlock_irq(&rbd_dev->lock);
	if (ret < 0)
		goto done;

	while (rbd_dev->parent_spec) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		__rbd_remove(second);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
		first->parent_overlap = 0;
		first->parent = NULL;
	}
	__rbd_remove(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
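/*
 * Typical usage from user space (hypothetical monitor address, pool,
 * and image names; secret elided; "0" is the target device id):
 *
 *   echo "1.2.3.4:6789 name=admin,secret=<elided> mypool myimage" \
 *       > /sys/bus/rbd/add
 *   echo 0 > /sys/bus/rbd/remove
 */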
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");

		return -EINVAL;
	}
	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
	return 0;
}

static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");