-
Notifications
You must be signed in to change notification settings - Fork 117
/
Copy pathimage-mdraid.c
488 lines (430 loc) · 16.4 KB
/
image-mdraid.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
/*
* Copyright (c) 2025 Tomas Mudrunka <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* MDRAID Superblock generator
* This should create valid mdraid superblock for raid1 with 1 device (more devices can be added once mounted).
* Unlike mdadm this works completely in userspace and does not need kernel to create the ondisk structures.
* It is still very basic, but following seems to be working:
*
* mdadm --examine test.img
* losetup /dev/loop1 test.img
* mdadm --assemble md /dev/loop1
*
* Some docs:
* https://raid.wiki.kernel.org/index.php/RAID_superblock_formats#Sub-versions_of_the_version-1_superblock
* https://docs.huihoo.com/doxygen/linux/kernel/3.7/md__p_8h_source.html
*/
#include <confuse.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <errno.h>
#include <linux/raid/md_p.h>
#include "genimage.h"
#define DATA_OFFSET_SECTORS (2048)
#define DATA_OFFSET_BYTES (DATA_OFFSET_SECTORS * 512)
#define BITMAP_SECTORS_MAX 256
/* (should be divisible by 8 sectors to keep 4kB alignment) */
#define MDRAID_ALIGN_BYTES 8 * 512
/*
* Array creation timestamp has to be identical across all the raid members,
* so we share it between invocations
*/
static time_t mdraid_time = 0;
/*
* bitmap structures:
* Taken from Linux kernel drivers/md/md-bitmap.h
* (Currently it's missing from linux-libc-dev debian package, so cannot be simply included)
*/
/* clang-format off */
#ifndef BITMAP_MAGIC
#define BITMAP_MAGIC 0x6d746962 // This is actualy just char string saying "bitm" :-)
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
BITMAP_HOSTENDIAN =15,
};
/* the superblock at the front of the bitmap file -- little endian */
typedef struct bitmap_super_s {
__le32 magic; /* 0 BITMAP_MAGIC */
__le32 version; /* 4 the bitmap major for now, could change... */
__u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */
__le64 events; /* 24 event counter for the bitmap (1)*/
__le64 events_cleared;/*32 event counter when last bit cleared (2) */
__le64 sync_size; /* 40 the size of the md device's sync range(3) */
__le32 state; /* 48 bitmap state information */
__le32 chunksize; /* 52 the bitmap chunk size in bytes */
__le32 daemon_sleep; /* 56 seconds between disk flushes */
__le32 write_behind; /* 60 number of outstanding write-behind writes */
__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
* reserved for the bitmap. */
__le32 nodes; /* 68 the maximum number of nodes in cluster. */
__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
__u8 pad[256 - 136]; /* set to zero */
} bitmap_super_t;
/* clang-format on */
/*
* notes:
* (1) This event counter is updated before the eventcounter in the md superblock
* When a bitmap is loaded, it is only accepted if this event counter is equal
* to, or one greater than, the event counter in the superblock.
* (2) This event counter is updated when the other one is *if*and*only*if* the
* array is not degraded. As bits are not cleared when the array is degraded,
* this represents the last time that any bits were cleared.
* If a device is being added that has an event count with this value or
* higher, it is accepted as conforming to the bitmap.
* (3)This is the number of sectors represented by the bitmap, and is the range that
* resync happens across. For raid1 and raid5/6 it is the size of individual
* devices. For raid10 it is the size of the array.
*/
#endif //BITMAP_MAGIC
/* Superblock struct sanity check */
ct_assert(offsetof(struct mdp_superblock_1, data_offset) == 128);
ct_assert(offsetof(struct mdp_superblock_1, utime) == 192);
ct_assert(sizeof(struct mdp_superblock_1) == 256);
/* This structure is used to store mdraid state data in handler_priv */
typedef struct mdraid_img_s {
/* Partition of data to be imported to raid */
struct partition img_data_part;
/* Partition of parent raid image (we can inherit config from it) */
struct partition img_parent_part;
/* Images for aforementioned partitions */
struct image *img_data;
/* Dtto */
struct image *img_parent;
/* Actual mdraid superblock that is gonna be stored on disk */
struct mdp_superblock_1 *sb;
/* Actual bitmap superblock that is gonna be stored on disk */
bitmap_super_t bsb;
/* This is counter used by slave devices to take roles */
__le16 last_role;
} mdraid_img_t;
static unsigned int calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
unsigned int disk_csum, csum;
unsigned long long newcsum;
int size = sizeof(*sb) + __le32_to_cpu(sb->max_dev) * 2;
unsigned int *isuper = (unsigned int *)sb;
/* Temporarily set checksum in struct to 0 while remembering original value */
disk_csum = sb->sb_csum;
sb->sb_csum = 0;
newcsum = 0;
for (; size >= 4; size -= 4) {
newcsum += __le32_to_cpu(*isuper);
isuper++;
}
if (size == 2)
newcsum += __le16_to_cpu(*(unsigned short *)isuper);
csum = (newcsum & 0xffffffff) + (newcsum >> 32);
/* Set checksum in struct back to original value */
sb->sb_csum = disk_csum;
return __cpu_to_le32(csum);
}
static int mdraid_generate(struct image *image)
{
mdraid_img_t *md = image->handler_priv;
/* Inheriting from this parent if not NULL */
mdraid_img_t *mdp = NULL;
__le16 max_devices;
/* Determine max_devices while considering possibility of inheritance from other image */
if (md->img_parent) {
mdp = md->img_parent->handler_priv;
max_devices = mdp->sb->raid_disks;
} else {
max_devices = cfg_getint(image->imagesec, "devices");
}
/* Determine role of this device in array */
__le16 role = cfg_getint(image->imagesec, "role");
if (cfg_getint(image->imagesec, "role") == -1) {
/* If role is -1 it should be autoassigned to parenting devices */
if (mdp) {
/* Take role from master and increment its counter */
role = ++mdp->last_role;
} else {
/* Master has role of 0 */
role = 0;
}
image_info(image, "MDRAID automaticaly assigned role %d.\n", role);
}
if (role > MD_DISK_ROLE_MAX) {
image_error(image, "MDRAID role has to be >= 0 and <= %d.\n", MD_DISK_ROLE_MAX);
return 6;
}
if (role >= max_devices) {
image_error(image, "MDRAID role of this image has to be lower than total number of %d devices (roles are counted from 0).\n",
max_devices);
return 5;
}
/* MD Superblock and Bitmap Superblock */
size_t superblock_size = sizeof(struct mdp_superblock_1) + max_devices * 2;
struct mdp_superblock_1 *sb = md->sb = xzalloc(superblock_size);
bitmap_super_t *bsb = &md->bsb;
if (mdp) {
/* We are inheriting the superblock in this case */
memcpy(md->sb, mdp->sb, superblock_size);
//memcpy(&md->bsb, &mdp->bsb, sizeof(bitmap_super_t));
} else {
/* We are not inheriting superblock, therefore we need to fully initialize the array */
char *name = cfg_getstr(image->imagesec, "label");
/* constant array information - 128 bytes */
/* MD_SB_MAGIC: 0xa92b4efc - little endian. */
sb->magic = MD_SB_MAGIC;
/* Always 1 for 1.xx metadata version :-) */
sb->major_version = 1;
/* bit 0 set if 'bitmap_offset' is meaningful */
sb->feature_map = MD_FEATURE_BITMAP_OFFSET;
/* always set to 0 when writing */
sb->pad0 = 0;
char *raid_uuid = cfg_getstr(image->imagesec, "raid-uuid");
if (!raid_uuid)
raid_uuid = uuid_random();
/* user-space generated. U8[16] */
uuid_parse(raid_uuid, sb->set_uuid);
strncpy(sb->set_name, name, 32);
/* set and interpreted by user-space. CHAR[32] */
sb->set_name[31] = 0;
long int timestamp = cfg_getint(image->imagesec, "timestamp");
if (timestamp >= 0) {
sb->ctime = timestamp & 0xffffffffff;
} else {
/* lo 40 bits are seconds, top 24 are microseconds or 0 */
sb->ctime = mdraid_time & 0xffffffffff;
}
/* -4 (multipath), -1 (linear), 0,1,4,5 */
sb->level = 1;
/* only for raid5 and raid10 currently */
// sb->layout;
/* used size of component devices, in 512byte sectors */
sb->size = (image->size - DATA_OFFSET_BYTES) / 512;
/* in 512byte sectors - not used in raid 1 */
sb->chunksize = 0;
sb->raid_disks = max_devices;
}
/*
* sectors after start of superblock that bitmap starts
* NOTE: signed, so bitmap can be before superblock
* only meaningful of feature_map[0] is set.
*/
sb->bitmap_offset = 8;
/* constant this-device information - 64 bytes */
/* sector start of data, often 0 */
sb->data_offset = DATA_OFFSET_SECTORS;
/* sectors in this device that can be used for data */
sb->data_size = sb->size;
/* sector start of this superblock */
sb->super_offset = 8;
/* permanent identifier of this device - not role in raid (They can be equal tho). */
sb->dev_number = role;
/* number of read errors that were corrected by re-writing */
sb->cnt_corrected_read = 0;
char *disk_uuid = cfg_getstr(image->imagesec, "disk-uuid");
if (!disk_uuid)
disk_uuid = uuid_random();
/* user-space setable, ignored by kernel U8[16] */
uuid_parse(disk_uuid, sb->device_uuid);
/* per-device flags. Only two defined... */
sb->devflags = 0;
/* mask for writemostly flag in above */
//#define WriteMostly1 1
/* Should avoid retries and fixups and just fail */
//#define FailFast1 2
/*
* Bad block log. If there are any bad blocks the feature flag is set.
* If offset and size are non-zero, that space is reserved and available
*/
/* shift from sectors to badblock size, typicaly 9-12 (shift by 9 is equal to 512 sectors per badblock) */
sb->bblog_shift = 9;
/* number of sectors reserved for list */
sb->bblog_size = 8;
/* sector offset from superblock to bblog, signed - not unsigned */
sb->bblog_offset = sb->bitmap_offset + BITMAP_SECTORS_MAX + 8;
/* array state information - 64 bytes */
/* 40 bits second, 24 bits microseconds */
sb->utime = sb->ctime;
/* incremented when superblock updated */
sb->events = 0;
/* data before this offset (from data_offset) known to be in sync */
sb->resync_offset = 0;
/* size of devs[] array to consider */
sb->max_dev = max_devices;
/* set to 0 when writing */
// pad3[64-32];
/*
* device state information. Indexed by dev_number.
* 2 bytes per device
* Note there are no per-device state flags. State information is rolled
* into the 'roles' value. If a device is spare or faulty, then it doesn't
* have a meaningful role.
*/
/* role in array, or 0xffff for a spare, or 0xfffe for faulty */
__le16 *dev_roles = (__le16 *)((char *)sb + sizeof(struct mdp_superblock_1));
/* All devices in array are set as inactive initialy */
//memset(dev_roles, 0xFF, max_devices*2);
/* All devices are assigned roles equal to their dev_number initialy */
for (int i = 0; i < max_devices; i++) {
/* Assign active role to all devices */
dev_roles[i] = i;
}
/* Calculate superblock checksum */
sb->sb_csum = calc_sb_1_csum(sb);
/* Prepare bitmap superblock (bitmaps don't have checksums for performance reasons) */
/* 0 BITMAP_MAGIC - This is actualy just char string saying "bitm" :-) */
bsb->magic = BITMAP_MAGIC;
/* 4 the bitmap major for now, could change... */
bsb->version = 4; /* v4 is compatible with mdraid v1.2 */
/* 8 128 bit uuid - must match md device uuid */
memcpy(bsb->uuid, sb->set_uuid, sizeof(bsb->uuid));
/* 24 event counter for the bitmap (1) */
bsb->events = 0;
/* 32 event counter when last bit cleared (2) */
bsb->events_cleared = 0;
/* 40 the size of the md device's sync range(3) */
bsb->sync_size = sb->data_size;
/* 48 bitmap state information */
bsb->state = 0;
/* 52 the bitmap chunk size in bytes, 64MB is default on linux */
bsb->chunksize = 64 * 1024 * 1024;
/* 5 is considered safe default. 56 seconds between disk flushes */
bsb->daemon_sleep = 5;
/* 60 number of outstanding write-behind writes */
bsb->write_behind = 0;
/* 64 number of 512-byte sectors that are reserved for the bitmap. */
bsb->sectors_reserved = roundup(bsb->sync_size / bsb->chunksize, 8);
/* 68 the maximum number of nodes in cluster. */
bsb->nodes = 0;
/* 72 cluster name to which this md belongs */
//bsb->cluster_name[64];
/* set to zero */
// pad[256 - 136];
/* Increase bitmap chunk size till we fit in sectors max */
while (bsb->sectors_reserved > BITMAP_SECTORS_MAX) {
bsb->chunksize *= 2;
bsb->sectors_reserved = roundup(bsb->sync_size / bsb->chunksize, 8);
}
/* Construct image file */
int ret;
ret = prepare_image(image, image->size);
if (ret)
return ret;
/* Write superblock */
ret = insert_data(image, sb, imageoutfile(image), superblock_size, sb->super_offset * 512);
if (ret)
return ret;
/* Write bitmap */
if (sb->feature_map & MD_FEATURE_BITMAP_OFFSET) {
ret = insert_data(image, bsb, imageoutfile(image), sizeof(*bsb),
(sb->super_offset + sb->bitmap_offset) * 512);
if (ret)
return ret;
}
/* Write data */
if (md->img_data) {
ret = insert_image(image, md->img_data, md->img_data->size, DATA_OFFSET_BYTES, 0, cfg_true);
if (ret)
return ret;
}
return 0;
}
static int mdraid_parse(struct image *image, cfg_t *cfg)
{
mdraid_img_t *md = xzalloc(sizeof(mdraid_img_t));
image->handler_priv = md;
/* Common MDRAID subsystem init */
if (!mdraid_time)
mdraid_time = time(NULL);
/* Sanity checks */
int raid_level = cfg_getint(image->imagesec, "level");
if (raid_level != 1) {
image_error(image, "MDRAID Currently only supporting raid level 1 (mirror)!\n");
return 1;
}
/* Inherit config from parent */
md->img_parent_part.image = cfg_getstr(image->imagesec, "parent");
if (md->img_parent_part.image) {
/* Add parent partition as dependency (so it's built first) */
list_add_tail(&md->img_parent_part.list, &image->partitions);
/* Find parent image */
md->img_parent = image_get(md->img_parent_part.image);
if (!md->img_parent) {
image_error(image, "MDRAID cannot find parent image to inherit metadata config from: %s\n",
md->img_parent_part.image);
return 9;
}
/* Inherit image size from parent */
image_info(image, "MDRAID will inherit array metadata config from parent: %s\n",
md->img_parent->file);
image->size = md->img_parent->size;
}
/* Find data partition to be put inside the array */
if (md->img_parent) {
md->img_data_part.image = cfg_getstr(md->img_parent->imagesec, "image");
} else {
md->img_data_part.image = cfg_getstr(image->imagesec, "image");
}
/* Add data partition as dependency (so it's built first) */
if (md->img_data_part.image) {
list_add_tail(&md->img_data_part.list, &image->partitions);
}
return 0;
}
static int mdraid_setup(struct image *image, cfg_t *cfg)
{
mdraid_img_t *md = image->handler_priv;
/* Find data image and its metadata if data partition exists */
if (md->img_data_part.image) {
image_info(image, "MDRAID using data from: %s\n", md->img_data_part.image);
md->img_data = image_get(md->img_data_part.image);
if (!md->img_data) {
image_error(image, "MDRAID cannot get image definition: %s\n", md->img_data_part.image);
return 8;
}
if (image->size == 0)
image->size = roundup(md->img_data->size + DATA_OFFSET_BYTES, MDRAID_ALIGN_BYTES);
if (image->size < (md->img_data->size + DATA_OFFSET_BYTES)) {
image_error(image, "MDRAID image too small to fit %s\n", md->img_data->file);
return 3;
}
} else {
image_info(image, "MDRAID is created without data.\n");
}
/* Make sure size is aligned */
if (image->size != roundup(image->size, MDRAID_ALIGN_BYTES)) {
image_error(image, "MDRAID image size has to be aligned to %d bytes!\n", MDRAID_ALIGN_BYTES);
return 4;
}
return 0;
}
static cfg_opt_t mdraid_opts[] = {
CFG_STR("label", "localhost:42", CFGF_NONE),
CFG_INT("level", 1, CFGF_NONE),
CFG_INT("devices", 1, CFGF_NONE),
CFG_INT("role", -1, CFGF_NONE),
CFG_INT("timestamp", -1, CFGF_NONE),
CFG_STR("raid-uuid", NULL, CFGF_NONE),
CFG_STR("disk-uuid", NULL, CFGF_NONE),
CFG_STR("image", NULL, CFGF_NONE),
CFG_STR("parent", NULL, CFGF_NONE),
CFG_END()
};
struct image_handler mdraid_handler = {
.type = "mdraid",
.no_rootpath = cfg_true,
.parse = mdraid_parse,
.setup = mdraid_setup,
.generate = mdraid_generate,
.opts = mdraid_opts,
};