Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _BCACHEFS_EXTENTS_FORMAT_H
3#define _BCACHEFS_EXTENTS_FORMAT_H
4
5/*
6 * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
7 * preceded by checksum/compression information (bch_extent_crc32 or
8 * bch_extent_crc64).
9 *
10 * One major determining factor in the format of extents is how we handle and
11 * represent extents that have been partially overwritten and thus trimmed:
12 *
13 * If an extent is not checksummed or compressed, when the extent is trimmed we
14 * don't have to remember the extent we originally allocated and wrote: we can
15 * merely adjust ptr->offset to point to the start of the data that is currently
16 * live. The size field in struct bkey records the current (live) size of the
17 * extent, and is also used to mean "size of region on disk that we point to" in
18 * this case.
19 *
20 * Thus an extent that is not checksummed or compressed will consist only of a
21 * list of bch_extent_ptrs, with none of the fields in
22 * bch_extent_crc32/bch_extent_crc64.
23 *
24 * When an extent is checksummed or compressed, it's not possible to read only
25 * the data that is currently live: we have to read the entire extent that was
26 * originally written, and then return only the part of the extent that is
27 * currently live.
28 *
29 * Thus, in addition to the current size of the extent in struct bkey, we need
30 * to store the size of the originally allocated space - this is the
31 * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
32 * when the extent is trimmed, instead of modifying the offset field of the
33 * pointer, we keep a second smaller offset field - "offset into the original
34 * extent of the currently live region".
35 *
36 * The other major determining factor is replication and data migration:
37 *
38 * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
39 * write, we will initially write all the replicas in the same format, with the
40 * same checksum type and compression format - however, when copygc runs later (or
41 * tiering/cache promotion, anything that moves data), it is not in general
42 * going to rewrite all the pointers at once - one of the replicas may be in a
43 * bucket on one device that has very little fragmentation while another lives
44 * in a bucket that has become heavily fragmented, and thus is being rewritten
45 * sooner than the rest.
46 *
47 * Thus it will only move a subset of the pointers (or in the case of
48 * tiering/cache promotion perhaps add a single pointer without dropping any
49 * current pointers), and if the extent has been partially overwritten it must
50 * write only the currently live portion (or copygc would not be able to reduce
51 * fragmentation!) - which necessitates a different bch_extent_crc format for
52 * the new pointer.
53 *
54 * But in the interests of space efficiency, we don't want to store one
55 * bch_extent_crc for each pointer if we don't have to.
56 *
57 * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
58 * bch_extent_ptrs appended arbitrarily one after the other. We determine the
59 * type of a given entry with a scheme similar to utf8 (except we're encoding a
60 * type, not a size), encoding the type in the position of the first set bit:
61 *
62 * bch_extent_crc32 - 0b1
63 * bch_extent_ptr - 0b10
64 * bch_extent_crc64 - 0b100
65 *
66 * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
67 * bch_extent_crc64 is the least constrained).
68 *
69 * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
70 * until the next bch_extent_crc32/64.
71 *
72 * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
73 * is neither checksummed nor compressed.
74 */
75
/*
 * x-macro list of extent entry types. The numeric value is part of the
 * on-disk format: per the encoding scheme described above, it selects the
 * position of the first set bit in an entry's type field - do not renumber.
 */
#define BCH_EXTENT_ENTRY_TYPES() \
	x(ptr, 0) \
	x(crc32, 1) \
	x(crc64, 2) \
	x(crc128, 3) \
	x(stripe_ptr, 4) \
	x(rebalance, 5) \
	x(flags, 6)
/* Number of entry types above; keep in sync when extending the list: */
#define BCH_EXTENT_ENTRY_MAX 7

/* BCH_EXTENT_ENTRY_ptr = 0, BCH_EXTENT_ENTRY_crc32 = 1, ... */
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
	BCH_EXTENT_ENTRY_TYPES()
#undef x
};
91
/*
 * Most compact checksum/compression entry: 32 bit checksum, 7 bit sizes,
 * no nonce (CRC32_NONCE_MAX is 0).
 *
 * Compressed/uncompressed size are stored biased by 1 (a stored value of 0
 * means 1), which is why CRC32_SIZE_MAX is 1U << 7 rather than that minus 1.
 *
 * type is 2 bits: crc32 entries are identified by the first set bit being
 * bit 0 (0b1 - see the encoding scheme comment above).
 */
struct bch_extent_crc32 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u32 type:2,
	 _compressed_size:7,
	 _uncompressed_size:7,
	 offset:7,
	 _unused:1,
	 csum_type:4,
	 compression_type:4;
	__u32 csum;
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches: */
	__u32 csum;
	__u32 compression_type:4,
	 csum_type:4,
	 _unused:1,
	 offset:7,
	 _uncompressed_size:7,
	 _compressed_size:7,
	 type:2;
#endif
} __packed __aligned(8);

/* Max representable size; sizes are stored biased by 1 (see above): */
#define CRC32_SIZE_MAX (1U << 7)
/* crc32 entries have no nonce field: */
#define CRC32_NONCE_MAX 0
117
/*
 * Mid-size checksum/compression entry: 9 bit sizes/offset, 10 bit nonce,
 * and an 80 bit checksum split across csum_hi (16 bits in the bitfield
 * word) and csum_lo (a full __u64).
 *
 * type is 3 bits: crc64 entries have their first set bit at bit 2 (0b100 -
 * see the encoding scheme comment above). Sizes are stored biased by 1,
 * as with bch_extent_crc32.
 */
struct bch_extent_crc64 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64 type:3,
	 _compressed_size:9,
	 _uncompressed_size:9,
	 offset:9,
	 nonce:10,
	 csum_type:4,
	 compression_type:4,
	 csum_hi:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches: */
	__u64 csum_hi:16,
	 compression_type:4,
	 csum_type:4,
	 nonce:10,
	 offset:9,
	 _uncompressed_size:9,
	 _compressed_size:9,
	 type:3;
#endif
	__u64 csum_lo;
} __packed __aligned(8);

/* Max representable size; sizes are stored biased by 1: */
#define CRC64_SIZE_MAX (1U << 9)
/* nonce is 10 bits wide: */
#define CRC64_NONCE_MAX ((1U << 10) - 1)
143
/*
 * Largest checksum/compression entry: 13 bit sizes/offset/nonce and a
 * full struct bch_csum (defined elsewhere) for the checksum.
 *
 * type is 4 bits: crc128 entries have their first set bit at bit 3
 * (0b1000 - see the encoding scheme comment above). Sizes are stored
 * biased by 1, as with the smaller crc entries.
 */
struct bch_extent_crc128 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64 type:4,
	 _compressed_size:13,
	 _uncompressed_size:13,
	 offset:13,
	 nonce:13,
	 csum_type:4,
	 compression_type:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches: */
	__u64 compression_type:4,
	 csum_type:4,
	 nonce:13,
	 offset:13,
	 _uncompressed_size:13,
	 _compressed_size:13,
	 type:4;
#endif
	struct bch_csum csum;
} __packed __aligned(8);

/* Max representable size; sizes are stored biased by 1: */
#define CRC128_SIZE_MAX (1U << 13)
/* nonce is 13 bits wide: */
#define CRC128_NONCE_MAX ((1U << 13) - 1)
167
168/*
169 * @reservation - pointer hasn't been written to, just reserved
170 */
171struct bch_extent_ptr {
172#if defined(__LITTLE_ENDIAN_BITFIELD)
173 __u64 type:1,
174 cached:1,
175 unused:1,
176 unwritten:1,
177 offset:44, /* 8 petabytes */
178 dev:8,
179 gen:8;
180#elif defined (__BIG_ENDIAN_BITFIELD)
181 __u64 gen:8,
182 dev:8,
183 offset:44,
184 unwritten:1,
185 unused:1,
186 cached:1,
187 type:1;
188#endif
189} __packed __aligned(8);
190
/*
 * Entry tying an extent to a stripe (erasure coding, per the name -
 * field semantics defined by the stripe code elsewhere).
 *
 * type is 5 bits: stripe_ptr is entry type 4, so the first set bit is
 * bit 4 (see the encoding scheme comment above).
 */
struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64 type:5,
	 block:8,
	 redundancy:4,
	 idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches: */
	__u64 idx:47,
	 redundancy:4,
	 block:8,
	 type:5;
#endif
};
204
/*
 * x-macro list of per-extent flag bits stored in bch_extent_flags.flags;
 * the numeric value is the bit position and is part of the on-disk format.
 */
#define BCH_EXTENT_FLAGS() \
	x(poisoned, 0)

enum bch_extent_flags_e {
#define x(n, v) BCH_EXTENT_FLAG_##n = v,
	BCH_EXTENT_FLAGS()
#undef x
};

/*
 * Extent entry holding a 57 bit flags word (bits per BCH_EXTENT_FLAGS).
 * type is 7 bits: flags is entry type 6, so the first set bit is bit 6.
 */
struct bch_extent_flags {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64 type:7,
	 flags:57;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64 flags:57,
	 type:7;
#endif
};
223
224/* bch_extent_rebalance: */
225#include "rebalance_format.h"
226
/*
 * Generic view of one entry in an extent's entry list: the 'type' member
 * aliases the word holding each entry struct's type bits, so the entry
 * kind can be decoded (first-set-bit scheme, see above) without knowing
 * which member is live.
 *
 * On big endian 32-bit, the low-order bits of the entries' leading __u64
 * land in the second 32-bit word, hence the pad before 'type' there.
 */
union bch_extent_entry {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
	unsigned long type;
#elif __BITS_PER_LONG == 32
	struct {
		unsigned long pad;
		unsigned long type;
	};
#else
#error edit for your odd byteorder.
#endif

/* One member per entry type: ptr, crc32, crc64, crc128, stripe_ptr, ... */
#define x(f, n) struct bch_extent_##f f;
	BCH_EXTENT_ENTRY_TYPES()
#undef x
};
243
/*
 * Btree node pointer (v1): the value is just a list of bch_extent_ptrs -
 * btree pointers don't carry checksum/compression entries.
 * _data[0] overlays start[] so the value can also be addressed as __u64s.
 */
struct bch_btree_ptr {
	struct bch_val v;

	__u64 _data[0];
	struct bch_extent_ptr start[];
} __packed __aligned(8);
250
/*
 * Btree node pointer, v2: adds a sequence number, sectors_written, flags
 * and min_key ahead of the pointer list.
 *
 * @mem_ptr is native-endian __u64 (not __le64), unlike the other fields -
 * presumably runtime-only state stored in the key; confirm before relying
 * on it on disk.
 * _data[0] overlays start[] so the value can also be addressed as __u64s.
 */
struct bch_btree_ptr_v2 {
	struct bch_val v;

	__u64 mem_ptr;
	__le64 seq;
	__le16 sectors_written;
	__le16 flags;
	struct bpos min_key;
	__u64 _data[0];
	struct bch_extent_ptr start[];
} __packed __aligned(8);

/* Bit 0 of bch_btree_ptr_v2.flags: */
LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
264
/*
 * Extent value: a packed sequence of bch_extent_entry items (crc entries
 * followed by the pointers they apply to - see the file comment above).
 * _data[0] overlays start[] so the value can also be addressed as __u64s.
 */
struct bch_extent {
	struct bch_val v;

	__u64 _data[0];
	union bch_extent_entry start[];
} __packed __aligned(8);
271
/*
 * Maximum size (in u64s) a single pointer could be: a crc128 entry (the
 * largest crc entry type) plus the pointer itself:
 */
#define BKEY_EXTENT_PTR_U64s_MAX \
	((sizeof(struct bch_extent_crc128) + \
	 sizeof(struct bch_extent_ptr)) / sizeof(__u64))

/* Maximum possible size of an entire extent value: */
#define BKEY_EXTENT_VAL_U64s_MAX \
	(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))

/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)

/* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX \
	((sizeof(struct bch_btree_ptr_v2) + \
	 sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
#define BKEY_BTREE_PTR_U64s_MAX \
	(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
290
/*
 * Reservation value: records a replica count and generation with no data
 * pointers. Explicit pad bytes keep the struct at a whole number of u64s.
 */
struct bch_reservation {
	struct bch_val v;

	__le32 generation;
	__u8 nr_replicas;
	__u8 pad[3];
} __packed __aligned(8);
298
299struct bch_inline_data {
300 struct bch_val v;
301 u8 data[];
302};
303
304#endif /* _BCACHEFS_EXTENTS_FORMAT_H */