Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: ctnetlink: support filtering by zone

conntrack zones are heavily used by tools like openvswitch to run
multiple virtual "routers" on a single machine. In this context each
conntrack zone matches to a single router, thereby preventing
overlapping IPs from becoming issues.
In these systems it is common to operate on all conntrack entries of a
given zone, e.g. to delete them when a router is deleted. Previously this
required these tools to dump the full conntrack table and filter out the
relevant entries in userspace, potentially causing performance issues.

To do this we reuse the existing CTA_ZONE attribute. This was previously
parsed but not used during dump and flush requests. Now if CTA_ZONE is
set we filter these operations based on the provided zone.
However this means that users that previously passed CTA_ZONE will
experience a difference in functionality.

Alternatively CTA_FILTER could have been used for the same
functionality. However it is not yet supported during flush requests and
is only available when using AF_INET or AF_INET6.

Co-developed-by: Luca Czesla <luca.czesla@mail.schwarz>
Signed-off-by: Luca Czesla <luca.czesla@mail.schwarz>
Co-developed-by: Max Lamprecht <max.lamprecht@mail.schwarz>
Signed-off-by: Max Lamprecht <max.lamprecht@mail.schwarz>
Signed-off-by: Felix Huettner <felix.huettner@mail.schwarz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Felix Huettner and committed by
Pablo Neira Ayuso
eff3c558 08e4c8c5

+442 -5
+8 -4
net/netfilter/nf_conntrack_netlink.c
··· 992 992 if (err) 993 993 goto err_filter; 994 994 995 - if (!cda[CTA_FILTER]) 996 - return filter; 997 - 998 995 err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); 999 996 if (err < 0) 1000 997 goto err_filter; 998 + 999 + if (!cda[CTA_FILTER]) 1000 + return filter; 1001 1001 1002 1002 err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); 1003 1003 if (err < 0) ··· 1043 1043 1044 1044 static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) 1045 1045 { 1046 - return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS]; 1046 + return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE]; 1047 1047 } 1048 1048 1049 1049 static int ctnetlink_start(struct netlink_callback *cb) ··· 1146 1146 * then match everything. 1147 1147 */ 1148 1148 if (filter->family && nf_ct_l3num(ct) != filter->family) 1149 + goto ignore_entry; 1150 + 1151 + if (filter->zone.id != NF_CT_DEFAULT_ZONE_ID && 1152 + !nf_ct_zone_equal_any(ct, &filter->zone)) 1149 1153 goto ignore_entry; 1150 1154 1151 1155 if (filter->orig_flags) {
+2
tools/testing/selftests/netfilter/.gitignore
··· 2 2 nf-queue 3 3 connect_close 4 4 audit_logread 5 + conntrack_dump_flush 6 + sctp_collision
+2 -1
tools/testing/selftests/netfilter/Makefile
··· 14 14 CFLAGS += $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) 15 15 LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) 16 16 17 - TEST_GEN_FILES = nf-queue connect_close audit_logread sctp_collision 17 + TEST_GEN_FILES = nf-queue connect_close audit_logread sctp_collision \ 18 + conntrack_dump_flush 18 19 19 20 include ../lib.mk
+430
tools/testing/selftests/netfilter/conntrack_dump_flush.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #define _GNU_SOURCE 4 + 5 + #include <time.h> 6 + #include <libmnl/libmnl.h> 7 + #include <netinet/ip.h> 8 + 9 + #include <linux/netlink.h> 10 + #include <linux/netfilter/nfnetlink.h> 11 + #include <linux/netfilter/nfnetlink_conntrack.h> 12 + #include <linux/netfilter/nf_conntrack_tcp.h> 13 + #include "../kselftest_harness.h" 14 + 15 + #define TEST_ZONE_ID 123 16 + #define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2) 17 + 18 + static int reply_counter; 19 + 20 + static int build_cta_tuple_v4(struct nlmsghdr *nlh, int type, 21 + uint32_t src_ip, uint32_t dst_ip, 22 + uint16_t src_port, uint16_t dst_port) 23 + { 24 + struct nlattr *nest, *nest_ip, *nest_proto; 25 + 26 + nest = mnl_attr_nest_start(nlh, type); 27 + if (!nest) 28 + return -1; 29 + 30 + nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); 31 + if (!nest_ip) 32 + return -1; 33 + mnl_attr_put_u32(nlh, CTA_IP_V4_SRC, src_ip); 34 + mnl_attr_put_u32(nlh, CTA_IP_V4_DST, dst_ip); 35 + mnl_attr_nest_end(nlh, nest_ip); 36 + 37 + nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); 38 + if (!nest_proto) 39 + return -1; 40 + mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); 41 + mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); 42 + mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); 43 + mnl_attr_nest_end(nlh, nest_proto); 44 + 45 + mnl_attr_nest_end(nlh, nest); 46 + } 47 + 48 + static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type, 49 + struct in6_addr src_ip, struct in6_addr dst_ip, 50 + uint16_t src_port, uint16_t dst_port) 51 + { 52 + struct nlattr *nest, *nest_ip, *nest_proto; 53 + 54 + nest = mnl_attr_nest_start(nlh, type); 55 + if (!nest) 56 + return -1; 57 + 58 + nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); 59 + if (!nest_ip) 60 + return -1; 61 + mnl_attr_put(nlh, CTA_IP_V6_SRC, sizeof(struct in6_addr), &src_ip); 62 + mnl_attr_put(nlh, CTA_IP_V6_DST, sizeof(struct in6_addr), &dst_ip); 63 + mnl_attr_nest_end(nlh, nest_ip); 64 + 65 + nest_proto 
= mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); 66 + if (!nest_proto) 67 + return -1; 68 + mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); 69 + mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); 70 + mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); 71 + mnl_attr_nest_end(nlh, nest_proto); 72 + 73 + mnl_attr_nest_end(nlh, nest); 74 + } 75 + 76 + static int build_cta_proto(struct nlmsghdr *nlh) 77 + { 78 + struct nlattr *nest, *nest_proto; 79 + 80 + nest = mnl_attr_nest_start(nlh, CTA_PROTOINFO); 81 + if (!nest) 82 + return -1; 83 + 84 + nest_proto = mnl_attr_nest_start(nlh, CTA_PROTOINFO_TCP); 85 + if (!nest_proto) 86 + return -1; 87 + mnl_attr_put_u8(nlh, CTA_PROTOINFO_TCP_STATE, TCP_CONNTRACK_ESTABLISHED); 88 + mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, 0x0a0a); 89 + mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_REPLY, 0x0a0a); 90 + mnl_attr_nest_end(nlh, nest_proto); 91 + 92 + mnl_attr_nest_end(nlh, nest); 93 + } 94 + 95 + static int conntrack_data_insert(struct mnl_socket *sock, struct nlmsghdr *nlh, 96 + uint16_t zone) 97 + { 98 + char buf[MNL_SOCKET_BUFFER_SIZE]; 99 + struct nlmsghdr *rplnlh; 100 + unsigned int portid; 101 + int err, ret; 102 + 103 + portid = mnl_socket_get_portid(sock); 104 + 105 + ret = build_cta_proto(nlh); 106 + if (ret < 0) { 107 + perror("build_cta_proto"); 108 + return -1; 109 + } 110 + mnl_attr_put_u32(nlh, CTA_TIMEOUT, htonl(20000)); 111 + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); 112 + 113 + if (mnl_socket_sendto(sock, nlh, nlh->nlmsg_len) < 0) { 114 + perror("mnl_socket_sendto"); 115 + return -1; 116 + } 117 + 118 + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); 119 + if (ret < 0) { 120 + perror("mnl_socket_recvfrom"); 121 + return ret; 122 + } 123 + 124 + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); 125 + if (ret < 0) { 126 + if (errno == EEXIST) { 127 + /* The entries are probably still there from a previous 128 + * run. 
So we are good 129 + */ 130 + return 0; 131 + } 132 + perror("mnl_cb_run"); 133 + return ret; 134 + } 135 + 136 + return 0; 137 + } 138 + 139 + static int conntrack_data_generate_v4(struct mnl_socket *sock, uint32_t src_ip, 140 + uint32_t dst_ip, uint16_t zone) 141 + { 142 + char buf[MNL_SOCKET_BUFFER_SIZE]; 143 + struct nlmsghdr *nlh; 144 + struct nfgenmsg *nfh; 145 + int ret; 146 + 147 + nlh = mnl_nlmsg_put_header(buf); 148 + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; 149 + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | 150 + NLM_F_ACK | NLM_F_EXCL; 151 + nlh->nlmsg_seq = time(NULL); 152 + 153 + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); 154 + nfh->nfgen_family = AF_INET; 155 + nfh->version = NFNETLINK_V0; 156 + nfh->res_id = 0; 157 + 158 + ret = build_cta_tuple_v4(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, 12345, 443); 159 + if (ret < 0) { 160 + perror("build_cta_tuple_v4"); 161 + return ret; 162 + } 163 + ret = build_cta_tuple_v4(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, 443, 12345); 164 + if (ret < 0) { 165 + perror("build_cta_tuple_v4"); 166 + return ret; 167 + } 168 + return conntrack_data_insert(sock, nlh, zone); 169 + } 170 + 171 + static int conntrack_data_generate_v6(struct mnl_socket *sock, 172 + struct in6_addr src_ip, 173 + struct in6_addr dst_ip, 174 + uint16_t zone) 175 + { 176 + char buf[MNL_SOCKET_BUFFER_SIZE]; 177 + struct nlmsghdr *nlh; 178 + struct nfgenmsg *nfh; 179 + int ret; 180 + 181 + nlh = mnl_nlmsg_put_header(buf); 182 + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; 183 + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | 184 + NLM_F_ACK | NLM_F_EXCL; 185 + nlh->nlmsg_seq = time(NULL); 186 + 187 + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); 188 + nfh->nfgen_family = AF_INET6; 189 + nfh->version = NFNETLINK_V0; 190 + nfh->res_id = 0; 191 + 192 + ret = build_cta_tuple_v6(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, 193 + 12345, 443); 194 + if (ret < 0) { 195 + 
perror("build_cta_tuple_v6"); 196 + return ret; 197 + } 198 + ret = build_cta_tuple_v6(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, 199 + 12345, 443); 200 + if (ret < 0) { 201 + perror("build_cta_tuple_v6"); 202 + return ret; 203 + } 204 + return conntrack_data_insert(sock, nlh, zone); 205 + } 206 + 207 + static int count_entries(const struct nlmsghdr *nlh, void *data) 208 + { 209 + reply_counter++; 210 + } 211 + 212 + static int conntracK_count_zone(struct mnl_socket *sock, uint16_t zone) 213 + { 214 + char buf[MNL_SOCKET_BUFFER_SIZE]; 215 + struct nlmsghdr *nlh, *rplnlh; 216 + struct nfgenmsg *nfh; 217 + struct nlattr *nest; 218 + unsigned int portid; 219 + int err, ret; 220 + 221 + portid = mnl_socket_get_portid(sock); 222 + 223 + nlh = mnl_nlmsg_put_header(buf); 224 + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET; 225 + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; 226 + nlh->nlmsg_seq = time(NULL); 227 + 228 + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); 229 + nfh->nfgen_family = AF_UNSPEC; 230 + nfh->version = NFNETLINK_V0; 231 + nfh->res_id = 0; 232 + 233 + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); 234 + 235 + ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); 236 + if (ret < 0) { 237 + perror("mnl_socket_sendto"); 238 + return ret; 239 + } 240 + 241 + reply_counter = 0; 242 + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); 243 + while (ret > 0) { 244 + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, 245 + count_entries, NULL); 246 + if (ret <= MNL_CB_STOP) 247 + break; 248 + 249 + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); 250 + } 251 + if (ret < 0) { 252 + perror("mnl_socket_recvfrom"); 253 + return ret; 254 + } 255 + 256 + return reply_counter; 257 + } 258 + 259 + static int conntrack_flush_zone(struct mnl_socket *sock, uint16_t zone) 260 + { 261 + char buf[MNL_SOCKET_BUFFER_SIZE]; 262 + struct nlmsghdr *nlh, *rplnlh; 263 + struct nfgenmsg *nfh; 264 + struct nlattr *nest; 265 + 
unsigned int portid; 266 + int err, ret; 267 + 268 + portid = mnl_socket_get_portid(sock); 269 + 270 + nlh = mnl_nlmsg_put_header(buf); 271 + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_DELETE; 272 + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 273 + nlh->nlmsg_seq = time(NULL); 274 + 275 + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); 276 + nfh->nfgen_family = AF_UNSPEC; 277 + nfh->version = NFNETLINK_V0; 278 + nfh->res_id = 0; 279 + 280 + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); 281 + 282 + ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); 283 + if (ret < 0) { 284 + perror("mnl_socket_sendto"); 285 + return ret; 286 + } 287 + 288 + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); 289 + if (ret < 0) { 290 + perror("mnl_socket_recvfrom"); 291 + return ret; 292 + } 293 + 294 + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); 295 + if (ret < 0) { 296 + perror("mnl_cb_run"); 297 + return ret; 298 + } 299 + 300 + return 0; 301 + } 302 + 303 + FIXTURE(conntrack_dump_flush) 304 + { 305 + struct mnl_socket *sock; 306 + }; 307 + 308 + FIXTURE_SETUP(conntrack_dump_flush) 309 + { 310 + struct in6_addr src, dst; 311 + int ret; 312 + 313 + self->sock = mnl_socket_open(NETLINK_NETFILTER); 314 + if (!self->sock) { 315 + perror("mnl_socket_open"); 316 + exit(EXIT_FAILURE); 317 + } 318 + 319 + if (mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID) < 0) { 320 + perror("mnl_socket_bind"); 321 + exit(EXIT_FAILURE); 322 + } 323 + 324 + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); 325 + if (ret < 0 && errno == EPERM) 326 + SKIP(return, "Needs to be run as root"); 327 + else if (ret < 0 && errno == EOPNOTSUPP) 328 + SKIP(return, "Kernel does not seem to support conntrack zones"); 329 + 330 + ret = conntrack_data_generate_v4(self->sock, 0xf0f0f0f0, 0xf1f1f1f1, 331 + TEST_ZONE_ID); 332 + EXPECT_EQ(ret, 0); 333 + ret = conntrack_data_generate_v4(self->sock, 0xf2f2f2f2, 0xf3f3f3f3, 334 + TEST_ZONE_ID + 1); 335 
+ EXPECT_EQ(ret, 0); 336 + ret = conntrack_data_generate_v4(self->sock, 0xf4f4f4f4, 0xf5f5f5f5, 337 + TEST_ZONE_ID + 2); 338 + EXPECT_EQ(ret, 0); 339 + 340 + src = (struct in6_addr) {{ 341 + .__u6_addr32 = { 342 + 0xb80d0120, 343 + 0x00000000, 344 + 0x00000000, 345 + 0x01000000 346 + } 347 + }}; 348 + dst = (struct in6_addr) {{ 349 + .__u6_addr32 = { 350 + 0xb80d0120, 351 + 0x00000000, 352 + 0x00000000, 353 + 0x02000000 354 + } 355 + }}; 356 + ret = conntrack_data_generate_v6(self->sock, src, dst, 357 + TEST_ZONE_ID); 358 + EXPECT_EQ(ret, 0); 359 + src = (struct in6_addr) {{ 360 + .__u6_addr32 = { 361 + 0xb80d0120, 362 + 0x00000000, 363 + 0x00000000, 364 + 0x03000000 365 + } 366 + }}; 367 + dst = (struct in6_addr) {{ 368 + .__u6_addr32 = { 369 + 0xb80d0120, 370 + 0x00000000, 371 + 0x00000000, 372 + 0x04000000 373 + } 374 + }}; 375 + ret = conntrack_data_generate_v6(self->sock, src, dst, 376 + TEST_ZONE_ID + 1); 377 + EXPECT_EQ(ret, 0); 378 + src = (struct in6_addr) {{ 379 + .__u6_addr32 = { 380 + 0xb80d0120, 381 + 0x00000000, 382 + 0x00000000, 383 + 0x05000000 384 + } 385 + }}; 386 + dst = (struct in6_addr) {{ 387 + .__u6_addr32 = { 388 + 0xb80d0120, 389 + 0x00000000, 390 + 0x00000000, 391 + 0x06000000 392 + } 393 + }}; 394 + ret = conntrack_data_generate_v6(self->sock, src, dst, 395 + TEST_ZONE_ID + 2); 396 + EXPECT_EQ(ret, 0); 397 + 398 + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); 399 + EXPECT_GE(ret, 2); 400 + if (ret > 2) 401 + SKIP(return, "kernel does not support filtering by zone"); 402 + } 403 + 404 + FIXTURE_TEARDOWN(conntrack_dump_flush) 405 + { 406 + } 407 + 408 + TEST_F(conntrack_dump_flush, test_dump_by_zone) 409 + { 410 + int ret; 411 + 412 + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); 413 + EXPECT_EQ(ret, 2); 414 + } 415 + 416 + TEST_F(conntrack_dump_flush, test_flush_by_zone) 417 + { 418 + int ret; 419 + 420 + ret = conntrack_flush_zone(self->sock, TEST_ZONE_ID); 421 + EXPECT_EQ(ret, 0); 422 + ret = 
conntracK_count_zone(self->sock, TEST_ZONE_ID); 423 + EXPECT_EQ(ret, 0); 424 + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); 425 + EXPECT_EQ(ret, 2); 426 + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); 427 + EXPECT_EQ(ret, 2); 428 + } 429 + 430 + TEST_HARNESS_MAIN