diff options
author | Lennart Poettering <lennart@poettering.net> | 2021-02-18 18:20:36 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-18 18:20:36 +0100 |
commit | 669963968c29d127f4b5a7ee6bed38b1cba527c2 (patch) | |
tree | 1a45d56a8593f07e51802bc7e5d929487ade3e10 /src | |
parent | 5f94d96c4782eeb272f7815bdfc34ec72add8b26 (diff) | |
parent | 980821f3f0b57c32fbda347f5f764e077388b81d (diff) | |
download | systemd-669963968c29d127f4b5a7ee6bed38b1cba527c2.tar.gz |
Merge pull request #18664 from poettering/resolved-defrag
resolved: many UDP fragmentation tweaks
Diffstat (limited to 'src')
-rw-r--r-- | src/resolve/resolved-dns-packet.c | 14 | ||||
-rw-r--r-- | src/resolve/resolved-dns-packet.h | 40 | ||||
-rw-r--r-- | src/resolve/resolved-dns-scope.c | 92 | ||||
-rw-r--r-- | src/resolve/resolved-dns-scope.h | 2 | ||||
-rw-r--r-- | src/resolve/resolved-dns-server.c | 83 | ||||
-rw-r--r-- | src/resolve/resolved-dns-server.h | 8 | ||||
-rw-r--r-- | src/resolve/resolved-dns-stub.c | 10 | ||||
-rw-r--r-- | src/resolve/resolved-dns-transaction.c | 31 | ||||
-rw-r--r-- | src/resolve/resolved-manager.c | 68 | ||||
-rw-r--r-- | src/resolve/resolved-manager.h | 2 | ||||
-rw-r--r-- | src/resolve/resolved-mdns.c | 2 |
11 files changed, 300 insertions, 52 deletions
diff --git a/src/resolve/resolved-dns-packet.c b/src/resolve/resolved-dns-packet.c index 6db82ba217..9d50336c3f 100644 --- a/src/resolve/resolved-dns-packet.c +++ b/src/resolve/resolved-dns-packet.c @@ -2626,6 +2626,20 @@ int dns_packet_has_nsid_request(DnsPacket *p) { return has_nsid; } +size_t dns_packet_size_unfragmented(DnsPacket *p) { + assert(p); + + if (p->fragsize == 0) /* Wasn't fragmented */ + return p->size; + + /* The fragment size (p->fragsize) covers the whole (fragmented) IP packet, while the regular packet + * size (p->size) only covers the DNS part. Thus, subtract the UDP header from the largest fragment + * size, in order to determine which size of DNS packet would have gone through without + * fragmenting. */ + + return LESS_BY(p->fragsize, udp_header_size(p->family)); +} + static const char* const dns_rcode_table[_DNS_RCODE_MAX_DEFINED] = { [DNS_RCODE_SUCCESS] = "SUCCESS", [DNS_RCODE_FORMERR] = "FORMERR", diff --git a/src/resolve/resolved-dns-packet.h b/src/resolve/resolved-dns-packet.h index ee069537c3..7b2abe3e76 100644 --- a/src/resolve/resolved-dns-packet.h +++ b/src/resolve/resolved-dns-packet.h @@ -2,6 +2,7 @@ #pragma once #include <netinet/ip.h> +#include <netinet/ip6.h> #include <netinet/udp.h> #include "hashmap.h" @@ -32,14 +33,19 @@ struct DnsPacketHeader { be16_t ancount; be16_t nscount; be16_t arcount; -}; +} _packed_; #define DNS_PACKET_HEADER_SIZE sizeof(DnsPacketHeader) -#define UDP_PACKET_HEADER_SIZE (sizeof(struct iphdr) + sizeof(struct udphdr)) +#define UDP4_PACKET_HEADER_SIZE (sizeof(struct iphdr) + sizeof(struct udphdr)) +#define UDP6_PACKET_HEADER_SIZE (sizeof(struct ip6_hdr) + sizeof(struct udphdr)) + +assert_cc(sizeof(struct ip6_hdr) == 40); +assert_cc(sizeof(struct iphdr) == 20); +assert_cc(sizeof(struct udphdr) == 8); +assert_cc(sizeof(DnsPacketHeader) == 12); -/* The various DNS protocols deviate in how large a packet can grow, - * but the TCP transport has a 16bit size field, hence that appears to - * be the absolute 
maximum. */ +/* The various DNS protocols deviate in how large a packet can grow, but the TCP transport has a 16bit size + * field, hence that appears to be the absolute maximum. */ #define DNS_PACKET_SIZE_MAX 0xFFFFu /* The default size to use for allocation when we don't know how large @@ -55,7 +61,7 @@ struct DnsPacketHeader { struct DnsPacket { unsigned n_ref; DnsProtocol protocol; - size_t size, allocated, rindex, max_size; + size_t size, allocated, rindex, max_size, fragsize; void *_data; /* don't access directly, use DNS_PACKET_DATA()! */ Hashmap *names; /* For name compression */ size_t opt_start, opt_size; @@ -146,6 +152,14 @@ static inline bool DNS_PACKET_VERSION_SUPPORTED(DnsPacket *p) { return DNS_RESOURCE_RECORD_OPT_VERSION_SUPPORTED(p->opt); } +static inline bool DNS_PACKET_IS_FRAGMENTED(DnsPacket *p) { + assert(p); + + /* For ingress packets: was this packet fragmented according to our knowledge? */ + + return p->fragsize != 0; +} + /* LLMNR defines some bits differently */ #define DNS_PACKET_LLMNR_C(p) DNS_PACKET_AA(p) #define DNS_PACKET_LLMNR_T(p) DNS_PACKET_RD(p) @@ -307,3 +321,17 @@ static inline size_t dns_packet_size_max(DnsPacket *p) { return p->max_size != 0 ? 
p->max_size : DNS_PACKET_SIZE_MAX; } + +static inline size_t udp_header_size(int af) { + + switch (af) { + case AF_INET: + return UDP4_PACKET_HEADER_SIZE; + case AF_INET6: + return UDP6_PACKET_HEADER_SIZE; + default: + assert_not_reached("Unexpected address family"); + } +} + +size_t dns_packet_size_unfragmented(DnsPacket *p); diff --git a/src/resolve/resolved-dns-scope.c b/src/resolve/resolved-dns-scope.c index f1dff95a86..2bbb85c1bd 100644 --- a/src/resolve/resolved-dns-scope.c +++ b/src/resolve/resolved-dns-scope.c @@ -5,6 +5,7 @@ #include "af-list.h" #include "alloc-util.h" #include "dns-domain.h" +#include "errno-util.h" #include "fd-util.h" #include "hostname-util.h" #include "missing_network.h" @@ -185,43 +186,73 @@ void dns_scope_packet_lost(DnsScope *s, usec_t usec) { s->resend_timeout = MIN(s->resend_timeout * 2, MULTICAST_RESEND_TIMEOUT_MAX_USEC); } -static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) { - union in_addr_union addr; - int ifindex = 0, r; - int family; - uint32_t mtu; +static int dns_scope_emit_one(DnsScope *s, int fd, int family, DnsPacket *p) { + int r; assert(s); assert(p); assert(p->protocol == s->protocol); - if (s->link) { - mtu = s->link->mtu; - ifindex = s->link->ifindex; - } else - mtu = manager_find_mtu(s->manager); + if (family == AF_UNSPEC) { + if (s->family == AF_UNSPEC) + return -EAFNOSUPPORT; + + family = s->family; + } switch (s->protocol) { - case DNS_PROTOCOL_DNS: + case DNS_PROTOCOL_DNS: { + size_t mtu, udp_size, min_mtu, socket_mtu = 0; + assert(fd >= 0); - if (DNS_PACKET_QDCOUNT(p) > 1) + if (DNS_PACKET_QDCOUNT(p) > 1) /* Classic DNS only allows one question per packet */ return -EOPNOTSUPP; if (p->size > DNS_PACKET_UNICAST_SIZE_MAX) return -EMSGSIZE; - if (p->size + UDP_PACKET_HEADER_SIZE > mtu) - return -EMSGSIZE; + /* Determine the most accurate local MTU */ + if (s->link) + mtu = s->link->mtu; + else + mtu = manager_find_mtu(s->manager); + + /* Acquire the socket's path MTU (PMTU) */ + r = socket_get_mtu(fd, 
family, &socket_mtu); + if (r < 0 && !ERRNO_IS_DISCONNECT(r)) /* Will return ENOTCONN if no information is available yet */ + return log_debug_errno(r, "Failed to read socket MTU: %m"); + + /* Determine the appropriate UDP header size */ + udp_size = udp_header_size(family); + min_mtu = udp_size + DNS_PACKET_HEADER_SIZE; + + log_debug("Emitting UDP, link MTU is %zu, socket MTU is %zu, minimal MTU is %zu", + mtu, socket_mtu, min_mtu); + + /* Clamp by the kernel's idea of the (path) MTU */ + if (socket_mtu != 0 && socket_mtu < mtu) + mtu = socket_mtu; + + /* Put a lower limit, in case all MTU data we acquired was rubbish */ + if (mtu < min_mtu) + mtu = min_mtu; + + /* Now check our packet size against the MTU we determined */ + if (udp_size + p->size > mtu) + return -EMSGSIZE; /* This means: try TCP instead */ r = manager_write(s->manager, fd, p); if (r < 0) return r; break; + } + + case DNS_PROTOCOL_LLMNR: { + union in_addr_union addr; - case DNS_PROTOCOL_LLMNR: assert(fd < 0); if (DNS_PACKET_QDCOUNT(p) > 1) @@ -230,8 +261,6 @@ static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) { if (!ratelimit_below(&s->ratelimit)) return -EBUSY; - family = s->family; - if (family == AF_INET) { addr.in = LLMNR_MULTICAST_IPV4_ADDRESS; fd = manager_llmnr_ipv4_udp_fd(s->manager); @@ -243,20 +272,20 @@ static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) { if (fd < 0) return fd; - r = manager_send(s->manager, fd, ifindex, family, &addr, LLMNR_PORT, NULL, p); + r = manager_send(s->manager, fd, s->link->ifindex, family, &addr, LLMNR_PORT, NULL, p); if (r < 0) return r; break; + } - case DNS_PROTOCOL_MDNS: + case DNS_PROTOCOL_MDNS: { + union in_addr_union addr; assert(fd < 0); if (!ratelimit_below(&s->ratelimit)) return -EBUSY; - family = s->family; - if (family == AF_INET) { addr.in = MDNS_MULTICAST_IPV4_ADDRESS; fd = manager_mdns_ipv4_fd(s->manager); @@ -268,11 +297,12 @@ static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) { if (fd < 0) return fd; 
- r = manager_send(s->manager, fd, ifindex, family, &addr, MDNS_PORT, NULL, p); + r = manager_send(s->manager, fd, s->link->ifindex, family, &addr, MDNS_PORT, NULL, p); if (r < 0) return r; break; + } default: return -EAFNOSUPPORT; @@ -281,7 +311,7 @@ static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) { return 1; } -int dns_scope_emit_udp(DnsScope *s, int fd, DnsPacket *p) { +int dns_scope_emit_udp(DnsScope *s, int fd, int af, DnsPacket *p) { int r; assert(s); @@ -296,7 +326,7 @@ int dns_scope_emit_udp(DnsScope *s, int fd, DnsPacket *p) { dns_packet_set_flags(p, true, true); } - r = dns_scope_emit_one(s, fd, p); + r = dns_scope_emit_one(s, fd, af, p); if (r < 0) return r; @@ -410,6 +440,16 @@ static int dns_scope_socket( r = socket_set_recvpktinfo(fd, sa.sa.sa_family, true); if (r < 0) return r; + + /* Turn off path MTU discovery for security reasons */ + r = socket_disable_pmtud(fd, sa.sa.sa_family); + if (r < 0) + log_debug_errno(r, "Failed to disable UDP PMTUD, ignoring: %m"); + + /* Learn about fragmentation taking place */ + r = socket_set_recvfragsize(fd, sa.sa.sa_family, true); + if (r < 0) + log_debug_errno(r, "Failed to enable fragment size reception, ignoring: %m"); } if (ret_socket_address) @@ -1123,7 +1163,7 @@ static int on_conflict_dispatch(sd_event_source *es, usec_t usec, void *userdata return 0; } - r = dns_scope_emit_udp(scope, -1, p); + r = dns_scope_emit_udp(scope, -1, AF_UNSPEC, p); if (r < 0) log_debug_errno(r, "Failed to send conflict packet: %m"); } @@ -1420,7 +1460,7 @@ int dns_scope_announce(DnsScope *scope, bool goodbye) { if (r < 0) return log_debug_errno(r, "Failed to build reply packet: %m"); - r = dns_scope_emit_udp(scope, -1, p); + r = dns_scope_emit_udp(scope, -1, AF_UNSPEC, p); if (r < 0) return log_debug_errno(r, "Failed to send reply packet: %m"); diff --git a/src/resolve/resolved-dns-scope.h b/src/resolve/resolved-dns-scope.h index 7e863d3f66..f63452330c 100644 --- a/src/resolve/resolved-dns-scope.h +++ 
b/src/resolve/resolved-dns-scope.h @@ -71,7 +71,7 @@ DnsScope* dns_scope_free(DnsScope *s); void dns_scope_packet_received(DnsScope *s, usec_t rtt); void dns_scope_packet_lost(DnsScope *s, usec_t usec); -int dns_scope_emit_udp(DnsScope *s, int fd, DnsPacket *p); +int dns_scope_emit_udp(DnsScope *s, int fd, int af, DnsPacket *p); int dns_scope_socket_tcp(DnsScope *s, int family, const union in_addr_union *address, DnsServer *server, uint16_t port, union sockaddr_union *ret_socket_address); int dns_scope_socket_udp(DnsScope *s, DnsServer *server); diff --git a/src/resolve/resolved-dns-server.c b/src/resolve/resolved-dns-server.c index b0eeb04137..e027cdf765 100644 --- a/src/resolve/resolved-dns-server.c +++ b/src/resolve/resolved-dns-server.c @@ -255,7 +255,7 @@ static void dns_server_reset_counters(DnsServer *s) { * incomplete. */ } -void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t size) { +void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t fragsize) { assert(s); if (protocol == IPPROTO_UDP) { @@ -289,10 +289,10 @@ void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLeve dns_server_verified(s, level); - /* Remember the size of the largest UDP packet we received from a server, we know that we can always - * announce support for packets with at least this size. */ - if (protocol == IPPROTO_UDP && s->received_udp_packet_max < size) - s->received_udp_packet_max = size; + /* Remember the size of the largest UDP packet fragment we received from a server, we know that we + * can always announce support for packets with at least this size. 
*/ + if (protocol == IPPROTO_UDP && s->received_udp_fragment_max < fragsize) + s->received_udp_fragment_max = fragsize; } void dns_server_packet_lost(DnsServer *s, int protocol, DnsServerFeatureLevel level) { @@ -389,6 +389,19 @@ void dns_server_packet_do_off(DnsServer *s, DnsServerFeatureLevel level) { s->packet_do_off = true; } +void dns_server_packet_udp_fragmented(DnsServer *s, size_t fragsize) { + assert(s); + + /* Invoked whenever we got a fragmented UDP packet. Let's do two things: keep track of the largest + * fragment we ever received from the server, and remember this, so that we can use it to lower the + * advertised packet size in EDNS0 */ + + if (s->received_udp_fragment_max < fragsize) + s->received_udp_fragment_max = fragsize; + + s->packet_fragmented = true; +} + static bool dns_server_grace_period_expired(DnsServer *s) { usec_t ts; @@ -604,10 +617,47 @@ int dns_server_adjust_opt(DnsServer *server, DnsPacket *packet, DnsServerFeature edns_do = level >= DNS_SERVER_FEATURE_LEVEL_DO; - if (level == DNS_SERVER_FEATURE_LEVEL_LARGE) - packet_size = DNS_PACKET_UNICAST_SIZE_LARGE_MAX; - else - packet_size = server->received_udp_packet_max; + if (level == DNS_SERVER_FEATURE_LEVEL_LARGE) { + size_t udp_size; + + /* In large mode, advertise the local MTU, in order to avoid fragmentation (for security + * reasons) – except if we are talking to localhost (where the security considerations don't + * matter). If we see fragmentation, lower the reported size to the largest fragment, to + * avoid it. */ + + udp_size = udp_header_size(server->family); + + if (in_addr_is_localhost(server->family, &server->address) > 0) + packet_size = 65536 - udp_size; /* force linux loopback MTU if localhost address */ + else { + /* Use the MTU pointing to the server, subtract the IP/UDP header size */ + packet_size = LESS_BY(dns_server_get_mtu(server), udp_size); + + /* On the Internet we want to avoid fragmentation for security reasons. 
If we saw + * fragmented packets, the above was too large, let's clamp it to the largest + * fragment we saw */ + if (server->packet_fragmented) + packet_size = MIN(server->received_udp_fragment_max, packet_size); + + /* Let's not pick ridiculously large sizes, i.e. not more than 4K. No one appears to + * ever use such large sizes on the Internet IRL, hence let's not either. */ + packet_size = MIN(packet_size, 4096U); + } + + /* Strictly speaking we quite possibly can receive larger datagrams than the MTU (since the + * MTU is for egress, not for ingress), but more often than not the value is symmetric, and + * we want something that does the right thing in the majority of cases, and not just in the + * theoretical edge case. */ + } else + /* In non-large mode, let's advertise the size of the largest fragment we ever managed to accept. */ + packet_size = server->received_udp_fragment_max; + + /* Safety clamp, never advertise less than 512 or more than 65535 */ + packet_size = CLAMP(packet_size, + DNS_PACKET_UNICAST_SIZE_MAX, + DNS_PACKET_SIZE_MAX); + + log_debug("Announcing packet size %zu in egress EDNS(0) packet.", packet_size); return dns_packet_append_opt(packet, packet_size, edns_do, /* include_rfc6975 = */ true, NULL, 0, NULL); } @@ -700,6 +750,15 @@ void dns_server_warn_downgrade(DnsServer *server) { server->warned_downgrade = true; } +size_t dns_server_get_mtu(DnsServer *s) { + assert(s); + + if (s->link && s->link->mtu != 0) + return s->link->mtu; + + return manager_find_mtu(s->manager); +} + static void dns_server_hash_func(const DnsServer *s, struct siphash *state) { assert(s); @@ -923,7 +982,7 @@ void dns_server_reset_features(DnsServer *s) { s->verified_feature_level = _DNS_SERVER_FEATURE_LEVEL_INVALID; s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_BEST; - s->received_udp_packet_max = DNS_PACKET_UNICAST_SIZE_MAX; + s->received_udp_fragment_max = DNS_PACKET_UNICAST_SIZE_MAX; s->packet_bad_opt = false; s->packet_rrsig_missing = false; @@ -983,7 
+1042,7 @@ void dns_server_dump(DnsServer *s, FILE *f) { fputc('\n', f); fprintf(f, - "\tMaximum UDP packet size received: %zu\n" + "\tMaximum UDP fragment size received: %zu\n" "\tFailed UDP attempts: %u\n" "\tFailed TCP attempts: %u\n" "\tSeen truncated packet: %s\n" @@ -991,7 +1050,7 @@ void dns_server_dump(DnsServer *s, FILE *f) { "\tSeen RRSIG RR missing: %s\n" "\tSeen invalid packet: %s\n" "\tServer dropped DO flag: %s\n", - s->received_udp_packet_max, + s->received_udp_fragment_max, s->n_failed_udp, s->n_failed_tcp, yes_no(s->packet_truncated), diff --git a/src/resolve/resolved-dns-server.h b/src/resolve/resolved-dns-server.h index 689fd42db4..fe0eaee49c 100644 --- a/src/resolve/resolved-dns-server.h +++ b/src/resolve/resolved-dns-server.h @@ -75,7 +75,7 @@ struct DnsServer { DnsServerFeatureLevel verified_feature_level; DnsServerFeatureLevel possible_feature_level; - size_t received_udp_packet_max; + size_t received_udp_fragment_max; /* largest packet or fragment (without IP/UDP header) we saw so far */ unsigned n_failed_udp; unsigned n_failed_tcp; @@ -86,6 +86,7 @@ struct DnsServer { bool packet_rrsig_missing:1; /* Set when RRSIG was missing */ bool packet_invalid:1; /* Set when we failed to parse a reply */ bool packet_do_off:1; /* Set when the server didn't copy DNSSEC DO flag from request to response */ + bool packet_fragmented:1; /* Set when we ever saw a fragmented packet */ usec_t verified_usec; usec_t features_grace_period_usec; @@ -118,7 +119,7 @@ DnsServer* dns_server_unref(DnsServer *s); void dns_server_unlink(DnsServer *s); void dns_server_move_back_and_unmark(DnsServer *s); -void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t size); +void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t fragsize); void dns_server_packet_lost(DnsServer *s, int protocol, DnsServerFeatureLevel level); void dns_server_packet_truncated(DnsServer *s, DnsServerFeatureLevel level); 
void dns_server_packet_rrsig_missing(DnsServer *s, DnsServerFeatureLevel level); @@ -126,6 +127,7 @@ void dns_server_packet_bad_opt(DnsServer *s, DnsServerFeatureLevel level); void dns_server_packet_rcode_downgrade(DnsServer *s, DnsServerFeatureLevel level); void dns_server_packet_invalid(DnsServer *s, DnsServerFeatureLevel level); void dns_server_packet_do_off(DnsServer *s, DnsServerFeatureLevel level); +void dns_server_packet_udp_fragmented(DnsServer *s, size_t fragsize); DnsServerFeatureLevel dns_server_possible_feature_level(DnsServer *s); @@ -155,6 +157,8 @@ void manager_next_dns_server(Manager *m, DnsServer *if_current); DnssecMode dns_server_get_dnssec_mode(DnsServer *s); DnsOverTlsMode dns_server_get_dns_over_tls_mode(DnsServer *s); +size_t dns_server_get_mtu(DnsServer *s); + DEFINE_TRIVIAL_CLEANUP_FUNC(DnsServer*, dns_server_unref); extern const struct hash_ops dns_server_hash_ops; diff --git a/src/resolve/resolved-dns-stub.c b/src/resolve/resolved-dns-stub.c index 5311bc8874..a18998f1c8 100644 --- a/src/resolve/resolved-dns-stub.c +++ b/src/resolve/resolved-dns-stub.c @@ -1117,6 +1117,16 @@ static int manager_dns_stub_fd_extra(Manager *m, DnsStubListenerExtra *l, int ty if (r < 0) goto fail; + if (type == SOCK_DGRAM) { + r = socket_disable_pmtud(fd, l->family); + if (r < 0) + log_debug_errno(r, "Failed to disable UDP PMTUD, ignoring: %m"); + + r = socket_set_recvfragsize(fd, l->family, true); + if (r < 0) + log_debug_errno(r, "Failed to enable fragment size reception, ignoring: %m"); + } + if (bind(fd, &sa.sa, SOCKADDR_LEN(sa)) < 0) { r = -errno; goto fail; diff --git a/src/resolve/resolved-dns-transaction.c b/src/resolve/resolved-dns-transaction.c index 1f396239f9..260ce76b98 100644 --- a/src/resolve/resolved-dns-transaction.c +++ b/src/resolve/resolved-dns-transaction.c @@ -1031,6 +1031,7 @@ static int dns_transaction_fix_rcode(DnsTransaction *t) { } void dns_transaction_process_reply(DnsTransaction *t, DnsPacket *p, bool encrypted) { + bool 
retry_with_tcp = false; int r; assert(t); @@ -1193,9 +1194,29 @@ void dns_transaction_process_reply(DnsTransaction *t, DnsPacket *p, bool encrypt return; } + /* Response was truncated, let's try again with good old TCP */ log_debug("Reply truncated, retrying via TCP."); + retry_with_tcp = true; - /* Response was truncated, let's try again with good old TCP */ + } else if (t->scope->protocol == DNS_PROTOCOL_DNS && + DNS_PACKET_IS_FRAGMENTED(p)) { + + /* Report the fragment size, so that we downgrade from LARGE to regular EDNS0 if needed */ + if (t->server) + dns_server_packet_udp_fragmented(t->server, dns_packet_size_unfragmented(p)); + + if (t->current_feature_level > DNS_SERVER_FEATURE_LEVEL_UDP) { + /* Packet was fragmented. Let's retry with TCP to avoid fragmentation attack + * issues. (We don't do that on the lowest feature level however, since crappy DNS + * servers often do not implement TCP, hence falling back to TCP on fragmentation is + * counter-productive there.) */ + + log_debug("Reply fragmented, retrying via TCP."); + retry_with_tcp = true; + } + } + + if (retry_with_tcp) { r = dns_transaction_emit_tcp(t); if (r == -ESRCH) { /* No servers found? Damn! */ @@ -1296,8 +1317,10 @@ void dns_transaction_process_reply(DnsTransaction *t, DnsPacket *p, bool encrypt if (DNS_PACKET_DO(t->sent) && !DNS_PACKET_DO(t->received)) dns_server_packet_do_off(t->server, t->current_feature_level); - /* Report that we successfully received a packet */ - dns_server_packet_received(t->server, p->ipproto, t->current_feature_level, p->size); + /* Report that we successfully received a packet. We keep track of the largest packet + * size/fragment size we got. Which is useful for announcing the EDNS(0) packet size we can + * receive to our server. */ + dns_server_packet_received(t->server, p->ipproto, t->current_feature_level, dns_packet_size_unfragmented(p)); } /* See if we know things we didn't know before that indicate we better restart the lookup immediately. 
*/ @@ -1470,7 +1493,7 @@ static int dns_transaction_emit_udp(DnsTransaction *t) { } else dns_transaction_close_connection(t, true); - r = dns_scope_emit_udp(t->scope, t->dns_udp_fd, t->sent); + r = dns_scope_emit_udp(t->scope, t->dns_udp_fd, t->server ? t->server->family : AF_UNSPEC, t->sent); if (r < 0) return r; diff --git a/src/resolve/resolved-manager.c b/src/resolve/resolved-manager.c index 6cc3a5d56f..fc5f8c79d3 100644 --- a/src/resolve/resolved-manager.c +++ b/src/resolve/resolved-manager.c @@ -19,6 +19,7 @@ #include "idn-util.h" #include "io-util.h" #include "missing_network.h" +#include "missing_socket.h" #include "netlink-util.h" #include "ordered-set.h" #include "parse-util.h" @@ -881,6 +882,9 @@ int manager_recv(Manager *m, int fd, DnsProtocol protocol, DnsPacket **ret) { p->ttl = *(int *) CMSG_DATA(cmsg); break; + case IPV6_RECVFRAGSIZE: + p->fragsize = *(int *) CMSG_DATA(cmsg); + break; } } else if (cmsg->cmsg_level == IPPROTO_IP) { assert(p->family == AF_INET); @@ -900,6 +904,10 @@ int manager_recv(Manager *m, int fd, DnsProtocol protocol, DnsPacket **ret) { case IP_TTL: p->ttl = *(int *) CMSG_DATA(cmsg); break; + + case IP_RECVFRAGSIZE: + p->fragsize = *(int *) CMSG_DATA(cmsg); + break; } } } @@ -1658,3 +1666,63 @@ bool manager_server_is_stub(Manager *m, DnsServer *s) { return false; } + +int socket_disable_pmtud(int fd, int af) { + int r; + + assert(fd >= 0); + + if (af == AF_UNSPEC) { + r = socket_get_family(fd, &af); + if (r < 0) + return r; + } + + switch (af) { + + case AF_INET: { + /* Turn off path MTU discovery, let's rather fragment on the way than to open us up against + * PMTU forgery vulnerabilities. + * + * There appears to be no documentation about IP_PMTUDISC_OMIT, but it has the effect that + * the "Don't Fragment" bit in the IPv4 header is turned off, thus enforcing fragmentation if + * our datagram size exceeds the MTU of a router in the path, and turning off path MTU + * discovery. 
+ * + * This helps mitigate the PMTUD vulnerability described here: + * + * https://blog.apnic.net/2019/07/12/its-time-to-consider-avoiding-ip-fragmentation-in-the-dns/ + * + * Similar logic is in place in most DNS servers. + * + * There are multiple conflicting goals: we want to allow the largest datagrams possible (for + * efficiency reasons), but not have fragmentation (for security reasons), nor use PMTUD (for + * security reasons, too). Our strategy to deal with this is: use large packets, turn off + * PMTUD, but watch fragmentation taking place, and then size our packets to the max of the + * fragments seen — and if we need larger packets always go to TCP. + */ + + r = setsockopt_int(fd, IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_OMIT); + if (r < 0) + return r; + + return 0; + } + + case AF_INET6: { + /* On IPv6 fragmentation is done only by the sender — never by routers on the path. PMTUD is + * mandatory. If we want to turn off PMTUD, the only way is by sending with minimal MTU only, + * so that we apply maximum fragmentation locally already, and thus PMTUD doesn't happen + * because there's nothing that could be fragmented further anymore. 
*/ + + r = setsockopt_int(fd, IPPROTO_IPV6, IPV6_MTU, IPV6_MIN_MTU); + if (r < 0) + return r; + + return 0; + } + + default: + return -EAFNOSUPPORT; + } +} diff --git a/src/resolve/resolved-manager.h b/src/resolve/resolved-manager.h index 90f5586230..1371c41b92 100644 --- a/src/resolve/resolved-manager.h +++ b/src/resolve/resolved-manager.h @@ -204,3 +204,5 @@ void manager_cleanup_saved_user(Manager *m); bool manager_next_dnssd_names(Manager *m); bool manager_server_is_stub(Manager *m, DnsServer *s); + +int socket_disable_pmtud(int fd, int af); diff --git a/src/resolve/resolved-mdns.c b/src/resolve/resolved-mdns.c index 5b4d08cce8..cf310ea01e 100644 --- a/src/resolve/resolved-mdns.c +++ b/src/resolve/resolved-mdns.c @@ -237,7 +237,7 @@ static int mdns_scope_process_query(DnsScope *s, DnsPacket *p) { if (!ratelimit_below(&s->ratelimit)) return 0; - r = dns_scope_emit_udp(s, -1, reply); + r = dns_scope_emit_udp(s, -1, AF_UNSPEC, reply); if (r < 0) return log_debug_errno(r, "Failed to send reply packet: %m"); |