summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2021-02-18 18:20:36 +0100
committerGitHub <noreply@github.com>2021-02-18 18:20:36 +0100
commit669963968c29d127f4b5a7ee6bed38b1cba527c2 (patch)
tree1a45d56a8593f07e51802bc7e5d929487ade3e10 /src
parent5f94d96c4782eeb272f7815bdfc34ec72add8b26 (diff)
parent980821f3f0b57c32fbda347f5f764e077388b81d (diff)
downloadsystemd-669963968c29d127f4b5a7ee6bed38b1cba527c2.tar.gz
Merge pull request #18664 from poettering/resolved-defrag
resolved: many UDP fragmentation tweaks
Diffstat (limited to 'src')
-rw-r--r--src/resolve/resolved-dns-packet.c14
-rw-r--r--src/resolve/resolved-dns-packet.h40
-rw-r--r--src/resolve/resolved-dns-scope.c92
-rw-r--r--src/resolve/resolved-dns-scope.h2
-rw-r--r--src/resolve/resolved-dns-server.c83
-rw-r--r--src/resolve/resolved-dns-server.h8
-rw-r--r--src/resolve/resolved-dns-stub.c10
-rw-r--r--src/resolve/resolved-dns-transaction.c31
-rw-r--r--src/resolve/resolved-manager.c68
-rw-r--r--src/resolve/resolved-manager.h2
-rw-r--r--src/resolve/resolved-mdns.c2
11 files changed, 300 insertions, 52 deletions
diff --git a/src/resolve/resolved-dns-packet.c b/src/resolve/resolved-dns-packet.c
index 6db82ba217..9d50336c3f 100644
--- a/src/resolve/resolved-dns-packet.c
+++ b/src/resolve/resolved-dns-packet.c
@@ -2626,6 +2626,20 @@ int dns_packet_has_nsid_request(DnsPacket *p) {
return has_nsid;
}
+size_t dns_packet_size_unfragmented(DnsPacket *p) {
+ assert(p);
+
+ if (p->fragsize == 0) /* No fragment size recorded, i.e. wasn't fragmented: the DNS packet size is the answer */
+ return p->size;
+
+ /* The fragment size (p->fragsize) covers the whole (fragmented) IP packet, while the regular packet
+ * size (p->size) only covers the DNS part. Thus, subtract the combined IP and UDP header size (as
+ * returned by udp_header_size() for the packet's address family) from the largest fragment size, in
+ * order to determine which size of DNS packet would have gone through without fragmenting. */
+
+ return LESS_BY(p->fragsize, udp_header_size(p->family));
+}
+
static const char* const dns_rcode_table[_DNS_RCODE_MAX_DEFINED] = {
[DNS_RCODE_SUCCESS] = "SUCCESS",
[DNS_RCODE_FORMERR] = "FORMERR",
diff --git a/src/resolve/resolved-dns-packet.h b/src/resolve/resolved-dns-packet.h
index ee069537c3..7b2abe3e76 100644
--- a/src/resolve/resolved-dns-packet.h
+++ b/src/resolve/resolved-dns-packet.h
@@ -2,6 +2,7 @@
#pragma once
#include <netinet/ip.h>
+#include <netinet/ip6.h>
#include <netinet/udp.h>
#include "hashmap.h"
@@ -32,14 +33,19 @@ struct DnsPacketHeader {
be16_t ancount;
be16_t nscount;
be16_t arcount;
-};
+} _packed_;
#define DNS_PACKET_HEADER_SIZE sizeof(DnsPacketHeader)
-#define UDP_PACKET_HEADER_SIZE (sizeof(struct iphdr) + sizeof(struct udphdr))
+#define UDP4_PACKET_HEADER_SIZE (sizeof(struct iphdr) + sizeof(struct udphdr))
+#define UDP6_PACKET_HEADER_SIZE (sizeof(struct ip6_hdr) + sizeof(struct udphdr))
+
+assert_cc(sizeof(struct ip6_hdr) == 40);
+assert_cc(sizeof(struct iphdr) == 20);
+assert_cc(sizeof(struct udphdr) == 8);
+assert_cc(sizeof(DnsPacketHeader) == 12);
-/* The various DNS protocols deviate in how large a packet can grow,
- * but the TCP transport has a 16bit size field, hence that appears to
- * be the absolute maximum. */
+/* The various DNS protocols deviate in how large a packet can grow, but the TCP transport has a 16bit size
+ * field, hence that appears to be the absolute maximum. */
#define DNS_PACKET_SIZE_MAX 0xFFFFu
/* The default size to use for allocation when we don't know how large
@@ -55,7 +61,7 @@ struct DnsPacketHeader {
struct DnsPacket {
unsigned n_ref;
DnsProtocol protocol;
- size_t size, allocated, rindex, max_size;
+ size_t size, allocated, rindex, max_size, fragsize;
void *_data; /* don't access directly, use DNS_PACKET_DATA()! */
Hashmap *names; /* For name compression */
size_t opt_start, opt_size;
@@ -146,6 +152,14 @@ static inline bool DNS_PACKET_VERSION_SUPPORTED(DnsPacket *p) {
return DNS_RESOURCE_RECORD_OPT_VERSION_SUPPORTED(p->opt);
}
+static inline bool DNS_PACKET_IS_FRAGMENTED(DnsPacket *p) {
+ assert(p);
+
+ /* For ingress packets: was this packet fragmented according to our knowledge, i.e. was a non-zero fragment size recorded for it? */
+
+ return p->fragsize != 0;
+}
+
/* LLMNR defines some bits differently */
#define DNS_PACKET_LLMNR_C(p) DNS_PACKET_AA(p)
#define DNS_PACKET_LLMNR_T(p) DNS_PACKET_RD(p)
@@ -307,3 +321,17 @@ static inline size_t dns_packet_size_max(DnsPacket *p) {
return p->max_size != 0 ? p->max_size : DNS_PACKET_SIZE_MAX;
}
+
+static inline size_t udp_header_size(int af) {
+ /* Returns the combined size of the IP header and the UDP header for the given address family. Any family other than AF_INET or AF_INET6 is treated as a programming error. */
+ switch (af) {
+ case AF_INET:
+ return UDP4_PACKET_HEADER_SIZE;
+ case AF_INET6:
+ return UDP6_PACKET_HEADER_SIZE;
+ default:
+ assert_not_reached("Unexpected address family");
+ }
+}
+
+size_t dns_packet_size_unfragmented(DnsPacket *p);
diff --git a/src/resolve/resolved-dns-scope.c b/src/resolve/resolved-dns-scope.c
index f1dff95a86..2bbb85c1bd 100644
--- a/src/resolve/resolved-dns-scope.c
+++ b/src/resolve/resolved-dns-scope.c
@@ -5,6 +5,7 @@
#include "af-list.h"
#include "alloc-util.h"
#include "dns-domain.h"
+#include "errno-util.h"
#include "fd-util.h"
#include "hostname-util.h"
#include "missing_network.h"
@@ -185,43 +186,73 @@ void dns_scope_packet_lost(DnsScope *s, usec_t usec) {
s->resend_timeout = MIN(s->resend_timeout * 2, MULTICAST_RESEND_TIMEOUT_MAX_USEC);
}
-static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) {
- union in_addr_union addr;
- int ifindex = 0, r;
- int family;
- uint32_t mtu;
+static int dns_scope_emit_one(DnsScope *s, int fd, int family, DnsPacket *p) {
+ int r;
assert(s);
assert(p);
assert(p->protocol == s->protocol);
- if (s->link) {
- mtu = s->link->mtu;
- ifindex = s->link->ifindex;
- } else
- mtu = manager_find_mtu(s->manager);
+ if (family == AF_UNSPEC) {
+ if (s->family == AF_UNSPEC)
+ return -EAFNOSUPPORT;
+
+ family = s->family;
+ }
switch (s->protocol) {
- case DNS_PROTOCOL_DNS:
+ case DNS_PROTOCOL_DNS: {
+ size_t mtu, udp_size, min_mtu, socket_mtu = 0;
+
assert(fd >= 0);
- if (DNS_PACKET_QDCOUNT(p) > 1)
+ if (DNS_PACKET_QDCOUNT(p) > 1) /* Classic DNS only allows one question per packet */
return -EOPNOTSUPP;
if (p->size > DNS_PACKET_UNICAST_SIZE_MAX)
return -EMSGSIZE;
- if (p->size + UDP_PACKET_HEADER_SIZE > mtu)
- return -EMSGSIZE;
+ /* Determine the local most accurate MTU */
+ if (s->link)
+ mtu = s->link->mtu;
+ else
+ mtu = manager_find_mtu(s->manager);
+
+ /* Acquire the socket's path MTU (PMTU) */
+ r = socket_get_mtu(fd, family, &socket_mtu);
+ if (r < 0 && !ERRNO_IS_DISCONNECT(r)) /* Will return ENOTCONN if no information is available yet */
+ return log_debug_errno(r, "Failed to read socket MTU: %m");
+
+ /* Determine the appropriate UDP header size */
+ udp_size = udp_header_size(family);
+ min_mtu = udp_size + DNS_PACKET_HEADER_SIZE;
+
+ log_debug("Emitting UDP, link MTU is %zu, socket MTU is %zu, minimal MTU is %zu",
+ mtu, socket_mtu, min_mtu);
+
+ /* Clamp by the kernel's idea of the (path) MTU */
+ if (socket_mtu != 0 && socket_mtu < mtu)
+ mtu = socket_mtu;
+
+ /* Put a lower limit, in case all MTU data we acquired was rubbish */
+ if (mtu < min_mtu)
+ mtu = min_mtu;
+
+ /* Now check our packet size against the MTU we determined */
+ if (udp_size + p->size > mtu)
+ return -EMSGSIZE; /* This means: try TCP instead */
r = manager_write(s->manager, fd, p);
if (r < 0)
return r;
break;
+ }
+
+ case DNS_PROTOCOL_LLMNR: {
+ union in_addr_union addr;
- case DNS_PROTOCOL_LLMNR:
assert(fd < 0);
if (DNS_PACKET_QDCOUNT(p) > 1)
@@ -230,8 +261,6 @@ static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) {
if (!ratelimit_below(&s->ratelimit))
return -EBUSY;
- family = s->family;
-
if (family == AF_INET) {
addr.in = LLMNR_MULTICAST_IPV4_ADDRESS;
fd = manager_llmnr_ipv4_udp_fd(s->manager);
@@ -243,20 +272,20 @@ static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) {
if (fd < 0)
return fd;
- r = manager_send(s->manager, fd, ifindex, family, &addr, LLMNR_PORT, NULL, p);
+ r = manager_send(s->manager, fd, s->link->ifindex, family, &addr, LLMNR_PORT, NULL, p);
if (r < 0)
return r;
break;
+ }
- case DNS_PROTOCOL_MDNS:
+ case DNS_PROTOCOL_MDNS: {
+ union in_addr_union addr;
assert(fd < 0);
if (!ratelimit_below(&s->ratelimit))
return -EBUSY;
- family = s->family;
-
if (family == AF_INET) {
addr.in = MDNS_MULTICAST_IPV4_ADDRESS;
fd = manager_mdns_ipv4_fd(s->manager);
@@ -268,11 +297,12 @@ static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) {
if (fd < 0)
return fd;
- r = manager_send(s->manager, fd, ifindex, family, &addr, MDNS_PORT, NULL, p);
+ r = manager_send(s->manager, fd, s->link->ifindex, family, &addr, MDNS_PORT, NULL, p);
if (r < 0)
return r;
break;
+ }
default:
return -EAFNOSUPPORT;
@@ -281,7 +311,7 @@ static int dns_scope_emit_one(DnsScope *s, int fd, DnsPacket *p) {
return 1;
}
-int dns_scope_emit_udp(DnsScope *s, int fd, DnsPacket *p) {
+int dns_scope_emit_udp(DnsScope *s, int fd, int af, DnsPacket *p) {
int r;
assert(s);
@@ -296,7 +326,7 @@ int dns_scope_emit_udp(DnsScope *s, int fd, DnsPacket *p) {
dns_packet_set_flags(p, true, true);
}
- r = dns_scope_emit_one(s, fd, p);
+ r = dns_scope_emit_one(s, fd, af, p);
if (r < 0)
return r;
@@ -410,6 +440,16 @@ static int dns_scope_socket(
r = socket_set_recvpktinfo(fd, sa.sa.sa_family, true);
if (r < 0)
return r;
+
+ /* Turn off path MTU discovery for security reasons */
+ r = socket_disable_pmtud(fd, sa.sa.sa_family);
+ if (r < 0)
+ log_debug_errno(r, "Failed to disable UDP PMTUD, ignoring: %m");
+
+ /* Learn about fragmentation taking place */
+ r = socket_set_recvfragsize(fd, sa.sa.sa_family, true);
+ if (r < 0)
+ log_debug_errno(r, "Failed to enable fragment size reception, ignoring: %m");
}
if (ret_socket_address)
@@ -1123,7 +1163,7 @@ static int on_conflict_dispatch(sd_event_source *es, usec_t usec, void *userdata
return 0;
}
- r = dns_scope_emit_udp(scope, -1, p);
+ r = dns_scope_emit_udp(scope, -1, AF_UNSPEC, p);
if (r < 0)
log_debug_errno(r, "Failed to send conflict packet: %m");
}
@@ -1420,7 +1460,7 @@ int dns_scope_announce(DnsScope *scope, bool goodbye) {
if (r < 0)
return log_debug_errno(r, "Failed to build reply packet: %m");
- r = dns_scope_emit_udp(scope, -1, p);
+ r = dns_scope_emit_udp(scope, -1, AF_UNSPEC, p);
if (r < 0)
return log_debug_errno(r, "Failed to send reply packet: %m");
diff --git a/src/resolve/resolved-dns-scope.h b/src/resolve/resolved-dns-scope.h
index 7e863d3f66..f63452330c 100644
--- a/src/resolve/resolved-dns-scope.h
+++ b/src/resolve/resolved-dns-scope.h
@@ -71,7 +71,7 @@ DnsScope* dns_scope_free(DnsScope *s);
void dns_scope_packet_received(DnsScope *s, usec_t rtt);
void dns_scope_packet_lost(DnsScope *s, usec_t usec);
-int dns_scope_emit_udp(DnsScope *s, int fd, DnsPacket *p);
+int dns_scope_emit_udp(DnsScope *s, int fd, int af, DnsPacket *p);
int dns_scope_socket_tcp(DnsScope *s, int family, const union in_addr_union *address, DnsServer *server, uint16_t port, union sockaddr_union *ret_socket_address);
int dns_scope_socket_udp(DnsScope *s, DnsServer *server);
diff --git a/src/resolve/resolved-dns-server.c b/src/resolve/resolved-dns-server.c
index b0eeb04137..e027cdf765 100644
--- a/src/resolve/resolved-dns-server.c
+++ b/src/resolve/resolved-dns-server.c
@@ -255,7 +255,7 @@ static void dns_server_reset_counters(DnsServer *s) {
* incomplete. */
}
-void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t size) {
+void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t fragsize) {
assert(s);
if (protocol == IPPROTO_UDP) {
@@ -289,10 +289,10 @@ void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLeve
dns_server_verified(s, level);
- /* Remember the size of the largest UDP packet we received from a server, we know that we can always
- * announce support for packets with at least this size. */
- if (protocol == IPPROTO_UDP && s->received_udp_packet_max < size)
- s->received_udp_packet_max = size;
+ /* Remember the size of the largest UDP packet fragment we received from a server, we know that we
+ * can always announce support for packets with at least this size. */
+ if (protocol == IPPROTO_UDP && s->received_udp_fragment_max < fragsize)
+ s->received_udp_fragment_max = fragsize;
}
void dns_server_packet_lost(DnsServer *s, int protocol, DnsServerFeatureLevel level) {
@@ -389,6 +389,19 @@ void dns_server_packet_do_off(DnsServer *s, DnsServerFeatureLevel level) {
s->packet_do_off = true;
}
+void dns_server_packet_udp_fragmented(DnsServer *s, size_t fragsize) {
+ assert(s);
+
+ /* Invoked whenever we got a fragmented UDP packet. Let's do two things: keep track of the largest
+ * fragment size we ever saw from the server, and flag the server as having sent us fragmented
+ * packets, so that both can be used to lower the advertised packet size in EDNS0 */
+
+ if (s->received_udp_fragment_max < fragsize)
+ s->received_udp_fragment_max = fragsize;
+
+ s->packet_fragmented = true;
+}
+
static bool dns_server_grace_period_expired(DnsServer *s) {
usec_t ts;
@@ -604,10 +617,47 @@ int dns_server_adjust_opt(DnsServer *server, DnsPacket *packet, DnsServerFeature
edns_do = level >= DNS_SERVER_FEATURE_LEVEL_DO;
- if (level == DNS_SERVER_FEATURE_LEVEL_LARGE)
- packet_size = DNS_PACKET_UNICAST_SIZE_LARGE_MAX;
- else
- packet_size = server->received_udp_packet_max;
+ if (level == DNS_SERVER_FEATURE_LEVEL_LARGE) {
+ size_t udp_size;
+
+ /* In large mode, advertise the local MTU, in order to avoid fragmentation (for security
+ * reasons) – except if we are talking to localhost (where the security considerations don't
+ * matter). If we see fragmentation, lower the reported size to the largest fragment, to
+ * avoid it. */
+
+ udp_size = udp_header_size(server->family);
+
+ if (in_addr_is_localhost(server->family, &server->address) > 0)
+ packet_size = 65536 - udp_size; /* force linux loopback MTU if localhost address */
+ else {
+ /* Use the MTU pointing to the server, subtract the IP/UDP header size */
+ packet_size = LESS_BY(dns_server_get_mtu(server), udp_size);
+
+ /* On the Internet we want to avoid fragmentation for security reasons. If we saw
+ * fragmented packets, the above was too large, let's clamp it to the largest
+ * fragment we saw */
+ if (server->packet_fragmented)
+ packet_size = MIN(server->received_udp_fragment_max, packet_size);
+
+ /* Let's not pick ridiculously large sizes, i.e. not more than 4K. No one appears to
+ * ever use such large sizes on the Internet IRL, hence let's not either. */
+ packet_size = MIN(packet_size, 4096U);
+ }
+
+ /* Strictly speaking we quite possibly can receive larger datagrams than the MTU (since the
+ * MTU is for egress, not for ingress), but more often than not the value is symmetric, and
+ * we want something that does the right thing in the majority of cases, and not just in the
+ * theoretical edge case. */
+ } else
+ /* In non-large mode, let's advertise the size of the largest fragment we ever managed to accept. */
+ packet_size = server->received_udp_fragment_max;
+
+ /* Safety clamp, never advertise less than 512 or more than 65535 */
+ packet_size = CLAMP(packet_size,
+ DNS_PACKET_UNICAST_SIZE_MAX,
+ DNS_PACKET_SIZE_MAX);
+
+ log_debug("Announcing packet size %zu in egress EDNS(0) packet.", packet_size);
return dns_packet_append_opt(packet, packet_size, edns_do, /* include_rfc6975 = */ true, NULL, 0, NULL);
}
@@ -700,6 +750,15 @@ void dns_server_warn_downgrade(DnsServer *server) {
server->warned_downgrade = true;
}
+size_t dns_server_get_mtu(DnsServer *s) {
+ assert(s);
+ /* Prefer the MTU of the server's link, if there is a link and its MTU is known (non-zero); otherwise fall back to whatever manager_find_mtu() reports. */
+ if (s->link && s->link->mtu != 0)
+ return s->link->mtu;
+
+ return manager_find_mtu(s->manager);
+}
+
static void dns_server_hash_func(const DnsServer *s, struct siphash *state) {
assert(s);
@@ -923,7 +982,7 @@ void dns_server_reset_features(DnsServer *s) {
s->verified_feature_level = _DNS_SERVER_FEATURE_LEVEL_INVALID;
s->possible_feature_level = DNS_SERVER_FEATURE_LEVEL_BEST;
- s->received_udp_packet_max = DNS_PACKET_UNICAST_SIZE_MAX;
+ s->received_udp_fragment_max = DNS_PACKET_UNICAST_SIZE_MAX;
s->packet_bad_opt = false;
s->packet_rrsig_missing = false;
@@ -983,7 +1042,7 @@ void dns_server_dump(DnsServer *s, FILE *f) {
fputc('\n', f);
fprintf(f,
- "\tMaximum UDP packet size received: %zu\n"
+ "\tMaximum UDP fragment size received: %zu\n"
"\tFailed UDP attempts: %u\n"
"\tFailed TCP attempts: %u\n"
"\tSeen truncated packet: %s\n"
@@ -991,7 +1050,7 @@ void dns_server_dump(DnsServer *s, FILE *f) {
"\tSeen RRSIG RR missing: %s\n"
"\tSeen invalid packet: %s\n"
"\tServer dropped DO flag: %s\n",
- s->received_udp_packet_max,
+ s->received_udp_fragment_max,
s->n_failed_udp,
s->n_failed_tcp,
yes_no(s->packet_truncated),
diff --git a/src/resolve/resolved-dns-server.h b/src/resolve/resolved-dns-server.h
index 689fd42db4..fe0eaee49c 100644
--- a/src/resolve/resolved-dns-server.h
+++ b/src/resolve/resolved-dns-server.h
@@ -75,7 +75,7 @@ struct DnsServer {
DnsServerFeatureLevel verified_feature_level;
DnsServerFeatureLevel possible_feature_level;
- size_t received_udp_packet_max;
+ size_t received_udp_fragment_max; /* largest packet or fragment (without IP/UDP header) we saw so far */
unsigned n_failed_udp;
unsigned n_failed_tcp;
@@ -86,6 +86,7 @@ struct DnsServer {
bool packet_rrsig_missing:1; /* Set when RRSIG was missing */
bool packet_invalid:1; /* Set when we failed to parse a reply */
bool packet_do_off:1; /* Set when the server didn't copy DNSSEC DO flag from request to response */
+ bool packet_fragmented:1; /* Set when we ever saw a fragmented packet */
usec_t verified_usec;
usec_t features_grace_period_usec;
@@ -118,7 +119,7 @@ DnsServer* dns_server_unref(DnsServer *s);
void dns_server_unlink(DnsServer *s);
void dns_server_move_back_and_unmark(DnsServer *s);
-void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t size);
+void dns_server_packet_received(DnsServer *s, int protocol, DnsServerFeatureLevel level, size_t fragsize);
void dns_server_packet_lost(DnsServer *s, int protocol, DnsServerFeatureLevel level);
void dns_server_packet_truncated(DnsServer *s, DnsServerFeatureLevel level);
void dns_server_packet_rrsig_missing(DnsServer *s, DnsServerFeatureLevel level);
@@ -126,6 +127,7 @@ void dns_server_packet_bad_opt(DnsServer *s, DnsServerFeatureLevel level);
void dns_server_packet_rcode_downgrade(DnsServer *s, DnsServerFeatureLevel level);
void dns_server_packet_invalid(DnsServer *s, DnsServerFeatureLevel level);
void dns_server_packet_do_off(DnsServer *s, DnsServerFeatureLevel level);
+void dns_server_packet_udp_fragmented(DnsServer *s, size_t fragsize);
DnsServerFeatureLevel dns_server_possible_feature_level(DnsServer *s);
@@ -155,6 +157,8 @@ void manager_next_dns_server(Manager *m, DnsServer *if_current);
DnssecMode dns_server_get_dnssec_mode(DnsServer *s);
DnsOverTlsMode dns_server_get_dns_over_tls_mode(DnsServer *s);
+size_t dns_server_get_mtu(DnsServer *s);
+
DEFINE_TRIVIAL_CLEANUP_FUNC(DnsServer*, dns_server_unref);
extern const struct hash_ops dns_server_hash_ops;
diff --git a/src/resolve/resolved-dns-stub.c b/src/resolve/resolved-dns-stub.c
index 5311bc8874..a18998f1c8 100644
--- a/src/resolve/resolved-dns-stub.c
+++ b/src/resolve/resolved-dns-stub.c
@@ -1117,6 +1117,16 @@ static int manager_dns_stub_fd_extra(Manager *m, DnsStubListenerExtra *l, int ty
if (r < 0)
goto fail;
+ if (type == SOCK_DGRAM) {
+ r = socket_disable_pmtud(fd, l->family);
+ if (r < 0)
+ log_debug_errno(r, "Failed to disable UDP PMTUD, ignoring: %m");
+
+ r = socket_set_recvfragsize(fd, l->family, true);
+ if (r < 0)
+ log_debug_errno(r, "Failed to enable fragment size reception, ignoring: %m");
+ }
+
if (bind(fd, &sa.sa, SOCKADDR_LEN(sa)) < 0) {
r = -errno;
goto fail;
diff --git a/src/resolve/resolved-dns-transaction.c b/src/resolve/resolved-dns-transaction.c
index 1f396239f9..260ce76b98 100644
--- a/src/resolve/resolved-dns-transaction.c
+++ b/src/resolve/resolved-dns-transaction.c
@@ -1031,6 +1031,7 @@ static int dns_transaction_fix_rcode(DnsTransaction *t) {
}
void dns_transaction_process_reply(DnsTransaction *t, DnsPacket *p, bool encrypted) {
+ bool retry_with_tcp = false;
int r;
assert(t);
@@ -1193,9 +1194,29 @@ void dns_transaction_process_reply(DnsTransaction *t, DnsPacket *p, bool encrypt
return;
}
+ /* Response was truncated, let's try again with good old TCP */
log_debug("Reply truncated, retrying via TCP.");
+ retry_with_tcp = true;
- /* Response was truncated, let's try again with good old TCP */
+ } else if (t->scope->protocol == DNS_PROTOCOL_DNS &&
+ DNS_PACKET_IS_FRAGMENTED(p)) {
+
+ /* Report the fragment size, so that we downgrade from LARGE to regular EDNS0 if needed */
+ if (t->server)
+ dns_server_packet_udp_fragmented(t->server, dns_packet_size_unfragmented(p));
+
+ if (t->current_feature_level > DNS_SERVER_FEATURE_LEVEL_UDP) {
+ /* Packet was fragmented. Let's retry with TCP to avoid fragmentation attack
+ * issues. (We don't do that on the lowest feature level however, since crappy DNS
+ * servers often do not implement TCP, hence falling back to TCP on fragmentation is
+ * counter-productive there.) */
+
+ log_debug("Reply fragmented, retrying via TCP.");
+ retry_with_tcp = true;
+ }
+ }
+
+ if (retry_with_tcp) {
r = dns_transaction_emit_tcp(t);
if (r == -ESRCH) {
/* No servers found? Damn! */
@@ -1296,8 +1317,10 @@ void dns_transaction_process_reply(DnsTransaction *t, DnsPacket *p, bool encrypt
if (DNS_PACKET_DO(t->sent) && !DNS_PACKET_DO(t->received))
dns_server_packet_do_off(t->server, t->current_feature_level);
- /* Report that we successfully received a packet */
- dns_server_packet_received(t->server, p->ipproto, t->current_feature_level, p->size);
+ /* Report that we successfully received a packet. We keep track of the largest packet
+ * size/fragment size we got. Which is useful for announcing the EDNS(0) packet size we can
+ * receive to our server. */
+ dns_server_packet_received(t->server, p->ipproto, t->current_feature_level, dns_packet_size_unfragmented(p));
}
/* See if we know things we didn't know before that indicate we better restart the lookup immediately. */
@@ -1470,7 +1493,7 @@ static int dns_transaction_emit_udp(DnsTransaction *t) {
} else
dns_transaction_close_connection(t, true);
- r = dns_scope_emit_udp(t->scope, t->dns_udp_fd, t->sent);
+ r = dns_scope_emit_udp(t->scope, t->dns_udp_fd, t->server ? t->server->family : AF_UNSPEC, t->sent);
if (r < 0)
return r;
diff --git a/src/resolve/resolved-manager.c b/src/resolve/resolved-manager.c
index 6cc3a5d56f..fc5f8c79d3 100644
--- a/src/resolve/resolved-manager.c
+++ b/src/resolve/resolved-manager.c
@@ -19,6 +19,7 @@
#include "idn-util.h"
#include "io-util.h"
#include "missing_network.h"
+#include "missing_socket.h"
#include "netlink-util.h"
#include "ordered-set.h"
#include "parse-util.h"
@@ -881,6 +882,9 @@ int manager_recv(Manager *m, int fd, DnsProtocol protocol, DnsPacket **ret) {
p->ttl = *(int *) CMSG_DATA(cmsg);
break;
+ case IPV6_RECVFRAGSIZE:
+ p->fragsize = *(int *) CMSG_DATA(cmsg);
+ break;
}
} else if (cmsg->cmsg_level == IPPROTO_IP) {
assert(p->family == AF_INET);
@@ -900,6 +904,10 @@ int manager_recv(Manager *m, int fd, DnsProtocol protocol, DnsPacket **ret) {
case IP_TTL:
p->ttl = *(int *) CMSG_DATA(cmsg);
break;
+
+ case IP_RECVFRAGSIZE:
+ p->fragsize = *(int *) CMSG_DATA(cmsg);
+ break;
}
}
}
@@ -1658,3 +1666,63 @@ bool manager_server_is_stub(Manager *m, DnsServer *s) {
return false;
}
+
+int socket_disable_pmtud(int fd, int af) {
+ int r;
+ /* Disables path MTU discovery on socket fd. Returns 0 on success, a negative error code if a helper call fails, and -EAFNOSUPPORT for address families other than AF_INET/AF_INET6. */
+ assert(fd >= 0);
+ /* If the caller passed AF_UNSPEC, determine the address family from the socket itself. */
+ if (af == AF_UNSPEC) {
+ r = socket_get_family(fd, &af);
+ if (r < 0)
+ return r;
+ }
+
+ switch (af) {
+
+ case AF_INET: {
+ /* Turn off path MTU discovery, let's rather fragment on the way than to open us up against
+ * PMTU forgery vulnerabilities.
+ *
+ * There appears to be no documentation about IP_PMTUDISC_OMIT, but it has the effect that
+ * the "Don't Fragment" bit in the IPv4 header is turned off, thus enforcing fragmentation if
+ * our datagram size exceeds the MTU of a router in the path, and turning off path MTU
+ * discovery.
+ *
+ * This helps mitigating the PMTUD vulnerability described here:
+ *
+ * https://blog.apnic.net/2019/07/12/its-time-to-consider-avoiding-ip-fragmentation-in-the-dns/
+ *
+ * Similar logic is in place in most DNS servers.
+ *
+ * There are multiple conflicting goals: we want to allow the largest datagrams possible (for
+ * efficiency reasons), but not have fragmentation (for security reasons), nor use PMTUD (for
+ * security reasons, too). Our strategy to deal with this is: use large packets, turn off
+ * PMTUD, but watch fragmentation taking place, and then size our packets to the max of the
+ * fragments seen — and if we need larger packets always go to TCP.
+ */
+
+ r = setsockopt_int(fd, IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_OMIT);
+ if (r < 0)
+ return r;
+
+ return 0;
+ }
+
+ case AF_INET6: {
+ /* On IPv6 fragmentation only is done by the sender — never by routers on the path. PMTUD is
+ * mandatory. If we want to turn off PMTUD, the only way is by sending with minimal MTU only,
+ * so that we apply maximum fragmentation locally already, and thus PMTUD doesn't happen
+ * because there's nothing that could be fragmented further anymore. */
+
+ r = setsockopt_int(fd, IPPROTO_IPV6, IPV6_MTU, IPV6_MIN_MTU);
+ if (r < 0)
+ return r;
+
+ return 0;
+ }
+
+ default:
+ return -EAFNOSUPPORT;
+ }
+}
diff --git a/src/resolve/resolved-manager.h b/src/resolve/resolved-manager.h
index 90f5586230..1371c41b92 100644
--- a/src/resolve/resolved-manager.h
+++ b/src/resolve/resolved-manager.h
@@ -204,3 +204,5 @@ void manager_cleanup_saved_user(Manager *m);
bool manager_next_dnssd_names(Manager *m);
bool manager_server_is_stub(Manager *m, DnsServer *s);
+
+int socket_disable_pmtud(int fd, int af);
diff --git a/src/resolve/resolved-mdns.c b/src/resolve/resolved-mdns.c
index 5b4d08cce8..cf310ea01e 100644
--- a/src/resolve/resolved-mdns.c
+++ b/src/resolve/resolved-mdns.c
@@ -237,7 +237,7 @@ static int mdns_scope_process_query(DnsScope *s, DnsPacket *p) {
if (!ratelimit_below(&s->ratelimit))
return 0;
- r = dns_scope_emit_udp(s, -1, reply);
+ r = dns_scope_emit_udp(s, -1, AF_UNSPEC, reply);
if (r < 0)
return log_debug_errno(r, "Failed to send reply packet: %m");