Commit 93d5efb4 authored by Thomas Haller's avatar Thomas Haller

dns: move ratelimiting and restart from NMDnsManager to NMDnsDnsmasq

Note that the only DNS plugin that actually emits the FAILED signal was
NMDnsDnsmasq. Let's not handle restart, retry and rate-limiting by
NMDnsManager but by NMDnsDnsmasq itself.

There are three goals here:

(1) we want that when dnsmasq (infrequently) crashes, that we always keep
  retrying. A random crash should be automatically resolved and
  eventually dnsmasq should be working again.
  Note that we anyway cannot fully detect whether something is wrong.
  OK, we detect crashes, but if dnsmasq just gets catatonic, it's just
  as broken. Point being: our ability to detect non-working dnsmasq is limited.

(2) when dnsmasq keeps crashing all the time, then rate limit the retry.
  Of course, at this point there is already something seriously wrong,
  but we shouldn't kill the system by respawning the process without rate
  limiting.

(3) previously, when NMDnsManager noticed that the pluging was broken
  (and rate-limiting kicked in), it would temporarily disable the plugin.
  Basically, that meant to write the real name servers to /etc/resolv.conf
  directly, instead of setting localhost. This partly conflicts with
  (1), because we want to retry and recover automatically. So what good
  is it to notice a problem, resort to plain /etc/resolv.conf for a
  short time, and then run into the issues again? If something is really
  broken, there is no way but to involve the user to investigate and
  fix the issue. Hence, we don't need to concern NMDnsManager with this either.
  The only thing that the manager notices is when the dnsmasq binary is not
  available. In that case, update() fails right away, and the manager falls back
  to configure the name servers in /etc/resolv.conf directly.

Also, change the backoff time from 5 minutes to 1 minute (twice the
burst interval). There is not particularly strong reason for either
choice, I think that if the ratelimit kicks in, then something is
already so wrong that it doesn't matter either way. Anyway, also 60
seconds is long enough to not kill the machine otherwise.
parent a780b048
Pipeline #66655 failed with stage
......@@ -31,6 +31,9 @@
#define DNSMASQ_DBUS_SERVICE "org.freedesktop.NetworkManager.dnsmasq"
#define DNSMASQ_DBUS_PATH "/uk/org/thekelleys/dnsmasq"
#define RATELIMIT_INTERVAL_MSEC 30000
#define RATELIMIT_BURST 5
#define _NMLOG_DOMAIN LOGD_DNS
/*****************************************************************************/
......@@ -647,11 +650,19 @@ typedef struct {
char *name_owner;
gint64 burst_start_at;
GPid process_pid;
guint name_owner_changed_id;
guint main_timeout_id;
guint burst_retry_timeout_id;
guint8 burst_count;
bool is_stopped:1;
} NMDnsDnsmasqPrivate;
struct _NMDnsDnsmasq {
......@@ -674,6 +685,10 @@ G_DEFINE_TYPE (NMDnsDnsmasq, nm_dns_dnsmasq, NM_TYPE_DNS_PLUGIN)
/*****************************************************************************/
static gboolean start_dnsmasq (NMDnsDnsmasq *self, gboolean force_start, GError **error);
/*****************************************************************************/
static void
add_dnsmasq_nameserver (NMDnsDnsmasq *self,
GVariantBuilder *servers,
......@@ -839,30 +854,27 @@ send_dnsmasq_update (NMDnsDnsmasq *self)
{
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
if (!priv->set_server_ex_args)
return;
if ( !priv->name_owner
|| !priv->set_server_ex_args)
return;
if (priv->name_owner) {
_LOGD ("trying to update dnsmasq nameservers");
nm_clear_g_cancellable (&priv->update_cancellable);
priv->update_cancellable = g_cancellable_new ();
g_dbus_connection_call (priv->dbus_connection,
priv->name_owner,
DNSMASQ_DBUS_PATH,
DNSMASQ_DBUS_SERVICE,
"SetServersEx",
priv->set_server_ex_args,
NULL,
G_DBUS_CALL_FLAGS_NO_AUTO_START,
20000,
priv->update_cancellable,
dnsmasq_update_done,
self);
g_clear_pointer (&priv->set_server_ex_args, g_variant_unref);
} else
_LOGD ("dnsmasq is starting... The nameserver update will be sent when dnsmasq appears");
_LOGD ("trying to update dnsmasq nameservers");
nm_clear_g_cancellable (&priv->update_cancellable);
priv->update_cancellable = g_cancellable_new ();
g_dbus_connection_call (priv->dbus_connection,
priv->name_owner,
DNSMASQ_DBUS_PATH,
DNSMASQ_DBUS_SERVICE,
"SetServersEx",
priv->set_server_ex_args,
NULL,
G_DBUS_CALL_FLAGS_NO_AUTO_START,
20000,
priv->update_cancellable,
dnsmasq_update_done,
self);
}
/*****************************************************************************/
......@@ -888,8 +900,11 @@ _main_cleanup (NMDnsDnsmasq *self, gboolean emit_failed)
* process in the background. */
nm_clear_g_cancellable (&priv->main_cancellable);
if (emit_failed)
_nm_dns_plugin_emit_failed (self, TRUE);
if ( !priv->is_stopped
&& priv->burst_retry_timeout_id == 0) {
start_dnsmasq (self, FALSE, NULL);
send_dnsmasq_update (self);
}
}
static void
......@@ -1001,10 +1016,24 @@ spawn_notify (GCancellable *cancellable,
}
static gboolean
start_dnsmasq (NMDnsDnsmasq *self, GError **error)
_burst_retry_timeout_cb (gpointer user_data)
{
NMDnsDnsmasq *self = user_data;
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
priv->burst_retry_timeout_id = 0;
start_dnsmasq (self, TRUE, NULL);
send_dnsmasq_update (self);
return G_SOURCE_REMOVE;
}
static gboolean
start_dnsmasq (NMDnsDnsmasq *self, gboolean force_start, GError **error)
{
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
const char *dm_binary;
gint64 now;
if (G_LIKELY (priv->main_cancellable)) {
/* The process is already running or about to be started. Nothing to do. */
......@@ -1030,6 +1059,31 @@ start_dnsmasq (NMDnsDnsmasq *self, GError **error)
}
}
now = nm_utils_get_monotonic_timestamp_ms ();
if ( force_start
|| priv->burst_start_at == 0
|| priv->burst_start_at + RATELIMIT_INTERVAL_MSEC <= now) {
priv->burst_start_at = now;
priv->burst_count = 1;
nm_clear_g_source (&priv->burst_retry_timeout_id);
_LOGT ("rate-limit: start burst interval of %d seconds %s",
RATELIMIT_INTERVAL_MSEC / 1000,
force_start ? " (force)" : "");
} else if (priv->burst_count < RATELIMIT_BURST) {
nm_assert (priv->burst_retry_timeout_id == 0);
priv->burst_count++;
_LOGT ("rate-limit: %u try within burst interval of %d seconds",
(guint) priv->burst_count,
RATELIMIT_INTERVAL_MSEC / 1000);
} else {
if (priv->burst_retry_timeout_id == 0) {
_LOGW ("dnsmasq dies and gets respawned too quickly. Back off. Something is very wrong");
priv->burst_retry_timeout_id = g_timeout_add_seconds ((2 * RATELIMIT_INTERVAL_MSEC) / 1000, _burst_retry_timeout_cb, self);
} else
_LOGT ("rate-limit: currently rate-limited from restart");
return TRUE;
}
priv->main_timeout_id = g_timeout_add (10000,
spawn_timeout_cb,
self);
......@@ -1053,7 +1107,7 @@ update (NMDnsPlugin *plugin,
NMDnsDnsmasq *self = NM_DNS_DNSMASQ (plugin);
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
if (!start_dnsmasq (self, error))
if (!start_dnsmasq (self, TRUE, error))
return FALSE;
nm_clear_pointer (&priv->set_server_ex_args, g_variant_unref);
......@@ -1072,6 +1126,11 @@ static void
stop (NMDnsPlugin *plugin)
{
NMDnsDnsmasq *self = NM_DNS_DNSMASQ (plugin);
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
priv->is_stopped = TRUE;
priv->burst_start_at = 0;
nm_clear_g_source (&priv->burst_retry_timeout_id);
/* Cancelling the cancellable will also terminate the
* process (in the background). */
......@@ -1097,6 +1156,10 @@ dispose (GObject *object)
NMDnsDnsmasq *self = NM_DNS_DNSMASQ (object);
NMDnsDnsmasqPrivate *priv = NM_DNS_DNSMASQ_GET_PRIVATE (self);
priv->is_stopped = TRUE;
nm_clear_g_source (&priv->burst_retry_timeout_id);
_main_cleanup (self, FALSE);
g_clear_pointer (&priv->set_server_ex_args, g_variant_unref);
......
......@@ -50,10 +50,6 @@
#define NETCONFIG_PATH "/sbin/netconfig"
#endif
#define PLUGIN_RATELIMIT_INTERVAL 30
#define PLUGIN_RATELIMIT_BURST 5
#define PLUGIN_RATELIMIT_DELAY 300
/*****************************************************************************/
typedef enum {
......@@ -1557,71 +1553,6 @@ update_dns (NMDnsManager *self,
return !update || result == SR_SUCCESS;
}
static void
plugin_failed (NMDnsManager *self, NMDnsPlugin *plugin)
{
GError *error = NULL;
/* Errors with non-caching plugins aren't fatal */
if (!nm_dns_plugin_is_caching (plugin))
return;
/* Disable caching until the next DNS update */
if (!update_dns (self, TRUE, &error)) {
_LOGW ("could not commit DNS changes: %s", error->message);
g_clear_error (&error);
}
}
static gboolean
plugin_child_quit_update_dns (gpointer user_data)
{
GError *error = NULL;
NMDnsManager *self = NM_DNS_MANAGER (user_data);
/* Let the plugin try to spawn the child again */
if (!update_dns (self, FALSE, &error)) {
_LOGW ("could not commit DNS changes: %s", error->message);
g_clear_error (&error);
}
return G_SOURCE_REMOVE;
}
static void
plugin_failed_cb (NMDnsPlugin *plugin, gboolean is_fatal, NMDnsManager *self)
{
NMDnsManagerPrivate *priv = NM_DNS_MANAGER_GET_PRIVATE (self);
if (is_fatal) {
gint64 ts = nm_utils_get_monotonic_timestamp_ms ();
_LOGW ("plugin %s died unexpectedly", nm_dns_plugin_get_name (plugin));
if ( !priv->plugin_ratelimit.ts
|| (ts - priv->plugin_ratelimit.ts) / 1000 > PLUGIN_RATELIMIT_INTERVAL) {
priv->plugin_ratelimit.ts = ts;
priv->plugin_ratelimit.num_restarts = 0;
} else {
priv->plugin_ratelimit.num_restarts++;
if (priv->plugin_ratelimit.num_restarts > PLUGIN_RATELIMIT_BURST) {
plugin_failed (self, plugin);
_LOGW ("plugin %s child respawning too fast, delaying update for %u seconds",
nm_dns_plugin_get_name (plugin), PLUGIN_RATELIMIT_DELAY);
priv->plugin_ratelimit.timer = g_timeout_add_seconds (PLUGIN_RATELIMIT_DELAY,
plugin_child_quit_update_dns,
self);
return;
}
}
plugin_child_quit_update_dns (self);
return;
}
plugin_failed (self, plugin);
}
/*****************************************************************************/
static void
......@@ -1856,7 +1787,6 @@ _clear_plugin (NMDnsManager *self)
nm_clear_g_source (&priv->plugin_ratelimit.timer);
if (priv->plugin) {
g_signal_handlers_disconnect_by_func (priv->plugin, plugin_failed_cb, self);
nm_dns_plugin_stop (priv->plugin);
g_clear_object (&priv->plugin);
return TRUE;
......@@ -2079,10 +2009,6 @@ again:
} else if (nm_clear_g_object (&priv->sd_resolve_plugin))
systemd_resolved_changed = TRUE;
if ( plugin_changed
&& priv->plugin)
g_signal_connect (priv->plugin, NM_DNS_PLUGIN_FAILED, G_CALLBACK (plugin_failed_cb), self);
g_object_freeze_notify (G_OBJECT (self));
if (!nm_streq0 (priv->mode, mode)) {
......
......@@ -17,13 +17,6 @@
/*****************************************************************************/
enum {
FAILED,
LAST_SIGNAL,
};
static guint signals[LAST_SIGNAL] = { 0 };
typedef struct _NMDnsPluginPrivate {
GPid pid;
guint watch_id;
......@@ -105,15 +98,6 @@ nm_dns_plugin_stop (NMDnsPlugin *self)
klass->stop (self);
}
void
_nm_dns_plugin_emit_failed (gpointer /* NMDnsPlugin * */ self,
gboolean is_fatal)
{
nm_assert (NM_IS_DNS_PLUGIN (self));
g_signal_emit (self, signals[FAILED], 0, (gboolean) (!!is_fatal));
}
/*****************************************************************************/
static void
......@@ -124,18 +108,4 @@ nm_dns_plugin_init (NMDnsPlugin *self)
static void
nm_dns_plugin_class_init (NMDnsPluginClass *plugin_class)
{
GObjectClass *object_class = G_OBJECT_CLASS (plugin_class);
/* Emitted by the plugin and consumed by NMDnsManager when
* some error happens with the nameserver subprocess. Causes NM to fall
* back to writing out a non-local-caching resolv.conf until the next
* DNS update.
*/
signals[FAILED] =
g_signal_new (NM_DNS_PLUGIN_FAILED,
G_OBJECT_CLASS_TYPE (object_class),
G_SIGNAL_RUN_FIRST,
0, NULL, NULL,
g_cclosure_marshal_VOID__BOOLEAN,
G_TYPE_NONE, 1, G_TYPE_BOOLEAN);
}
......@@ -15,8 +15,6 @@
#define NM_IS_DNS_PLUGIN_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), NM_TYPE_DNS_PLUGIN))
#define NM_DNS_PLUGIN_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), NM_TYPE_DNS_PLUGIN, NMDnsPluginClass))
#define NM_DNS_PLUGIN_FAILED "failed"
typedef struct {
GObject parent;
} NMDnsPlugin;
......@@ -61,9 +59,4 @@ gboolean nm_dns_plugin_update (NMDnsPlugin *self,
void nm_dns_plugin_stop (NMDnsPlugin *self);
/*****************************************************************************/
void _nm_dns_plugin_emit_failed (gpointer /* NMDnsPlugin * */ self,
gboolean is_fatal);
#endif /* __NM_DNS_PLUGIN_H__ */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment