From dd4a41d654cd7dd729a50a795e9ae42a2b2c1ab7 Mon Sep 17 00:00:00 2001 From: pjd Date: Thu, 4 Oct 2018 05:54:57 +0000 Subject: [PATCH] When the adist_free list is empty and we lose connection to the receiver we move all elements from the adist_send and adist_recv lists back onto the adist_free list, but we don't wake consumers waitings for the adist_free list to become non-empty. This can lead to the sender process stopping audit trail files distribution and waiting forever. Fix the problem by adding the missing wakeup. While here slow down spinning on CPU in case of a short race in sender_disconnect() and add an explaination when it can occur. PR: 201953 Reported by: peter Approved by: re (kib) --- contrib/openbsm/bin/auditdistd/auditdistd.h | 15 +++++++++++++++ contrib/openbsm/bin/auditdistd/sender.c | 15 ++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/contrib/openbsm/bin/auditdistd/auditdistd.h b/contrib/openbsm/bin/auditdistd/auditdistd.h index d0594f2ebcd..26dd51bdff8 100644 --- a/contrib/openbsm/bin/auditdistd/auditdistd.h +++ b/contrib/openbsm/bin/auditdistd/auditdistd.h @@ -248,6 +248,21 @@ struct adrep { if (_wakeup) \ cv_signal(list##_cond); \ } while (0) +#define QUEUE_CONCAT2(tolist, fromlist1, fromlist2) do { \ + bool _wakeup; \ + \ + mtx_lock(tolist##_lock); \ + _wakeup = TAILQ_EMPTY(tolist); \ + mtx_lock(fromlist1##_lock); \ + TAILQ_CONCAT((tolist), (fromlist1), adr_next); \ + mtx_unlock(fromlist1##_lock); \ + mtx_lock(fromlist2##_lock); \ + TAILQ_CONCAT((tolist), (fromlist2), adr_next); \ + mtx_unlock(fromlist2##_lock); \ + mtx_unlock(tolist##_lock); \ + if (_wakeup) \ + cv_signal(tolist##_cond); \ +} while (0) #define QUEUE_WAIT(list) do { \ mtx_lock(list##_lock); \ while (TAILQ_EMPTY(list)) \ diff --git a/contrib/openbsm/bin/auditdistd/sender.c b/contrib/openbsm/bin/auditdistd/sender.c index 4349b0f54cc..09420f2a52c 100644 --- a/contrib/openbsm/bin/auditdistd/sender.c +++ b/contrib/openbsm/bin/auditdistd/sender.c @@ -342,14 +342,7 @@ sender_disconnect(void) pjdlog_warning("Disconnected from %s.", adhost->adh_remoteaddr); /* Move all in-flight requests back onto free list. */ - mtx_lock(&adist_free_list_lock); - mtx_lock(&adist_send_list_lock); - TAILQ_CONCAT(&adist_free_list, &adist_send_list, adr_next); - mtx_unlock(&adist_send_list_lock); - mtx_lock(&adist_recv_list_lock); - TAILQ_CONCAT(&adist_free_list, &adist_recv_list, adr_next); - mtx_unlock(&adist_recv_list_lock); - mtx_unlock(&adist_free_list_lock); + QUEUE_CONCAT2(&adist_free_list, &adist_send_list, &adist_recv_list); } static void @@ -609,9 +602,13 @@ recv_thread(void *arg __unused) if (adhost->adh_remote == NULL) { /* * Connection is dead. - * XXX: We shouldn't be here. + * There is a short race in sender_disconnect() between + * setting adh_remote to NULL and removing entries from + * the recv list, which can result in us being here. + * To avoid just spinning, wait for 0.1s. */ rw_unlock(&adist_remote_lock); + usleep(100000); continue; } if (proto_recv(adhost->adh_remote, &adrep, -- 2.45.0