From 6836330b1e017b8611c57ff4bc094ae5036dc3de Mon Sep 17 00:00:00 2001
From: hselasky <hselasky@FreeBSD.org>
Date: Wed, 12 Dec 2018 12:22:40 +0000
Subject: [PATCH] MFC r341560: mlx5: Fix use-after-free in self-healing flow

When the mlx5 health mechanism detects a problem while the driver
is in the middle of init_one or remove_one, the driver needs to prevent
the health mechanism from scheduling future work; if future work
is scheduled, there is a problem with use-after-free: the system WQ
tries to run the work item (which has been freed) at the scheduled
future time.

Prevent this by disabling work item scheduling in the health mechanism
when the driver is in the middle of init_one() or remove_one().

Sponsored by:   Mellanox Technologies
---
 sys/dev/mlx5/driver.h                |  2 +-
 sys/dev/mlx5/mlx5_core/mlx5_health.c | 10 +++++++++-
 sys/dev/mlx5/mlx5_core/mlx5_main.c   | 10 ++++++++--
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/sys/dev/mlx5/driver.h b/sys/dev/mlx5/driver.h
index c00184363ea..a707079e743 100644
--- a/sys/dev/mlx5/driver.h
+++ b/sys/dev/mlx5/driver.h
@@ -897,7 +897,7 @@ void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
-void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
 void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_health.c b/sys/dev/mlx5/mlx5_core/mlx5_health.c
index b3a011adb98..5c372bee927 100644
--- a/sys/dev/mlx5/mlx5_core/mlx5_health.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_health.c
@@ -516,9 +516,17 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 		  round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL));
 }
 
-void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long flags;
+
+	if (disable_health) {
+		spin_lock_irqsave(&health->wq_lock, flags);
+		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+		spin_unlock_irqrestore(&health->wq_lock, flags);
+	}
 
 	del_timer_sync(&health->timer);
 }
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_main.c b/sys/dev/mlx5/mlx5_core/mlx5_main.c
index 3f91322ad00..02a0110c70e 100644
--- a/sys/dev/mlx5/mlx5_core/mlx5_main.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_main.c
@@ -1089,7 +1089,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		mlx5_cleanup_once(dev);
 
 err_stop_poll:
-	mlx5_stop_health_poll(dev);
+	mlx5_stop_health_poll(dev, boot);
 	if (mlx5_cmd_teardown_hca(dev)) {
 		device_printf((&dev->pdev->dev)->bsddev, "ERR: ""tear_down_hca failed, skip cleanup\n");
 		goto out_err;
@@ -1141,7 +1141,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_disable_msix(dev);
         if (cleanup)
                 mlx5_cleanup_once(dev);
-	mlx5_stop_health_poll(dev);
+	mlx5_stop_health_poll(dev, cleanup);
 	err = mlx5_cmd_teardown_hca(dev);
 	if (err) {
 		device_printf((&dev->pdev->dev)->bsddev, "ERR: ""tear_down_hca failed, skip cleanup\n");
@@ -1388,6 +1388,12 @@ static int mlx5_try_fast_unload(struct mlx5_core_dev *dev)
 		return -EAGAIN;
 	}
 
+	/* Panic tear down fw command will stop the PCI bus communication
+	 * with the HCA, so the health polll is no longer needed.
+	 */
+	mlx5_drain_health_wq(dev);
+	mlx5_stop_health_poll(dev, false);
+
 	err = mlx5_cmd_force_teardown_hca(dev);
 	if (err) {
 		mlx5_core_dbg(dev, "Firmware couldn't do fast unload error: %d\n", err);
-- 
2.45.0