From 855498f6718967f982b6ea980db99b0f567da2ae Mon Sep 17 00:00:00 2001 From: kib Date: Sun, 30 Dec 2018 15:46:45 +0000 Subject: [PATCH] Fix linux_destroy_dev() behaviour when there are still files open from the destroying cdev. Currently linux_destroy_dev() waits for the reference count on the linux cdev to drain, and each open file hold the reference. Practically it means that linux_destroy_dev() is blocked until all userspace processes that have the cdev open, exit. FreeBSD devfs does not have such problem, because device refcount only prevents freeing of the cdev memory, and separate 'active methods' counter blocks destroy_dev() until all threads leave the cdevsw methods. After that, attempts to enter cdevsw methods are refused with an error. Implement somewhat similar mechanism for LinuxKPI cdevs. Demote cdev refcount to only mean a hold on the linux cdev memory. Add sirefs count to track both number of threads inside the cdev methods, and for single-bit indicator that cdev is being destroyed. In the later case, the call is redirected to the dummy cdev. Reviewed by: markj Discussed with: hselasky Tested by: zeising MFC after: 1 week Sponsored by: Mellanox Technologies Differential revision: https://reviews.freebsd.org/D18606 --- .../linuxkpi/common/include/linux/cdev.h | 8 +- sys/compat/linuxkpi/common/src/linux_compat.c | 267 ++++++++++++------ 2 files changed, 189 insertions(+), 86 deletions(-) diff --git a/sys/compat/linuxkpi/common/include/linux/cdev.h b/sys/compat/linuxkpi/common/include/linux/cdev.h index 1b5f66edd75..6e2e568a607 100644 --- a/sys/compat/linuxkpi/common/include/linux/cdev.h +++ b/sys/compat/linuxkpi/common/include/linux/cdev.h @@ -52,7 +52,8 @@ struct linux_cdev { struct cdev *cdev; dev_t dev; const struct file_operations *ops; - atomic_long_t refs; + u_int refs; + u_int siref; }; static inline void @@ -61,7 +62,7 @@ cdev_init(struct linux_cdev *cdev, const struct file_operations *ops) kobject_init(&cdev->kobj, &linux_cdev_static_ktype); cdev->ops = ops; - atomic_long_set(&cdev->refs, 0); + cdev->refs = 1; } static inline struct linux_cdev * @@ -70,8 +71,9 @@ cdev_alloc(void) struct linux_cdev *cdev; cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK); - if (cdev) + if (cdev != NULL) kobject_init(&cdev->kobj, &linux_cdev_ktype); + cdev->refs = 1; return (cdev); } diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index 297c6d8e01d..3dd9dec4851 100644 --- a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -100,6 +101,7 @@ MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat"); #undef cdev #define RB_ROOT(head) (head)->rbh_root +static void linux_cdev_deref(struct linux_cdev *ldev); static struct vm_area_struct *linux_cdev_handle_find(void *handle); struct kobject linux_class_root; @@ -691,6 +693,52 @@ zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, return (0); } +static struct file_operations dummy_ldev_ops = { + /* XXXKIB */ +}; + +static struct linux_cdev dummy_ldev = { + .ops = &dummy_ldev_ops, +}; + +#define LDEV_SI_DTR 0x0001 +#define LDEV_SI_REF 0x0002 + +static void +linux_get_fop(struct linux_file *filp, const struct file_operations **fop, + struct linux_cdev **dev) +{ + struct linux_cdev *ldev; + u_int siref; + + ldev = filp->f_cdev; + *fop = filp->f_op; + if (ldev != NULL) { + for (siref = ldev->siref;;) { + if ((siref & LDEV_SI_DTR) != 0) { + ldev = &dummy_ldev; + siref = ldev->siref; + *fop = ldev->ops; + MPASS((ldev->siref & LDEV_SI_DTR) == 0); + } else if (atomic_fcmpset_int(&ldev->siref, &siref, + siref + LDEV_SI_REF)) { + break; + } + } + } + *dev = ldev; +} + +static void +linux_drop_fop(struct linux_cdev *ldev) +{ + + if (ldev == NULL) + return; + MPASS((ldev->siref & ~LDEV_SI_DTR) != 0); + atomic_subtract_int(&ldev->siref, LDEV_SI_REF); +} + #define OPW(fp,td,code) ({ \ struct file *__fpop; \ __typeof(code) __retval; \ @@ -703,10 +751,12 @@ zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, }) static int -linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *file) +linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, + struct file *file) { struct linux_cdev *ldev; struct linux_file *filp; + const struct file_operations *fop; int error; ldev = dev->si_drv1; @@ -718,20 +768,17 @@ linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *f filp->f_flags = file->f_flag; filp->f_vnode = file->f_vnode; filp->_file = file; + refcount_acquire(&ldev->refs); filp->f_cdev = ldev; linux_set_current(td); + linux_get_fop(filp, &fop, &ldev); - /* get a reference on the Linux character device */ - if (atomic_long_add_unless(&ldev->refs, 1, -1L) == 0) { - kfree(filp); - return (EINVAL); - } - - if (filp->f_op->open) { - error = -filp->f_op->open(file->f_vnode, filp); - if (error) { - atomic_long_dec(&ldev->refs); + if (fop->open != NULL) { + error = -fop->open(file->f_vnode, filp); + if (error != 0) { + linux_drop_fop(ldev); + linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); } @@ -742,6 +789,7 @@ linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *f /* release the file from devfs */ finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); + linux_drop_fop(ldev); return (ENXIO); } @@ -877,7 +925,8 @@ linux_get_error(struct task_struct *task, int error) static int linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, - u_long cmd, caddr_t data, struct thread *td) + const struct file_operations *fop, u_long cmd, caddr_t data, + struct thread *td) { struct task_struct *task = current; unsigned size; @@ -902,20 +951,28 @@ linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, #if defined(__amd64__) if (td->td_proc->p_elf_machine == EM_386) { /* try the compat IOCTL handler first */ - if (filp->f_op->compat_ioctl != NULL) - error = -OPW(fp, td, filp->f_op->compat_ioctl(filp, cmd, (u_long)data)); - else + if (fop->compat_ioctl != NULL) { + error = -OPW(fp, td, fop->compat_ioctl(filp, + cmd, (u_long)data)); + } else { error = ENOTTY; + } /* fallback to the regular IOCTL handler, if any */ - if (error == ENOTTY && filp->f_op->unlocked_ioctl != NULL) - error = -OPW(fp, td, filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data)); + if (error == ENOTTY && fop->unlocked_ioctl != NULL) { + error = -OPW(fp, td, fop->unlocked_ioctl(filp, + cmd, (u_long)data)); + } } else #endif - if (filp->f_op->unlocked_ioctl != NULL) - error = -OPW(fp, td, filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data)); - else - error = ENOTTY; + { + if (fop->unlocked_ioctl != NULL) { + error = -OPW(fp, td, fop->unlocked_ioctl(filp, + cmd, (u_long)data)); + } else { + error = ENOTTY; + } + } if (size > 0) { task->bsd_ioctl_data = NULL; task->bsd_ioctl_len = 0; @@ -1084,30 +1141,36 @@ static struct filterops linux_dev_kqfiltops_write = { static void linux_file_kqfilter_poll(struct linux_file *filp, int kqflags) { + struct thread *td; + const struct file_operations *fop; + struct linux_cdev *ldev; int temp; - if (filp->f_kqflags & kqflags) { - struct thread *td = curthread; - - /* get the latest polling state */ - temp = OPW(filp->_file, td, filp->f_op->poll(filp, NULL)); - - spin_lock(&filp->f_kqlock); - /* clear kqflags */ - filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | - LINUX_KQ_FLAG_NEED_WRITE); - /* update kqflags */ - if (temp & (POLLIN | POLLOUT)) { - if (temp & POLLIN) - filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; - if (temp & POLLOUT) - filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; - - /* make sure the "knote" gets woken up */ - KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); - } - spin_unlock(&filp->f_kqlock); + if ((filp->f_kqflags & kqflags) == 0) + return; + + td = curthread; + + linux_get_fop(filp, &fop, &ldev); + /* get the latest polling state */ + temp = OPW(filp->_file, td, fop->poll(filp, NULL)); + linux_drop_fop(ldev); + + spin_lock(&filp->f_kqlock); + /* clear kqflags */ + filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | + LINUX_KQ_FLAG_NEED_WRITE); + /* update kqflags */ + if ((temp & (POLLIN | POLLOUT)) != 0) { + if ((temp & POLLIN) != 0) + filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; + if ((temp & POLLOUT) != 0) + filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; + + /* make sure the "knote" gets woken up */ + KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); } + spin_unlock(&filp->f_kqlock); } static int @@ -1156,9 +1219,9 @@ linux_file_kqfilter(struct file *file, struct knote *kn) } static int -linux_file_mmap_single(struct file *fp, vm_ooffset_t *offset, - vm_size_t size, struct vm_object **object, int nprot, - struct thread *td) +linux_file_mmap_single(struct file *fp, const struct file_operations *fop, + vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, + int nprot, struct thread *td) { struct task_struct *task; struct vm_area_struct *vmap; @@ -1170,7 +1233,7 @@ linux_file_mmap_single(struct file *fp, vm_ooffset_t *offset, filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; - if (filp->f_op->mmap == NULL) + if (fop->mmap == NULL) return (EOPNOTSUPP); linux_set_current(td); @@ -1200,7 +1263,7 @@ linux_file_mmap_single(struct file *fp, vm_ooffset_t *offset, if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) { error = linux_get_error(task, EINTR); } else { - error = -OPW(fp, td, filp->f_op->mmap(filp, vmap)); + error = -OPW(fp, td, fop->mmap(filp, vmap)); error = linux_get_error(task, error); up_write(&vmap->vm_mm->mmap_sem); } @@ -1319,6 +1382,8 @@ linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; ssize_t bytes; int error; @@ -1331,8 +1396,10 @@ linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); - if (filp->f_op->read) { - bytes = OPW(file, td, filp->f_op->read(filp, uio->uio_iov->iov_base, + linux_get_fop(filp, &fop, &ldev); + if (fop->read != NULL) { + bytes = OPW(file, td, fop->read(filp, + uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = @@ -1347,6 +1414,7 @@ linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ); + linux_drop_fop(ldev); return (error); } @@ -1356,10 +1424,11 @@ linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; ssize_t bytes; int error; - error = 0; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ @@ -1368,14 +1437,17 @@ linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred, if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); - if (filp->f_op->write) { - bytes = OPW(file, td, filp->f_op->write(filp, uio->uio_iov->iov_base, + linux_get_fop(filp, &fop, &ldev); + if (fop->write != NULL) { + bytes = OPW(file, td, fop->write(filp, + uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; + error = 0; } else { error = linux_get_error(current, -bytes); } @@ -1385,6 +1457,8 @@ linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred, /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE); + linux_drop_fop(ldev); + return (error); } @@ -1393,16 +1467,21 @@ linux_file_poll(struct file *file, int events, struct ucred *active_cred, struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; int revents; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; linux_set_current(td); - if (filp->f_op->poll != NULL) - revents = OPW(file, td, filp->f_op->poll(filp, LINUX_POLL_TABLE_NORMAL)) & events; - else + linux_get_fop(filp, &fop, &ldev); + if (fop->poll != NULL) { + revents = OPW(file, td, fop->poll(filp, + LINUX_POLL_TABLE_NORMAL)) & events; + } else { revents = 0; - + } + linux_drop_fop(ldev); return (revents); } @@ -1410,23 +1489,28 @@ static int linux_file_close(struct file *file, struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; int error; filp = (struct linux_file *)file->f_data; - KASSERT(file_count(filp) == 0, ("File refcount(%d) is not zero", file_count(filp))); + KASSERT(file_count(filp) == 0, + ("File refcount(%d) is not zero", file_count(filp))); + error = 0; filp->f_flags = file->f_flag; linux_set_current(td); linux_poll_wait_dequeue(filp); - error = -OPW(file, td, filp->f_op->release(filp->f_vnode, filp)); + linux_get_fop(filp, &fop, &ldev); + if (fop->release != NULL) + error = -OPW(file, td, fop->release(filp->f_vnode, filp)); funsetown(&filp->f_sigio); if (filp->f_vnode != NULL) vdrop(filp->f_vnode); - if (filp->f_cdev != NULL) { - /* put a reference on the Linux character device */ - atomic_long_dec(&filp->f_cdev->refs); - } + linux_drop_fop(ldev); + if (filp->f_cdev != NULL) + linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); @@ -1437,27 +1521,30 @@ linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; int error; + error = 0; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; - error = 0; + linux_get_fop(filp, &fop, &ldev); linux_set_current(td); switch (cmd) { case FIONBIO: break; case FIOASYNC: - if (filp->f_op->fasync == NULL) + if (fop->fasync == NULL) break; - error = -OPW(fp, td, filp->f_op->fasync(0, filp, fp->f_flag & FASYNC)); + error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); break; case FIOSETOWN: error = fsetown(*(int *)data, &filp->f_sigio); if (error == 0) { - if (filp->f_op->fasync == NULL) + if (fop->fasync == NULL) break; - error = -OPW(fp, td, filp->f_op->fasync(0, filp, + error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); } break; @@ -1465,16 +1552,17 @@ linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, *(int *)data = fgetown(&filp->f_sigio); break; default: - error = linux_file_ioctl_sub(fp, filp, cmd, data, td); + error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td); break; } + linux_drop_fop(ldev); return (error); } static int linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct file *fp, - vm_ooffset_t *foff, vm_object_t *objp) + vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp) { /* * Character devices do not provide private mappings @@ -1486,7 +1574,8 @@ linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot, if ((*flagsp & (MAP_PRIVATE | MAP_COPY)) != 0) return (EINVAL); - return (linux_file_mmap_single(fp, foff, objsize, objp, (int)prot, td)); + return (linux_file_mmap_single(fp, fop, foff, objsize, objp, + (int)prot, td)); } static int @@ -1495,6 +1584,8 @@ linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; struct mount *mp; struct vnode *vp; vm_object_t object; @@ -1541,15 +1632,18 @@ linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size } maxprot &= cap_maxprot; - error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp, &foff, - &object); + linux_get_fop(filp, &fop, &ldev); + error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp, + &foff, fop, &object); if (error != 0) - return (error); + goto out; error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); +out: + linux_drop_fop(ldev); return (error); } @@ -1970,6 +2064,14 @@ linux_completion_done(struct completion *c) return (isdone); } +static void +linux_cdev_deref(struct linux_cdev *ldev) +{ + + if (refcount_release(&ldev->refs)) + kfree(ldev); +} + static void linux_cdev_release(struct kobject *kobj) { @@ -1979,7 +2081,7 @@ linux_cdev_release(struct kobject *kobj) cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; linux_destroy_dev(cdev); - kfree(cdev); + linux_cdev_deref(cdev); kobject_put(parent); } @@ -1996,20 +2098,19 @@ linux_cdev_static_release(struct kobject *kobj) } void -linux_destroy_dev(struct linux_cdev *cdev) +linux_destroy_dev(struct linux_cdev *ldev) { - if (cdev->cdev == NULL) + if (ldev->cdev == NULL) return; - atomic_long_dec(&cdev->refs); - - /* wait for all open files to be closed */ - while (atomic_long_read(&cdev->refs) != -1L) - pause("ldevdrn", hz); + MPASS((ldev->siref & LDEV_SI_DTR) == 0); + atomic_set_int(&ldev->siref, LDEV_SI_DTR); + while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0) + pause("ldevdtr", hz / 4); - destroy_dev(cdev->cdev); - cdev->cdev = NULL; + destroy_dev(ldev->cdev); + ldev->cdev = NULL; } const struct kobj_type linux_cdev_ktype = { -- 2.45.0