contrib/ofed/include/udma_barrier.h

   1 /*
   2  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
   3  *
   4  * This software is available to you under a choice of one of two
   5  * licenses.  You may choose to be licensed under the terms of the GNU
   6  * General Public License (GPL) Version 2, available from the file
   7  * COPYING in the main directory of this source tree, or the
   8  * OpenIB.org BSD license below:
   9  *
  10  *     Redistribution and use in source and binary forms, with or
  11  *     without modification, are permitted provided that the following
  12  *     conditions are met:
  13  *
  14  *      - Redistributions of source code must retain the above
  15  *        copyright notice, this list of conditions and the following
  16  *        disclaimer.
  17  *
  18  *      - Redistributions in binary form must reproduce the above
  19  *        copyright notice, this list of conditions and the following
  20  *        disclaimer in the documentation and/or other materials
  21  *        provided with the distribution.
  22  *
  23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30  * SOFTWARE.
  31  */
  32
  33 #ifndef __UTIL_UDMA_BARRIER_H
  34 #define __UTIL_UDMA_BARRIER_H
  35
  36 #include <pthread.h>
  37
  38 /* Barriers for DMA.
  39
  40    These barriers are explicitly only for use with user DMA operations. If you
  41    are looking for barriers to use with cache-coherent multi-threaded
  42    consistency then look in stdatomic.h. If you need both kinds of synchronicity
  43    for the same address then use an atomic operation followed by one
  44    of these barriers.
  45
  46    When reasoning about these barriers there are two objects:
  47      - CPU attached address space (the CPU memory could be a range of things:
  48        cached/uncached/non-temporal CPU DRAM, uncached MMIO space in another
  49        device, pMEM). Generally speaking the ordering is only relative
  50        to the local CPU's view of the system. Eg if the local CPU
  51        is not guaranteed to see a write from another CPU then it is also
  52        OK for the DMA device to also not see the write after the barrier.
  53      - A DMA initiator on a bus. For instance a PCI-E device issuing
  54        MemRd/MemWr TLPs.
  55
  56    The ordering guarantee is always stated between those two streams. Eg what
  57    happens if a MemRd TLP is sent in via PCI-E relative to a CPU WRITE to the
  58    same memory location.
  59
  60    The providers have a very regular and predictable use of these barriers,
  61    to make things very clear each narrow use is given a name and the proper
  62    name should be used in the provider as a form of documentation.
  63 */
  64
  65 /* Ensure that the device's view of memory matches the CPU's view of memory.
  66    This should be placed before any MMIO store that could trigger the device
  67    to begin doing DMA, such as a device doorbell ring.
  68
  69    eg
  70     *dma_buf = 1;
  71     udma_to_device_barrier();
  72     mmio_write(DO_DMA_REG, dma_buf);
  73    Must ensure that the device sees the '1'.
  74
  75    This is required to fence writes created by the libibverbs user. Those
  76    writes could be to any CPU mapped memory object with any cachability mode.
  77
  78    NOTE: x86 has historically used a weaker semantic for this barrier, and
  79    only fenced normal stores to normal memory. libibverbs users using other
  80    memory types or non-temporal stores are required to use SFENCE in their own
  81    code prior to calling verbs to start a DMA.
  82 */
  83 #if defined(__i386__)
  84 #define udma_to_device_barrier() asm volatile("" ::: "memory")
  85 #elif defined(__x86_64__)
  86 #define udma_to_device_barrier() asm volatile("" ::: "memory")
  87 #elif defined(__PPC64__)
  88 #define udma_to_device_barrier() asm volatile("sync" ::: "memory")
  89 #elif defined(__PPC__)
  90 #define udma_to_device_barrier() asm volatile("sync" ::: "memory")
  91 #elif defined(__ia64__)
  92 #define udma_to_device_barrier() asm volatile("mf" ::: "memory")
  93 #elif defined(__sparc_v9__)
  94 #define udma_to_device_barrier() asm volatile("membar #StoreStore" ::: "memory")
  95 #elif defined(__aarch64__)
  96 #define udma_to_device_barrier() asm volatile("dsb st" ::: "memory");
  97 #elif defined(__sparc__) || defined(__s390x__)
  98 #define udma_to_device_barrier() asm volatile("" ::: "memory")
  99 #elif defined(__mips__)
 100 #include <sys/types.h>
 101 #include <machine/atomic.h>
 102 #define udma_to_device_barrier() mips_sync()
 103 #elif defined(__arm__)
 104 #include <sys/types.h>
 105 #include <machine/atomic.h>
 106 #define udma_to_device_barrier() dmb()
 107 #elif defined(__riscv)
 108 #include <sys/types.h>
 109 #include <machine/atomic.h>
 110 #define udma_to_device_barrier() fence()
 111 #else
 112 #error No architecture specific memory barrier defines found!
 113 #endif
 114
 115 /* Ensure that all ordered stores from the device are observable from the
 116    CPU. This only makes sense after something that observes an ordered store
 117    from the device - eg by reading a MMIO register or seeing that CPU memory is
 118    updated.
 119
 120    This guarantees that all reads that follow the barrier see the ordered
 121    stores that preceded the observation.
 122
 123    For instance, this would be used after testing a valid bit in a memory
 124    that is a DMA target, to ensure that the following reads see the
 125    data written before the MemWr TLP that set the valid bit.
 126 */
 127 #if defined(__i386__)
 128 #define udma_from_device_barrier() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
 129 #elif defined(__x86_64__)
 130 #define udma_from_device_barrier() asm volatile("lfence" ::: "memory")
 131 #elif defined(__PPC64__)
 132 #define udma_from_device_barrier() asm volatile("lwsync" ::: "memory")
 133 #elif defined(__PPC__)
 134 #define udma_from_device_barrier() asm volatile("sync" ::: "memory")
 135 #elif defined(__ia64__)
 136 #define udma_from_device_barrier() asm volatile("mf" ::: "memory")
 137 #elif defined(__sparc_v9__)
 138 #define udma_from_device_barrier() asm volatile("membar #LoadLoad" ::: "memory")
 139 #elif defined(__aarch64__)
 140 #define udma_from_device_barrier() asm volatile("dsb ld" ::: "memory");
 141 #elif defined(__sparc__) || defined(__s390x__)
 142 #define udma_from_device_barrier() asm volatile("" ::: "memory")
 143 #elif defined(__mips__)
 144 #define udma_from_device_barrier() mips_sync()
 145 #elif defined(__arm__)
 146 #define udma_from_device_barrier() dmb()
 147 #elif defined(__riscv)
 148 #define udma_from_device_barrier() fence()
 149 #else
 150 #error No architecture specific memory barrier defines found!
 151 #endif
 152
 153 /* Order writes to CPU memory so that a DMA device cannot view writes after
 154    the barrier without also seeing all writes before the barrier. This does
 155    not guarantee any writes are visible to DMA.
 156
 157    This would be used in cases where a DMA buffer might have a valid bit and
 158    data, this barrier is placed after writing the data but before writing the
 159    valid bit to ensure the DMA device cannot observe a set valid bit with
 160    unwritten data.
 161
 162    Compared to udma_to_device_barrier() this barrier is not required to fence
 163    anything but normal stores to normal malloc memory. Usage should be:
 164
 165    write_wqe
 166       udma_to_device_barrier();    // Get user memory ready for DMA
 167       wqe->addr = ...;
 168       wqe->flags = ...;
 169       udma_ordering_write_barrier();  // Guarantee WQE written in order
 170       wqe->valid = 1;
 171 */
 172 #define udma_ordering_write_barrier() udma_to_device_barrier()
 173
 174 /* Promptly flush writes to MMIO Write Combining memory.
 175    This should be used after a write to WC memory. This is both a barrier
 176    and a hint to the CPU to flush any buffers to reduce latency to TLP
 177    generation.
 178
 179    This is not required to have any effect on CPU memory.
 180
 181    If done while holding a lock then the ordering of MMIO writes across CPUs
 182    must be guaranteed to follow the natural ordering implied by the lock.
 183
 184    This must also act as a barrier that prevents write combining, eg
 185      *wc_mem = 1;
 186      mmio_flush_writes();
 187      *wc_mem = 2;
 188    Must always produce two MemWr TLPs, '1' and '2'. Without the barrier
 189    the CPU is allowed to produce a single TLP '2'.
 190
 191    Note that there is no order guarantee for writes to WC memory without
 192    barriers.
 193
 194    This is intended to be used in conjunction with WC memory to generate large
 195    PCI-E MemWr TLPs from the CPU.
 196 */
 197 #if defined(__i386__)
 198 #define mmio_flush_writes() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
 199 #elif defined(__x86_64__)
 200 #define mmio_flush_writes() asm volatile("sfence" ::: "memory")
 201 #elif defined(__PPC64__)
 202 #define mmio_flush_writes() asm volatile("sync" ::: "memory")
 203 #elif defined(__PPC__)
 204 #define mmio_flush_writes() asm volatile("sync" ::: "memory")
 205 #elif defined(__ia64__)
 206 #define mmio_flush_writes() asm volatile("fwb" ::: "memory")
 207 #elif defined(__sparc_v9__)
 208 #define mmio_flush_writes() asm volatile("membar #StoreStore" ::: "memory")
 209 #elif defined(__aarch64__)
 210 #define mmio_flush_writes() asm volatile("dsb st" ::: "memory");
 211 #elif defined(__sparc__) || defined(__s390x__)
 212 #define mmio_flush_writes() asm volatile("" ::: "memory")
 213 #elif defined(__mips__)
 214 #define mmio_flush_writes() mips_sync()
 215 #elif defined(__arm__)
 216 #define mmio_flush_writes() dmb()
 217 #elif defined(__riscv)
 218 #define mmio_flush_writes() fence()
 219 #else
 220 #error No architecture specific memory barrier defines found!
 221 #endif
 222
 223 /* Prevent WC writes from being re-ordered relative to other MMIO
 224    writes. This should be used before a write to WC memory.
 225
 226    This must act as a barrier to prevent write re-ordering from different
 227    memory types:
 228      *mmio_mem = 1;
 229      mmio_wc_start();
 230      *wc_mem = 2;
 231    Must always produce a TLP '1' followed by '2'.
 232
 233    This barrier implies udma_to_device_barrier()
 234
 235    This is intended to be used in conjunction with WC memory to generate large
 236    PCI-E MemWr TLPs from the CPU.
 237 */
 238 #define mmio_wc_start() mmio_flush_writes()
 239
 240 /* Keep MMIO writes in order.
 241    Currently we lack writel macros that universally guarantee MMIO
 242    writes happen in order, like the kernel does. Even worse many
 243    providers haphazardly open code writes to MMIO memory omitting even
 244    volatile.
 245
 246    Until this can be fixed with a proper writel macro, this barrier
 247    is a stand in to indicate places where MMIO writes should be switched
 248    to some future writel.
 249 */
 250 #define mmio_ordered_writes_hack() mmio_flush_writes()
 251
 252 /* Write Combining Spinlock primitive
 253
 254    Any access to a multi-value WC region must ensure that multiple cpus do not
 255    write to the same values concurrently, these macros make that
 256    straightforward and efficient if the chosen exclusion is a spinlock.
 257
 258    The spinlock guarantees that the WC writes issued within the critical
 259    section are made visible as TLP to the device. The TLP must be seen by the
 260    device strictly in the order that the spinlocks are acquired, and combining
 261    WC writes between different sections is not permitted.
 262
 263    Use of these macros allows the fencing inside the spinlock to be combined
 264    with the fencing required for DMA.
 265  */
 266 static inline void mmio_wc_spinlock(pthread_spinlock_t *lock)
 267 {
 268         pthread_spin_lock(lock);
 269 #if !defined(__i386__) && !defined(__x86_64__)
 270         /* For x86 the serialization within the spin lock is enough to
 271          * strongly order WC and other memory types. */
 272         mmio_wc_start();
 273 #endif
 274 }
 275
 276 static inline void mmio_wc_spinunlock(pthread_spinlock_t *lock)
 277 {
 278         /* It is possible that on x86 the atomic in the lock is strong enough
 279          * to force-flush the WC buffers quickly, and this SFENCE can be
 280          * omitted too. */
 281         mmio_flush_writes();
 282         pthread_spin_unlock(lock);
 283 }
 284
 285 #endif