/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $FreeBSD$
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>

#include <machine/limits.h>

#include "opt_vfs_aio.h"

#ifdef VFS_AIO

static long jobrefid;

#define JOBST_NULL              0x0
#define JOBST_JOBQPROC          0x1
#define JOBST_JOBQGLOBAL        0x2
#define JOBST_JOBRUNNING        0x3
#define JOBST_JOBFINISHED       0x4
#define JOBST_JOBQBUF           0x5
#define JOBST_JOBBFINISHED      0x6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC        32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC  256 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS           32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE           1024 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS        4
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO             16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT    (10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT   (30 * hz)
#endif

static int max_aio_procs = MAX_AIO_PROCS;
static int num_aio_procs = 0;
static int target_aio_procs = TARGET_AIO_PROCS;
static int max_queue_count = MAX_AIO_QUEUE;
static int num_queue_count = 0;
static int num_buf_aio = 0;
static int num_aio_resv_start = 0;
static int aiod_timeout;
static int aiod_lifetime;

static int max_aio_per_proc = MAX_AIO_PER_PROC;
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
static int max_buf_aio = MAX_BUF_AIO;

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
        CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
        CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
        CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
        CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
        CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
        CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
        CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
        CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
        CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
        CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
        CTLFLAG_RW, &aiod_timeout, 0, "");
/*
 * AIO process info
 */
#define AIOP_FREE       0x1                     /* proc on free queue */
#define AIOP_SCHED      0x2                     /* proc explicitly scheduled */

struct aioproclist {
        int aioprocflags;                       /* AIO proc flags */
        TAILQ_ENTRY(aioproclist) list;          /* List of processes */
        struct proc *aioproc;                   /* The AIO thread */
        TAILQ_HEAD (,aiocblist) jobtorun;       /* suggested job to run */
};

/*
 * Data structure for lio signal management
 */
struct aio_liojob {
        int     lioj_flags;
        int     lioj_buffer_count;
        int     lioj_buffer_finished_count;
        int     lioj_queue_count;
        int     lioj_queue_finished_count;
        struct  sigevent lioj_signal;   /* signal on all I/O done */
        TAILQ_ENTRY     (aio_liojob) lioj_list;
        struct  kaioinfo *lioj_ki;
};
#define LIOJ_SIGNAL             0x1     /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED      0x2     /* signal has been posted */
/*
 * per process aio data structure
 */
struct kaioinfo {
        int     kaio_flags;             /* per process kaio flags */
        int     kaio_maxactive_count;   /* maximum number of AIOs */
        int     kaio_active_count;      /* number of currently used AIOs */
        int     kaio_qallowed_count;    /* maximum size of AIO queue */
        int     kaio_queue_count;       /* size of AIO queue */
        int     kaio_ballowed_count;    /* maximum number of buffers */
        int     kaio_queue_finished_count; /* number of daemon jobs finished */
        int     kaio_buffer_count;      /* number of physio buffers */
        int     kaio_buffer_finished_count; /* count of I/O done */
        struct  proc *kaio_p;           /* process that uses this kaio block */
        TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
        TAILQ_HEAD (,aiocblist) kaio_jobqueue;  /* job queue for process */
        TAILQ_HEAD (,aiocblist) kaio_jobdone;   /* done queue for process */
        TAILQ_HEAD (,aiocblist) kaio_bufqueue;  /* buffer job queue for process */
        TAILQ_HEAD (,aiocblist) kaio_bufdone;   /* buffer done queue for process */
        TAILQ_HEAD (,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN    0x1     /* process is being run down */
#define KAIO_WAKEUP     0x2     /* wakeup process when there is a significant event */

static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;                 /* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;              /* Phys I/O job list */

static void     aio_init_aioinfo(struct proc *p);
static void     aio_onceonly(void *);
static int      aio_free_entry(struct aiocblist *aiocbe);
static void     aio_process(struct aiocblist *aiocbe);
static int      aio_newproc(void);
static int      aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void     aio_physwakeup(struct buf *bp);
static int      aio_fphysio(struct proc *p, struct aiocblist *aiocbe);
static int      aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void     aio_daemon(void *uproc);
static void     process_signal(void *aioj);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0;
static vm_zone_t aiolio_zone = 0;

/*
 * Startup initialization
 */
static void
aio_onceonly(void *na)
{
        TAILQ_INIT(&aio_freeproc);
        TAILQ_INIT(&aio_activeproc);
        TAILQ_INIT(&aio_jobs);
        TAILQ_INIT(&aio_bufjobs);
        kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
        aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
        aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
        aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
        aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct
            aio_liojob), 0, 0, 1);
        aiod_timeout = AIOD_TIMEOUT_DEFAULT;
        aiod_lifetime = AIOD_LIFETIME_DEFAULT;
        jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure.  The aioinfo limits are set
 * per-process for user limit (resource) management.
 */
static void
aio_init_aioinfo(struct proc *p)
{
        struct kaioinfo *ki;
        if (p->p_aioinfo == NULL) {
                ki = zalloc(kaio_zone);
                p->p_aioinfo = ki;
                ki->kaio_flags = 0;
                ki->kaio_maxactive_count = max_aio_per_proc;
                ki->kaio_active_count = 0;
                ki->kaio_qallowed_count = max_aio_queue_per_proc;
                ki->kaio_queue_count = 0;
                ki->kaio_ballowed_count = max_buf_aio;
                ki->kaio_buffer_count = 0;
                ki->kaio_buffer_finished_count = 0;
                ki->kaio_p = p;
                TAILQ_INIT(&ki->kaio_jobdone);
                TAILQ_INIT(&ki->kaio_jobqueue);
                TAILQ_INIT(&ki->kaio_bufdone);
                TAILQ_INIT(&ki->kaio_bufqueue);
                TAILQ_INIT(&ki->kaio_liojoblist);
                TAILQ_INIT(&ki->kaio_sockqueue);
        }

        while (num_aio_procs < target_aio_procs)
                aio_newproc();
}

/*
 * Free a job entry.  Wait for completion if it is currently active, but don't
 * delay forever.  If we delay, we return a flag that says that we have to
 * restart the queue scan.
 */
static int
aio_free_entry(struct aiocblist *aiocbe)
{
        struct kaioinfo *ki;
        struct aioproclist *aiop;
        struct aio_liojob *lj;
        struct proc *p;
        int error;
        int s;

        if (aiocbe->jobstate == JOBST_NULL)
                panic("aio_free_entry: freeing already free job");

        p = aiocbe->userproc;
        ki = p->p_aioinfo;
        lj = aiocbe->lio;
        if (ki == NULL)
                panic("aio_free_entry: missing p->p_aioinfo");

        while (aiocbe->jobstate == JOBST_JOBRUNNING) {
                if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
                        return 0;
                aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
                tsleep(aiocbe, PRIBIO, "jobwai", 0);
        }
        aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

        if (aiocbe->bp == NULL) {
                if (ki->kaio_queue_count <= 0)
                        panic("aio_free_entry: process queue size <= 0");
                if (num_queue_count <= 0)
                        panic("aio_free_entry: system wide queue size <= 0");

                if (lj) {
                        lj->lioj_queue_count--;
                        if (aiocbe->jobflags & AIOCBLIST_DONE)
                                lj->lioj_queue_finished_count--;
                }
                ki->kaio_queue_count--;
                if (aiocbe->jobflags & AIOCBLIST_DONE)
                        ki->kaio_queue_finished_count--;
                num_queue_count--;
        } else {
                if (lj) {
                        lj->lioj_buffer_count--;
                        if (aiocbe->jobflags & AIOCBLIST_DONE)
                                lj->lioj_buffer_finished_count--;
                }
                if (aiocbe->jobflags & AIOCBLIST_DONE)
                        ki->kaio_buffer_finished_count--;
                ki->kaio_buffer_count--;
                num_buf_aio--;
        }

        /* aiocbe is going away, we need to destroy any knotes */
        knote_remove(p, &aiocbe->klist);

        if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
            && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
                ki->kaio_flags &= ~KAIO_WAKEUP;
                wakeup(p);
        }

        if (aiocbe->jobstate == JOBST_JOBQBUF) {
                if ((error = aio_fphysio(p, aiocbe)) != 0)
                        return error;
                if (aiocbe->jobstate != JOBST_JOBBFINISHED)
                        panic("aio_free_entry: invalid physio finish-up state");
                s = splbio();
                TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
                splx(s);
        } else if (aiocbe->jobstate == JOBST_JOBQPROC) {
                aiop = aiocbe->jobaioproc;
                TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
        } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
                TAILQ_REMOVE(&aio_jobs, aiocbe, list);
                TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
        } else if (aiocbe->jobstate == JOBST_JOBFINISHED)
                TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
        else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
                s = splbio();
                TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
                splx(s);
                if (aiocbe->bp) {
                        vunmapbuf(aiocbe->bp);
                        relpbuf(aiocbe->bp, NULL);
                        aiocbe->bp = NULL;
                }
        }
        if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
                TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
                zfree(aiolio_zone, lj);
        }
        aiocbe->jobstate = JOBST_NULL;
        untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
        zfree(aiocb_zone, aiocbe);
        return 0;
}
#endif /* VFS_AIO */

/*
 * Run down the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p)
{
#ifndef VFS_AIO
        return;
#else
        int s;
        struct kaioinfo *ki;
        struct aio_liojob *lj, *ljn;
        struct aiocblist *aiocbe, *aiocbn;
        struct file *fp;
        struct filedesc *fdp;
        struct socket *so;

        ki = p->p_aioinfo;
        if (ki == NULL)
                return;

        ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
        while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
            ki->kaio_buffer_finished_count)) {
                ki->kaio_flags |= KAIO_RUNDOWN;
                if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
                        break;
        }

        /*
         * Move any aio ops that are waiting on socket I/O to the normal job
         * queues so they are cleaned up with any others.
         */
        fdp = p->p_fd;

        s = splnet();
        for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
            aiocbn) {
                aiocbn = TAILQ_NEXT(aiocbe, plist);
                fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];

                /*
                 * Under some circumstances, the aio_fildes and the file
                 * structure don't match.  This would leave aiocbe's in the
                 * TAILQ associated with the socket and cause a panic later.
                 *
                 * Detect and fix.
                 */
                if ((fp == NULL) || (fp != aiocbe->fd_file))
                        fp = aiocbe->fd_file;
                if (fp) {
                        so = (struct socket *)fp->f_data;
                        TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
                        if (TAILQ_EMPTY(&so->so_aiojobq)) {
                                so->so_snd.sb_flags &= ~SB_AIO;
                                so->so_rcv.sb_flags &= ~SB_AIO;
                        }
                }
                TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
                TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
                TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
        }
        splx(s);

restart1:
        for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
                aiocbn = TAILQ_NEXT(aiocbe, plist);
                if (aio_free_entry(aiocbe))
                        goto restart1;
        }

restart2:
        for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
            aiocbn) {
                aiocbn = TAILQ_NEXT(aiocbe, plist);
                if (aio_free_entry(aiocbe))
                        goto restart2;
        }

/*
 * Note the use of lots of splbio here, trying to avoid splbio for long chains
 * of I/O.  Probably unnecessary.
 */
restart3:
        s = splbio();
        while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
                ki->kaio_flags |= KAIO_WAKEUP;
                tsleep(p, PRIBIO, "aioprn", 0);
                splx(s);
                goto restart3;
        }
        splx(s);

restart4:
        s = splbio();
        for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
                aiocbn = TAILQ_NEXT(aiocbe, plist);
                if (aio_free_entry(aiocbe)) {
                        splx(s);
                        goto restart4;
                }
        }
        splx(s);

        /*
         * If we've slept, jobs might have moved from one queue to another.
         * Retry rundown if we didn't manage to empty the queues.
         */
        if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
            TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
            TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
            TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
                goto restart1;

        for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
                ljn = TAILQ_NEXT(lj, lioj_list);
                if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
                    0)) {
                        TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
                        zfree(aiolio_zone, lj);
                } else {
#ifdef DIAGNOSTIC
                        printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
                            "QF:%d\n", lj->lioj_buffer_count,
                            lj->lioj_buffer_finished_count,
                            lj->lioj_queue_count,
                            lj->lioj_queue_finished_count);
#endif
                }
        }

        zfree(kaio_zone, ki);
        p->p_aioinfo = NULL;
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
/*
 * Select a job to run (called by an AIO daemon).
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{
        int s;
        struct aiocblist *aiocbe;
        struct kaioinfo *ki;
        struct proc *userp;

        aiocbe = TAILQ_FIRST(&aiop->jobtorun);
        if (aiocbe) {
                TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
                return aiocbe;
        }

        s = splnet();
        for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
            TAILQ_NEXT(aiocbe, list)) {
                userp = aiocbe->userproc;
                ki = userp->p_aioinfo;

                if (ki->kaio_active_count < ki->kaio_maxactive_count) {
                        TAILQ_REMOVE(&aio_jobs, aiocbe, list);
                        splx(s);
                        return aiocbe;
                }
        }
        splx(s);

        return NULL;
}

/*
 * The AIO processing activity.  This is the code that does the I/O request for
 * the non-physio version of the operations.  The normal vn operations are used,
 * and this code should work in all instances for every type of file, including
 * pipes, sockets, fifos, and regular files.
 */
static void
aio_process(struct aiocblist *aiocbe)
{
        struct filedesc *fdp;
        struct proc *userp, *mycp;
        struct aiocb *cb;
        struct file *fp;
        struct uio auio;
        struct iovec aiov;
        unsigned int fd;
        int cnt;
        int error;
        off_t offset;
        int oublock_st, oublock_end;
        int inblock_st, inblock_end;

        userp = aiocbe->userproc;
        cb = &aiocbe->uaiocb;

        mycp = curproc;

        fdp = mycp->p_fd;
        fd = cb->aio_fildes;
        fp = fdp->fd_ofiles[fd];

        if ((fp == NULL) || (fp != aiocbe->fd_file)) {
                cb->_aiocb_private.error = EBADF;
                cb->_aiocb_private.status = -1;
                return;
        }

        aiov.iov_base = (void *)cb->aio_buf;
        aiov.iov_len = cb->aio_nbytes;

        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = offset = cb->aio_offset;
        auio.uio_resid = cb->aio_nbytes;
        cnt = cb->aio_nbytes;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_procp = mycp;

        inblock_st = mycp->p_stats->p_ru.ru_inblock;
        oublock_st = mycp->p_stats->p_ru.ru_oublock;
        /*
         * Temporarily bump the ref count while reading to avoid the
         * descriptor being ripped out from under us.
         */
        fhold(fp);
        if (cb->aio_lio_opcode == LIO_READ) {
                auio.uio_rw = UIO_READ;
                error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
        } else {
                auio.uio_rw = UIO_WRITE;
                error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
        }
        fdrop(fp, mycp);
        inblock_end = mycp->p_stats->p_ru.ru_inblock;
        oublock_end = mycp->p_stats->p_ru.ru_oublock;

        aiocbe->inputcharge = inblock_end - inblock_st;
        aiocbe->outputcharge = oublock_end - oublock_st;

        if ((error) && (auio.uio_resid != cnt)) {
                if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
                        error = 0;
                if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
                        PROC_LOCK(userp);
                        psignal(userp, SIGPIPE);
                        PROC_UNLOCK(userp);
                }
        }

        cnt -= auio.uio_resid;
        cb->_aiocb_private.error = error;
        cb->_aiocb_private.status = cnt;
}

/*
 * The AIO daemon.  Most of the actual work is done in aio_process(), but
 * the setup (and address space management) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
        int s;
        struct aio_liojob *lj;
        struct aiocb *cb;
        struct aiocblist *aiocbe;
        struct aioproclist *aiop;
        struct kaioinfo *ki;
        struct proc *curcp, *mycp, *userp;
        struct vmspace *myvm, *tmpvm;

        mtx_lock(&Giant);
        /*
         * Local copies of curproc (cp) and vmspace (myvm)
         */
        mycp = curproc;
        myvm = mycp->p_vmspace;

        if (mycp->p_textvp) {
                vrele(mycp->p_textvp);
                mycp->p_textvp = NULL;
        }

        /*
         * Allocate and ready the aio control info.  There is one aiop structure
         * per daemon.
         */
        aiop = zalloc(aiop_zone);
        aiop->aioproc = mycp;
        aiop->aioprocflags |= AIOP_FREE;
        TAILQ_INIT(&aiop->jobtorun);

        s = splnet();

        /*
         * Place thread (lightweight process) onto the AIO free thread list.
         */
        if (TAILQ_EMPTY(&aio_freeproc))
                wakeup(&aio_freeproc);
        TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

        splx(s);

        /*
         * Get rid of our current file descriptors.  AIODs don't need any
         * file descriptors, except as temporarily inherited from the client.
         * Credentials are also cloned, and made equivalent to "root".
         */
        fdfree(mycp);
        mycp->p_fd = NULL;
        mycp->p_ucred = crcopy(mycp->p_ucred);
        mycp->p_ucred->cr_uid = 0;
        uifree(mycp->p_ucred->cr_uidinfo);
        mycp->p_ucred->cr_uidinfo = uifind(0);
        mycp->p_ucred->cr_ngroups = 1;
        mycp->p_ucred->cr_groups[0] = 1;

        /* The daemon resides in its own pgrp. */
        enterpgrp(mycp, mycp->p_pid, 1);

        /* Mark special process type. */
        mycp->p_flag |= P_SYSTEM;

        /*
         * Wake up the parent process.  (The parent sleeps to keep from
         * blasting away, creating too many daemons.)
         */
        wakeup(mycp);

        for (;;) {
                /*
                 * curcp is the current daemon process context.
                 * userp is the current user process context.
                 */
                curcp = mycp;

                /*
                 * Take daemon off of free queue
                 */
                if (aiop->aioprocflags & AIOP_FREE) {
                        s = splnet();
                        TAILQ_REMOVE(&aio_freeproc, aiop, list);
                        TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
                        aiop->aioprocflags &= ~AIOP_FREE;
                        splx(s);
                }
                aiop->aioprocflags &= ~AIOP_SCHED;

                /*
                 * Check for jobs.
                 */
                while ((aiocbe = aio_selectjob(aiop)) != NULL) {
                        cb = &aiocbe->uaiocb;
                        userp = aiocbe->userproc;

                        aiocbe->jobstate = JOBST_JOBRUNNING;

                        /*
                         * Connect to process address space for user program.
                         */
                        if (userp != curcp) {
                                /*
                                 * Save the current address space that we are
                                 * connected to.
                                 */
                                tmpvm = mycp->p_vmspace;

                                /*
                                 * Point to the new user address space, and
                                 * refer to it.
                                 */
                                mycp->p_vmspace = userp->p_vmspace;
                                mycp->p_vmspace->vm_refcnt++;

                                /* Activate the new mapping. */
                                pmap_activate(mycp);

                                /*
                                 * If the old address space wasn't the
                                 * daemon's own address space, then we need to
                                 * remove the daemon's reference from the
                                 * other process that it was acting on behalf
                                 * of.
                                 */
                                if (tmpvm != myvm) {
                                        vmspace_free(tmpvm);
                                }

                                /*
                                 * Disassociate from the previous client's
                                 * file descriptors, and associate to the new
                                 * client's descriptors.  Note that the daemon
                                 * doesn't need to worry about its original
                                 * descriptors, because they were originally
                                 * freed.
                                 */
                                if (mycp->p_fd)
                                        fdfree(mycp);
                                mycp->p_fd = fdshare(userp);
                                curcp = userp;
                        }

                        ki = userp->p_aioinfo;
                        lj = aiocbe->lio;

                        /* Account for currently active jobs. */
                        ki->kaio_active_count++;

                        /* Do the I/O function. */
                        aiocbe->jobaioproc = aiop;
                        aio_process(aiocbe);

                        /* Decrement the active job count. */
                        ki->kaio_active_count--;

                        /*
                         * Increment the completion count for wakeup/signal
                         * comparisons.
                         */
                        aiocbe->jobflags |= AIOCBLIST_DONE;
                        ki->kaio_queue_finished_count++;
                        if (lj)
                                lj->lioj_queue_finished_count++;
                        if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
                            & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
                                ki->kaio_flags &= ~KAIO_WAKEUP;
                                wakeup(userp);
                        }

                        s = splbio();
                        if (lj && (lj->lioj_flags &
                            (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
                                if ((lj->lioj_queue_finished_count ==
                                    lj->lioj_queue_count) &&
                                    (lj->lioj_buffer_finished_count ==
                                    lj->lioj_buffer_count)) {
                                        PROC_LOCK(userp);
                                        psignal(userp,
                                            lj->lioj_signal.sigev_signo);
                                        PROC_UNLOCK(userp);
                                        lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
                                }
                        }
                        splx(s);

                        aiocbe->jobstate = JOBST_JOBFINISHED;

                        /*
                         * If the I/O request should be automatically run
                         * down, do the needed cleanup.  Otherwise, place the
                         * queue entry for the just-finished I/O request into
                         * the done queue for the associated client.
                         */
                        s = splnet();
                        if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
                                aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
                                zfree(aiocb_zone, aiocbe);
                        } else {
                                TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
                                TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
                                    plist);
                        }
                        splx(s);
                        KNOTE(&aiocbe->klist, 0);

                        if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
                                wakeup(aiocbe);
                                aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
                        }

                        if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
                                PROC_LOCK(userp);
                                psignal(userp, cb->aio_sigevent.sigev_signo);
                                PROC_UNLOCK(userp);
                        }
                }

                /*
                 * Disconnect from user address space.
                 */
                if (curcp != mycp) {
                        /* Get the user address space to disconnect from. */
                        tmpvm = mycp->p_vmspace;

                        /* Get original address space for daemon. */
                        mycp->p_vmspace = myvm;

                        /* Activate the daemon's address space. */
                        pmap_activate(mycp);
#ifdef DIAGNOSTIC
                        if (tmpvm == myvm) {
                                printf("AIOD: vmspace problem -- %d\n",
                                    mycp->p_pid);
                        }
#endif
                        /* Remove our vmspace reference. */
                        vmspace_free(tmpvm);

                        /*
                         * Disassociate from the user process's file
                         * descriptors.
                         */
                        if (mycp->p_fd)
                                fdfree(mycp);
                        mycp->p_fd = NULL;
                        curcp = mycp;
                }

                /*
                 * If we are the first to be put onto the free queue, wakeup
                 * anyone waiting for a daemon.
                 */
                s = splnet();
                TAILQ_REMOVE(&aio_activeproc, aiop, list);
                if (TAILQ_EMPTY(&aio_freeproc))
                        wakeup(&aio_freeproc);
                TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
                aiop->aioprocflags |= AIOP_FREE;
                splx(s);

                /*
                 * If daemon is inactive for a long time, allow it to exit,
                 * thereby freeing resources.
                 */
                if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
                    PRIBIO, "aiordy", aiod_lifetime)) {
                        s = splnet();
                        if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
                            (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
                                if ((aiop->aioprocflags & AIOP_FREE) &&
                                    (num_aio_procs > target_aio_procs)) {
                                        TAILQ_REMOVE(&aio_freeproc, aiop, list);
                                        splx(s);
                                        zfree(aiop_zone, aiop);
                                        num_aio_procs--;
#ifdef DIAGNOSTIC
                                        if (mycp->p_vmspace->vm_refcnt <= 1) {
                                                printf("AIOD: bad vm refcnt for"
                                                    " exiting daemon: %d\n",
                                                    mycp->p_vmspace->vm_refcnt);
                                        }
#endif
                                        kthread_exit(0);
                                }
                        }
                        splx(s);
                }
        }
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
 * AIO daemon modifies its environment itself.
 */
static int
aio_newproc(void)
{
        int error;
        struct proc *p;

        error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d",
                               num_aio_procs);
        if (error)
                return error;

        /*
         * Wait until the daemon is started, but continue on anyway to handle
         * error conditions.
         */
        error = tsleep(p, PZERO, "aiosta", aiod_timeout);

        num_aio_procs++;

        return error;
}

/*
 * Try the high-performance, low-overhead physio method for eligible
 * VCHR devices.  This method doesn't use an aio helper thread, and
 * thus has very low overhead.
 *
 * Assumes that the caller, _aio_aqueue(), has incremented the file
 * structure's reference count, preventing its deallocation for the
 * duration of this call.
 */
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
        int error;
        struct aiocb *cb;
        struct file *fp;
        struct buf *bp;
        struct vnode *vp;
        struct kaioinfo *ki;
        struct filedesc *fdp;
        struct aio_liojob *lj;
        int fd;
        int s;
        int notify;

        cb = &aiocbe->uaiocb;
        fdp = p->p_fd;
        fd = cb->aio_fildes;
        fp = fdp->fd_ofiles[fd];

        if (fp->f_type != DTYPE_VNODE)
                return (-1);

        vp = (struct vnode *)fp->f_data;

        /*
         * If it's not a disk, we don't want to return a positive error.
         * A positive error causes the aio code not to fall through to the
         * thread method when you're talking to a regular file.
         */
        if (!vn_isdisk(vp, &error)) {
                if (error == ENOTBLK)
                        return (-1);
                else
                        return (error);
        }

        if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
                return (-1);

        if (cb->aio_nbytes >
            MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
                return (-1);

        ki = p->p_aioinfo;
        if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
                return (-1);

        ki->kaio_buffer_count++;

        lj = aiocbe->lio;
        if (lj)
                lj->lioj_buffer_count++;

        /* Create and build a buffer header for a transfer. */
        bp = (struct buf *)getpbuf(NULL);
        BUF_KERNPROC(bp);

        /*
         * Get a copy of the kva from the physical buffer.
         */
        bp->b_caller1 = p;
        bp->b_dev = vp->v_rdev;
        error = bp->b_error = 0;

        bp->b_bcount = cb->aio_nbytes;
        bp->b_bufsize = cb->aio_nbytes;
        bp->b_flags = B_PHYS;
        bp->b_iodone = aio_physwakeup;
        bp->b_saveaddr = bp->b_data;
        bp->b_data = (void *)cb->aio_buf;
        bp->b_blkno = btodb(cb->aio_offset);

        if (cb->aio_lio_opcode == LIO_WRITE) {
                bp->b_iocmd = BIO_WRITE;
                if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
                        error = EFAULT;
                        goto doerror;
                }
        } else {
                bp->b_iocmd = BIO_READ;
                if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
                        error = EFAULT;
                        goto doerror;
                }
        }

        /* Bring buffer into kernel space. */
        vmapbuf(bp);

        s = splbio();
        aiocbe->bp = bp;
        bp->b_spc = (void *)aiocbe;
        TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
        TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
        aiocbe->jobstate = JOBST_JOBQBUF;
        cb->_aiocb_private.status = cb->aio_nbytes;
        num_buf_aio++;
        bp->b_error = 0;

        splx(s);

        /* Perform transfer. */
        DEV_STRATEGY(bp, 0);

        notify = 0;
        s = splbio();

        /*
         * If we had an error invoking the request, or an error in processing
         * the request before we have returned, we process it as an error in
         * transfer.  Note that such an I/O error is not indicated immediately,
         * but is returned using the aio_error mechanism.  In this case,
         * aio_suspend will return immediately.
         */
        if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
                struct aiocb *job = aiocbe->uuaiocb;

                aiocbe->uaiocb._aiocb_private.status = 0;
                suword(&job->_aiocb_private.status, 0);
                aiocbe->uaiocb._aiocb_private.error = bp->b_error;
                suword(&job->_aiocb_private.error, bp->b_error);

                ki->kaio_buffer_finished_count++;

                if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
                        aiocbe->jobstate = JOBST_JOBBFINISHED;
                        aiocbe->jobflags |= AIOCBLIST_DONE;
                        TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
                        TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
                        TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
                        notify = 1;
                }
        }
        splx(s);
        if (notify)
                KNOTE(&aiocbe->klist, 0);
        return 0;

doerror:
        ki->kaio_buffer_count--;
        if (lj)
                lj->lioj_buffer_count--;
        aiocbe->bp = NULL;
        relpbuf(bp, NULL);
        return error;
}
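
/*
 * Illustrative only -- a hedged userland sketch (not part of this file) of
 * a request that can take the physio fast path above: the descriptor
 * refers to a raw disk device, the transfer size is a multiple of the
 * device's physical block size, and the request fits within a single
 * physical buffer (MAXPHYS).  The device path and block size are
 * assumptions for the example; error handling is elided.
 */
#if 0
#include <sys/types.h>
#include <aio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>

static struct aiocb rawcb;      /* must persist until completion */

static void
raw_read(void)
{
        memset(&rawcb, 0, sizeof(rawcb));
        rawcb.aio_fildes = open("/dev/da0", O_RDONLY);  /* hypothetical device */
        rawcb.aio_buf = malloc(65536);
        rawcb.aio_nbytes = 65536;       /* assumed multiple of si_bsize_phys */
        rawcb.aio_offset = 0;
        aio_read(&rawcb);               /* eligible for aio_qphysio() above */
}
#endif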

/*
 * This waits/tests physio completion.
 */
static int
aio_fphysio(struct proc *p, struct aiocblist *iocb)
{
        int s;
        struct buf *bp;
        int error;

        bp = iocb->bp;

        s = splbio();
        while ((bp->b_flags & B_DONE) == 0) {
                if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
                        if ((bp->b_flags & B_DONE) == 0) {
                                splx(s);
                                return EINPROGRESS;
                        } else
                                break;
                }
        }
        splx(s);

        /* Release mapping into kernel space. */
        vunmapbuf(bp);
        iocb->bp = 0;

        error = 0;

        /* Check for an error. */
        if (bp->b_ioflags & BIO_ERROR)
                error = bp->b_error;

        relpbuf(bp, NULL);
        return (error);
}
#endif /* VFS_AIO */

/*
 * Wake up aio requests that may be serviceable now.
 */
void
aio_swake(struct socket *so, struct sockbuf *sb)
{
#ifndef VFS_AIO
        return;
#else
        struct aiocblist *cb, *cbn;
        struct proc *p;
        struct kaioinfo *ki = NULL;
        int opcode, wakecount = 0;
        struct aioproclist *aiop;

        if (sb == &so->so_snd) {
                opcode = LIO_WRITE;
                so->so_snd.sb_flags &= ~SB_AIO;
        } else {
                opcode = LIO_READ;
                so->so_rcv.sb_flags &= ~SB_AIO;
        }

        for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
                cbn = TAILQ_NEXT(cb, list);
                if (opcode == cb->uaiocb.aio_lio_opcode) {
                        p = cb->userproc;
                        ki = p->p_aioinfo;
                        TAILQ_REMOVE(&so->so_aiojobq, cb, list);
                        TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
                        TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
                        TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
                        wakecount++;
                        if (cb->jobstate != JOBST_JOBQGLOBAL)
                                panic("invalid queue value");
                }
        }

        while (wakecount--) {
                if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
                        TAILQ_REMOVE(&aio_freeproc, aiop, list);
                        TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
                        aiop->aioprocflags &= ~AIOP_FREE;
                        wakeup(aiop->aioproc);
                }
        }
#endif /* VFS_AIO */
}

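/*
 * Illustrative only -- a hedged userland sketch (not part of this file) of
 * the socket case that aio_swake() above serves: an aio_read(2) on a
 * socket with no data pending is parked on so_aiojobq until aio_swake()
 * moves it to the daemon queues when the socket becomes readable.  Error
 * handling is elided.
 */
#if 0
#include <aio.h>
#include <string.h>

static struct aiocb sockcb;     /* must persist until completion */

static void
sock_aio_read(int sock, char *buf, size_t len)
{
        memset(&sockcb, 0, sizeof(sockcb));
        sockcb.aio_fildes = sock;
        sockcb.aio_buf = buf;
        sockcb.aio_nbytes = len;

        /* Queues on the socket if no data is ready; see aio_swake(). */
        aio_read(&sockcb);
}
#endif
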
#ifdef VFS_AIO
/*
 * Queue a new AIO request.  The choice between the threaded method and the
 * direct physio (VCHR) technique is made in this code.
 */
static int
_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
{
        struct filedesc *fdp;
        struct file *fp;
        unsigned int fd;
        struct socket *so;
        int s;
        int error;
        int opcode;
        struct aiocblist *aiocbe;
        struct aioproclist *aiop;
        struct kaioinfo *ki;
        struct kevent kev;
        struct kqueue *kq;
        struct file *kq_fp;

        aiocbe = zalloc(aiocb_zone);
        aiocbe->inputcharge = 0;
        aiocbe->outputcharge = 0;
        callout_handle_init(&aiocbe->timeouthandle);
        SLIST_INIT(&aiocbe->klist);

        suword(&job->_aiocb_private.status, -1);
        suword(&job->_aiocb_private.error, 0);
        suword(&job->_aiocb_private.kernelinfo, -1);

        error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
        if (error) {
                suword(&job->_aiocb_private.error, error);
                zfree(aiocb_zone, aiocbe);
                return error;
        }
        if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
            !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
                zfree(aiocb_zone, aiocbe);
                return EINVAL;
        }

        /* Save userspace address of the job info. */
        aiocbe->uuaiocb = job;

        /* Get the opcode. */
        if (type != LIO_NOP)
                aiocbe->uaiocb.aio_lio_opcode = type;
        opcode = aiocbe->uaiocb.aio_lio_opcode;

        /* Get the fd info for process. */
        fdp = p->p_fd;

        /*
         * Range check file descriptor.
         */
        fd = aiocbe->uaiocb.aio_fildes;
        if (fd >= fdp->fd_nfiles) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EBADF);
                return EBADF;
        }

        fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
        if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
            0))) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EBADF);
                return EBADF;
        }

        if (aiocbe->uaiocb.aio_offset == -1LL) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EINVAL);
                return EINVAL;
        }

        error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
        if (error) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, EINVAL);
                return error;
        }

        aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
        if (jobrefid == LONG_MAX)
                jobrefid = 1;
        else
                jobrefid++;

        if (opcode == LIO_NOP) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0) {
                        suword(&job->_aiocb_private.error, 0);
                        suword(&job->_aiocb_private.status, 0);
                        suword(&job->_aiocb_private.kernelinfo, 0);
                }
                return 0;
        }

        if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0) {
                        suword(&job->_aiocb_private.status, 0);
                        suword(&job->_aiocb_private.error, EINVAL);
                }
                return EINVAL;
        }

        fhold(fp);

        if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
                kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
                kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
        } else {
                /*
                 * This method for requesting kevent-based notification won't
                 * work on the alpha, since we're passing in a pointer
                 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
                 * based method instead.
                 */
                struct kevent *kevp;

                kevp = (struct kevent *)job->aio_lio_opcode;
                if (kevp == NULL)
                        goto no_kqueue;

                error = copyin(kevp, &kev, sizeof(kev));
                if (error)
                        goto aqueue_fail;
        }
        if ((u_int)kev.ident >= fdp->fd_nfiles ||
            (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
            (kq_fp->f_type != DTYPE_KQUEUE)) {
                error = EBADF;
                goto aqueue_fail;
        }
        kq = (struct kqueue *)kq_fp->f_data;
        kev.ident = (uintptr_t)aiocbe;
        kev.filter = EVFILT_AIO;
        kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
        error = kqueue_register(kq, &kev, p);
aqueue_fail:
        if (error) {
                zfree(aiocb_zone, aiocbe);
                if (type == 0)
                        suword(&job->_aiocb_private.error, error);
                goto done;
        }
no_kqueue:

        suword(&job->_aiocb_private.error, EINPROGRESS);
        aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
        aiocbe->userproc = p;
        aiocbe->jobflags = 0;
        aiocbe->lio = lj;
        ki = p->p_aioinfo;

        if (fp->f_type == DTYPE_SOCKET) {
                /*
                 * Alternate queueing for socket ops: Reach down into the
                 * descriptor to get the socket data.  Then check to see if
                 * the socket is ready to be read or written (based on the
                 * requested operation).
                 *
                 * If it is not ready for I/O, then queue the aiocbe on the
                 * socket, and set the flags so we get a call when sbnotify()
                 * happens.
                 */
                so = (struct socket *)fp->f_data;
                s = splnet();
                if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
                    LIO_WRITE) && (!sowriteable(so)))) {
                        TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
                        TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
                        if (opcode == LIO_READ)
                                so->so_rcv.sb_flags |= SB_AIO;
                        else
                                so->so_snd.sb_flags |= SB_AIO;
                        aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
                        ki->kaio_queue_count++;
                        num_queue_count++;
                        splx(s);
                        error = 0;
                        goto done;
                }
                splx(s);
        }

        if ((error = aio_qphysio(p, aiocbe)) == 0)
                goto done;
        if (error > 0) {
                suword(&job->_aiocb_private.status, 0);
                aiocbe->uaiocb._aiocb_private.error = error;
                suword(&job->_aiocb_private.error, error);
                goto done;
        }

        /* No buffer for daemon I/O. */
        aiocbe->bp = NULL;

        ki->kaio_queue_count++;
        if (lj)
                lj->lioj_queue_count++;
        s = splnet();
        TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
        TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
        splx(s);
        aiocbe->jobstate = JOBST_JOBQGLOBAL;

        num_queue_count++;
        error = 0;

        /*
         * If we don't have a free AIO process, and we are below our quota,
         * then start one.  Otherwise, depend on the subsequent I/O
         * completions to pick up this job.  If we don't successfully create
         * the new process (thread) due to resource issues, we return an
         * error for now (EAGAIN), which is likely not the correct thing
         * to do.
         */
1429 retryproc:
1430         s = splnet();
1431         if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1432                 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1433                 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1434                 aiop->aioprocflags &= ~AIOP_FREE;
1435                 wakeup(aiop->aioproc);
1436         } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1437             ((ki->kaio_active_count + num_aio_resv_start) <
1438             ki->kaio_maxactive_count)) {
1439                 num_aio_resv_start++;
1440                 if ((error = aio_newproc()) == 0) {
1441                         num_aio_resv_start--;
1442                         p->p_retval[0] = 0;
1443                         goto retryproc;
1444                 }
1445                 num_aio_resv_start--;
1446         }
1447         splx(s);
1448 done:
1449         fdrop(fp, p);
1450         return error;
1451 }
1452
1453 /*
1454  * This routine queues an AIO request, checking for quotas.
1455  */
1456 static int
1457 aio_aqueue(struct proc *p, struct aiocb *job, int type)
1458 {
1459         struct kaioinfo *ki;
1460
1461         if (p->p_aioinfo == NULL)
1462                 aio_init_aioinfo(p);
1463
1464         if (num_queue_count >= max_queue_count)
1465                 return EAGAIN;
1466
1467         ki = p->p_aioinfo;
1468         if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1469                 return EAGAIN;
1470
1471         return _aio_aqueue(p, job, NULL, type);
1472 }
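
/*
 * Usage sketch (illustrative only, not part of this file; assumes
 * userland <aio.h>, <err.h> and <errno.h>): the quota checks above
 * surface as EAGAIN from aio_read()/aio_write(), so a saturating
 * submitter can reap one completion and retry:
 *
 *	struct aiocb cb, *done;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	while (aio_read(&cb) == -1) {
 *		if (errno != EAGAIN)
 *			err(1, "aio_read");
 *		if (aio_waitcomplete(&done, NULL) == -1)
 *			err(1, "aio_waitcomplete");
 *	}
 */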
1473 #endif /* VFS_AIO */
1474
1475 /*
1476  * Support the aio_return system call; as a side effect, kernel resources
1477  * are released.
1478  */
1479 int
1480 aio_return(struct proc *p, struct aio_return_args *uap)
1481 {
1482 #ifndef VFS_AIO
1483         return ENOSYS;
1484 #else
1485         int s;
1486         int jobref;
1487         struct aiocblist *cb, *ncb;
1488         struct aiocb *ujob;
1489         struct kaioinfo *ki;
1490
1491         ki = p->p_aioinfo;
1492         if (ki == NULL)
1493                 return EINVAL;
1494
1495         ujob = uap->aiocbp;
1496
1497         jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1498         if (jobref == -1 || jobref == 0)
1499                 return EINVAL;
1500
1501         s = splnet();
1502         for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1503             plist)) {
1504                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1505                     jobref) {
1506                         splx(s);
1507                         if (ujob == cb->uuaiocb) {
1508                                 p->p_retval[0] =
1509                                     cb->uaiocb._aiocb_private.status;
1510                         } else
1511                                 p->p_retval[0] = EFAULT;
1512                         if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1513                                 curproc->p_stats->p_ru.ru_oublock +=
1514                                     cb->outputcharge;
1515                                 cb->outputcharge = 0;
1516                         } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1517                                 curproc->p_stats->p_ru.ru_inblock +=
1518                                     cb->inputcharge;
1519                                 cb->inputcharge = 0;
1520                         }
1521                         aio_free_entry(cb);
1522                         return 0;
1523                 }
1524         }
1525         splx(s);
1526         
1527         s = splbio();
1528         for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1529                 ncb = TAILQ_NEXT(cb, plist);
1530                 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1531                     == jobref) {
1532                         splx(s);
1533                         if (ujob == cb->uuaiocb) {
1534                                 p->p_retval[0] =
1535                                     cb->uaiocb._aiocb_private.status;
1536                         } else
1537                                 p->p_retval[0] = EFAULT;
1538                         aio_free_entry(cb);
1539                         return 0;
1540                 }
1541         }
1542         splx(s);
1543
1544         return (EINVAL);
1545 #endif /* VFS_AIO */
1546 }
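
/*
 * Usage sketch (illustrative only): because aio_free_entry() runs above,
 * aio_return() may be called at most once per request, and a request
 * that is never reaped holds its kernel bookkeeping until process
 * rundown.  The usual userland pattern:
 *
 *	const struct aiocb *list[1] = { &cb };
 *	int error;
 *	ssize_t n;
 *
 *	while ((error = aio_error(&cb)) == EINPROGRESS)
 *		(void)aio_suspend(list, 1, NULL);
 *	if (error != 0)
 *		errx(1, "request failed: %d", error);
 *	n = aio_return(&cb);
 */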
1547
1548 /*
1549  * Allow a process to wake up when any of its I/O requests are completed.
1550  */
1551 int
1552 aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1553 {
1554 #ifndef VFS_AIO
1555         return ENOSYS;
1556 #else
1557         struct timeval atv;
1558         struct timespec ts;
1559         struct aiocb *const *cbptr, *cbp;
1560         struct kaioinfo *ki;
1561         struct aiocblist *cb;
1562         int i;
1563         int njoblist;
1564         int error, s, timo;
1565         int *ijoblist;
1566         struct aiocb **ujoblist;
1567         
1568         if (uap->nent > AIO_LISTIO_MAX)
1569                 return EINVAL;
1570
1571         timo = 0;
1572         if (uap->timeout) {
1573                 /* Get timespec struct. */
1574                 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1575                         return error;
1576
1577                 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1578                         return (EINVAL);
1579
1580                 TIMESPEC_TO_TIMEVAL(&atv, &ts);
1581                 if (itimerfix(&atv))
1582                         return (EINVAL);
1583                 timo = tvtohz(&atv);
1584         }
1585
1586         ki = p->p_aioinfo;
1587         if (ki == NULL)
1588                 return EAGAIN;
1589
1590         njoblist = 0;
1591         ijoblist = zalloc(aiol_zone);
1592         ujoblist = zalloc(aiol_zone);
1593         cbptr = uap->aiocbp;
1594
1595         for (i = 0; i < uap->nent; i++) {
1596                 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1597                 if (cbp == NULL || (intptr_t)cbp == -1)
1598                         continue;
1599                 ujoblist[njoblist] = cbp;
1600                 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1601                 njoblist++;
1602         }
1603
1604         if (njoblist == 0) {
1605                 zfree(aiol_zone, ijoblist);
1606                 zfree(aiol_zone, ujoblist);
1607                 return 0;
1608         }
1609
1610         error = 0;
1611         for (;;) {
1612                 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb =
1613                     TAILQ_NEXT(cb, plist)) {
1614                         for (i = 0; i < njoblist; i++) {
1615                                 if (((intptr_t)
1616                                     cb->uaiocb._aiocb_private.kernelinfo) ==
1617                                     ijoblist[i]) {
1618                                         if (ujoblist[i] != cb->uuaiocb)
1619                                                 error = EINVAL;
1620                                         zfree(aiol_zone, ijoblist);
1621                                         zfree(aiol_zone, ujoblist);
1622                                         return error;
1623                                 }
1624                         }
1625                 }
1626
1627                 s = splbio();
1628                 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1629                     TAILQ_NEXT(cb, plist)) {
1630                         for (i = 0; i < njoblist; i++) {
1631                                 if (((intptr_t)
1632                                     cb->uaiocb._aiocb_private.kernelinfo) ==
1633                                     ijoblist[i]) {
1634                                         splx(s);
1635                                         if (ujoblist[i] != cb->uuaiocb)
1636                                                 error = EINVAL;
1637                                         zfree(aiol_zone, ijoblist);
1638                                         zfree(aiol_zone, ujoblist);
1639                                         return error;
1640                                 }
1641                         }
1642                 }
1643
1644                 ki->kaio_flags |= KAIO_WAKEUP;
1645                 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1646                 splx(s);
1647
1648                 if (error == ERESTART || error == EINTR) {
1649                         zfree(aiol_zone, ijoblist);
1650                         zfree(aiol_zone, ujoblist);
1651                         return EINTR;
1652                 } else if (error == EWOULDBLOCK) {
1653                         zfree(aiol_zone, ijoblist);
1654                         zfree(aiol_zone, ujoblist);
1655                         return EAGAIN;
1656                 }
1657         }
1658
1659 /* NOTREACHED */
1660         return EINVAL;
1661 #endif /* VFS_AIO */
1662 }
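
/*
 * Usage sketch (illustrative only): the EWOULDBLOCK-to-EAGAIN mapping
 * above is what a caller sees when the timeout expires, e.g. waiting at
 * most one second for either of two outstanding requests:
 *
 *	const struct aiocb *list[2] = { &cb0, &cb1 };
 *	struct timespec ts = { 1, 0 };
 *
 *	if (aio_suspend(list, 2, &ts) == -1 && errno == EAGAIN)
 *		warnx("no completion within one second");
 */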
1663
1664 /*
1665  * aio_cancel cancels any non-physio aio operations not currently in
1666  * progress.
1667  */
1668 int
1669 aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1670 {
1671 #ifndef VFS_AIO
1672         return ENOSYS;
1673 #else
1674         struct kaioinfo *ki;
1675         struct aiocblist *cbe, *cbn;
1676         struct file *fp;
1677         struct filedesc *fdp;
1678         struct socket *so;
1679         struct proc *po;
1680         int s, error;
1681         int cancelled = 0;
1682         int notcancelled = 0;
1683         struct vnode *vp;
1684
1685         fdp = p->p_fd;
1686
1687         if (uap->fd < 0 || uap->fd >= fdp->fd_nfiles)
1688                 return EBADF;
1689         fp = fdp->fd_ofiles[uap->fd];
1690         if (fp == NULL)
1691                 return EBADF;
1692
1693         if (fp->f_type == DTYPE_VNODE) {
1694                 vp = (struct vnode *)fp->f_data;
1695                 
1696                 if (vn_isdisk(vp, &error)) {
1697                         p->p_retval[0] = AIO_NOTCANCELED;
1698                         return 0;
1699                 }
1700         } else if (fp->f_type == DTYPE_SOCKET) {
1701                 so = (struct socket *)fp->f_data;
1702
1703                 s = splnet();
1704
1705                 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1706                         cbn = TAILQ_NEXT(cbe, list);
1707                         if ((uap->aiocbp == NULL) ||
1708                                 (uap->aiocbp == cbe->uuaiocb) ) {
1709                                 po = cbe->userproc;
1710                                 ki = po->p_aioinfo;
1711                                 TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1712                                 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1713                                 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1714                                 if (ki->kaio_flags & KAIO_WAKEUP) {
1715                                         wakeup(po);
1716                                 }
1717                                 cbe->jobstate = JOBST_JOBFINISHED;
1718                                 cbe->uaiocb._aiocb_private.status = -1;
1719                                 cbe->uaiocb._aiocb_private.error = ECANCELED;
1720                                 cancelled++;
1721 /* XXX cancelled, knote? */
1722                                 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1723                                     SIGEV_SIGNAL) {
1724                                         PROC_LOCK(cbe->userproc);
1725                                         psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1726                                         PROC_UNLOCK(cbe->userproc);
1727                                 }
1728                                 if (uap->aiocbp) 
1729                                         break;
1730                         }
1731                 }
1732         
1733                 splx(s);
1734
1735                 if ((cancelled) && (uap->aiocbp)) {
1736                         p->p_retval[0] = AIO_CANCELED;
1737                         return 0;
1738                 }
1739
1740         }
1741
1742         ki = p->p_aioinfo;
1743
1744         s = splnet();
1745
1746         for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1747                 cbn = TAILQ_NEXT(cbe, plist);
1748
1749                 if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1750                     ((uap->aiocbp == NULL ) || 
1751                      (uap->aiocbp == cbe->uuaiocb))) {
1752                         
1753                         if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1754                                 TAILQ_REMOVE(&aio_jobs, cbe, list);
1755                                 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1756                                 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1757                                     plist);
1758                                 cancelled++;
1759                                 ki->kaio_queue_finished_count++;
1760                                 cbe->jobstate = JOBST_JOBFINISHED;
1761                                 cbe->uaiocb._aiocb_private.status = -1;
1762                                 cbe->uaiocb._aiocb_private.error = ECANCELED;
1763 /* XXX cancelled, knote? */
1764                                 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1765                                     SIGEV_SIGNAL) {
1766                                         PROC_LOCK(cbe->userproc);
1767                                         psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1768                                         PROC_UNLOCK(cbe->userproc);
1769                                 }
1770                         } else {
1771                                 notcancelled++;
1772                         }
1773                 }
1774         }
1775
1776         splx(s);
1777
1779         if (notcancelled) {
1780                 p->p_retval[0] = AIO_NOTCANCELED;
1781                 return 0;
1782         }
1783
1784         if (cancelled) {
1785                 p->p_retval[0] = AIO_CANCELED;
1786                 return 0;
1787         }
1788
1789         p->p_retval[0] = AIO_ALLDONE;
1790
1791         return 0;
1792 #endif /* VFS_AIO */
1793 }
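
/*
 * Usage sketch (illustrative only): the three p_retval values set above
 * are the userland return values of aio_cancel(2):
 *
 *	const struct aiocb *list[1] = { &cb };
 *
 *	switch (aio_cancel(fd, &cb)) {
 *	case AIO_CANCELED:
 *		(void)aio_return(&cb);
 *		break;
 *	case AIO_NOTCANCELED:
 *		while (aio_error(&cb) == EINPROGRESS)
 *			(void)aio_suspend(list, 1, NULL);
 *		break;
 *	case AIO_ALLDONE:
 *		break;
 *	default:
 *		err(1, "aio_cancel");
 *	}
 */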
1794
1795 /*
1796  * aio_error is implemented at the kernel level for compatibility purposes
1797  * only.  For a user-mode async implementation, it would be best done as a
1798  * userland subroutine.
1799  */
1800 int
1801 aio_error(struct proc *p, struct aio_error_args *uap)
1802 {
1803 #ifndef VFS_AIO
1804         return ENOSYS;
1805 #else
1806         int s;
1807         struct aiocblist *cb;
1808         struct kaioinfo *ki;
1809         int jobref;
1810
1811         ki = p->p_aioinfo;
1812         if (ki == NULL)
1813                 return EINVAL;
1814
1815         jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1816         if ((jobref == -1) || (jobref == 0))
1817                 return EINVAL;
1818
1819         for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1820             plist)) {
1821                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1822                     jobref) {
1823                         p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1824                         return 0;
1825                 }
1826         }
1827
1828         s = splnet();
1829
1830         for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1831             plist)) {
1832                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1833                     jobref) {
1834                         p->p_retval[0] = EINPROGRESS;
1835                         splx(s);
1836                         return 0;
1837                 }
1838         }
1839
1840         for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1841             plist)) {
1842                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1843                     jobref) {
1844                         p->p_retval[0] = EINPROGRESS;
1845                         splx(s);
1846                         return 0;
1847                 }
1848         }
1849         splx(s);
1850
1851         s = splbio();
1852         for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1853             plist)) {
1854                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1855                     jobref) {
1856                         p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1857                         splx(s);
1858                         return 0;
1859                 }
1860         }
1861
1862         for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1863             plist)) {
1864                 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1865                     jobref) {
1866                         p->p_retval[0] = EINPROGRESS;
1867                         splx(s);
1868                         return 0;
1869                 }
1870         }
1871         splx(s);
1872
1873 #if 0
1874         /*
1875          * Hack for lio.
1876          */
1877         status = fuword(&uap->aiocbp->_aiocb_private.status);
1878         if (status == -1)
1879                 return fuword(&uap->aiocbp->_aiocb_private.error);
1880 #endif
1881         return EINVAL;
1882 #endif /* VFS_AIO */
1883 }
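
/*
 * Sketch of the userland alternative suggested in the comment above (an
 * assumption, not current practice): if the completion paths always
 * copied the final error back into the user-visible control block, as
 * the disabled "hack for lio" in this function reads it, a library
 * could answer aio_error() without entering the kernel:
 *
 *	int
 *	lib_aio_error(const struct aiocb *cb)
 *	{
 *
 *		return (cb->_aiocb_private.error);
 *	}
 */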
1884
1885 int
1886 aio_read(struct proc *p, struct aio_read_args *uap)
1887 {
1888 #ifndef VFS_AIO
1889         return ENOSYS;
1890 #else
1891         return aio_aqueue(p, uap->aiocbp, LIO_READ);
1892 #endif /* VFS_AIO */
1893 }
1894
1895 int
1896 aio_write(struct proc *p, struct aio_write_args *uap)
1897 {
1898 #ifndef VFS_AIO
1899         return ENOSYS;
1900 #else
1901         return aio_aqueue(p, uap->aiocbp, LIO_WRITE);
1902 #endif /* VFS_AIO */
1903 }
1904
1905 int
1906 lio_listio(struct proc *p, struct lio_listio_args *uap)
1907 {
1908 #ifndef VFS_AIO
1909         return ENOSYS;
1910 #else
1911         int nent, nentqueued;
1912         struct aiocb *iocb, * const *cbptr;
1913         struct aiocblist *cb;
1914         struct kaioinfo *ki;
1915         struct aio_liojob *lj;
1916         int error, runningcode;
1917         int nerror;
1918         int i;
1919         int s;
1920
1921         if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1922                 return EINVAL;
1923
1924         nent = uap->nent;
1925         if (nent > AIO_LISTIO_MAX)
1926                 return EINVAL;
1927
1928         if (p->p_aioinfo == NULL)
1929                 aio_init_aioinfo(p);
1930
1931         if ((nent + num_queue_count) > max_queue_count)
1932                 return EAGAIN;
1933
1934         ki = p->p_aioinfo;
1935         if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1936                 return EAGAIN;
1937
1938         lj = zalloc(aiolio_zone);
1939         if (!lj)
1940                 return EAGAIN;
1941
1942         lj->lioj_flags = 0;
1943         lj->lioj_buffer_count = 0;
1944         lj->lioj_buffer_finished_count = 0;
1945         lj->lioj_queue_count = 0;
1946         lj->lioj_queue_finished_count = 0;
1947         lj->lioj_ki = ki;
1948
1949         /*
1950          * Setup signal.
1951          */
1952         if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1953                 error = copyin(uap->sig, &lj->lioj_signal,
1954                                sizeof(lj->lioj_signal));
1955                 if (error) {
1956                         zfree(aiolio_zone, lj);
1957                         return error;
1958                 }
1959                 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
1960                         zfree(aiolio_zone, lj);
1961                         return EINVAL;
1962                 }
1963                 lj->lioj_flags |= LIOJ_SIGNAL;
1964                 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
1965         } else
1966                 lj->lioj_flags &= ~LIOJ_SIGNAL;
1967
1968         TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
1969         /*
1970          * Get pointers to the list of I/O requests.
1971          */
1972         nerror = 0;
1973         nentqueued = 0;
1974         cbptr = uap->acb_list;
1975         for (i = 0; i < uap->nent; i++) {
1976                 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1977                 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
1978                         error = _aio_aqueue(p, iocb, lj, 0);
1979                         if (error == 0)
1980                                 nentqueued++;
1981                         else
1982                                 nerror++;
1983                 }
1984         }
1985
1986         /*
1987          * If no requests were queued, there is nothing to wait for.
1988          */
1989         if (nentqueued == 0)
1990                 return 0;
1991
1992         /*
1993          * Calculate the appropriate error return.
1994          */
1995         runningcode = 0;
1996         if (nerror)
1997                 runningcode = EIO;
1998
1999         if (uap->mode == LIO_WAIT) {
2000                 int command, found, jobref;
2001                 
2002                 for (;;) {
2003                         found = 0;
2004                         for (i = 0; i < uap->nent; i++) {
2005                                 /*
2006                                  * Fetch address of the control buf pointer in
2007                                  * user space.
2008                                  */
2009                                 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2010                                 if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2011                                     == 0))
2012                                         continue;
2013
2014                                 /*
2015                                  * Fetch the associated command from user space.
2016                                  */
2017                                 command = fuword(&iocb->aio_lio_opcode);
2018                                 if (command == LIO_NOP) {
2019                                         found++;
2020                                         continue;
2021                                 }
2022
2023                                 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2024
2025                                 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2026                                         if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2027                                             == jobref) {
2028                                                 if (cb->uaiocb.aio_lio_opcode
2029                                                     == LIO_WRITE) {
2030                                                         curproc->p_stats->p_ru.ru_oublock
2031                                                             +=
2032                                                             cb->outputcharge;
2033                                                         cb->outputcharge = 0;
2034                                                 } else if (cb->uaiocb.aio_lio_opcode
2035                                                     == LIO_READ) {
2036                                                         curproc->p_stats->p_ru.ru_inblock
2037                                                             += cb->inputcharge;
2038                                                         cb->inputcharge = 0;
2039                                                 }
2040                                                 found++;
2041                                                 break;
2042                                         }
2043                                 }
2044
2045                                 s = splbio();
2046                                 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2047                                         if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2048                                             == jobref) {
2049                                                 found++;
2050                                                 break;
2051                                         }
2052                                 }
2053                                 splx(s);
2054                         }
2055
2056                         /*
2057                          * If all I/Os have been disposed of, then we can
2058                          * return.
2059                          */
2060                         if (found == nentqueued)
2061                                 return runningcode;
2062                         
2063                         ki->kaio_flags |= KAIO_WAKEUP;
2064                         error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2065
2066                         if (error == EINTR)
2067                                 return EINTR;
2068                         else if (error == EWOULDBLOCK)
2069                                 return EAGAIN;
2070                 }
2071         }
2072
2073         return runningcode;
2074 #endif /* VFS_AIO */
2075 }
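
/*
 * Usage sketch (illustrative only): a synchronous batch of two reads via
 * LIO_WAIT, which exercises the rundown loop above:
 *
 *	struct aiocb cb0, cb1;
 *	struct aiocb *batch[2] = { &cb0, &cb1 };
 *
 *	memset(&cb0, 0, sizeof(cb0));
 *	cb0.aio_fildes = fd;
 *	cb0.aio_buf = buf0;
 *	cb0.aio_nbytes = sizeof(buf0);
 *	cb0.aio_lio_opcode = LIO_READ;
 *	cb1 = cb0;
 *	cb1.aio_buf = buf1;
 *	cb1.aio_nbytes = sizeof(buf1);
 *	cb1.aio_offset = sizeof(buf0);
 *	if (lio_listio(LIO_WAIT, batch, 2, NULL) == -1)
 *		err(1, "lio_listio");
 */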
2076
2077 #ifdef VFS_AIO
2078 /*
2079  * This is a weird hack so that we can post a signal.  It is safe to do so from
2080  * a timeout routine, but *not* from an interrupt routine.
2081  */
2082 static void
2083 process_signal(void *aioj)
2084 {
2085         struct aiocblist *aiocbe = aioj;
2086         struct aio_liojob *lj = aiocbe->lio;
2087         struct aiocb *cb = &aiocbe->uaiocb;
2088
2089         if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2090                 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2091                 PROC_LOCK(lj->lioj_ki->kaio_p);
2092                 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2093                 PROC_UNLOCK(lj->lioj_ki->kaio_p);
2094                 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2095         }
2096
2097         if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2098                 PROC_LOCK(aiocbe->userproc);
2099                 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2100                 PROC_UNLOCK(aiocbe->userproc);
2101         }
2102 }
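
/*
 * Why the deferral works (sketch, grounded in aio_physwakeup() below):
 * biodone() runs the completion at interrupt time, where psignal() is
 * unsafe, so the completion path reschedules signal delivery into
 * softclock context with a zero-tick timeout:
 *
 *	aiocbe->timeouthandle = timeout(process_signal, aiocbe, 0);
 *
 * By the time process_signal() runs, it is in timeout (softclock)
 * context, where taking the proc lock and posting the signal is safe.
 */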
2103
2104 /*
2105  * Interrupt handler for physio; performs the necessary process wakeups
2106  * and signals.
2107  */
2108 static void
2109 aio_physwakeup(struct buf *bp)
2110 {
2111         struct aiocblist *aiocbe;
2112         struct proc *p;
2113         struct kaioinfo *ki;
2114         struct aio_liojob *lj;
2115
2116         wakeup(bp);
2117
2118         aiocbe = (struct aiocblist *)bp->b_spc;
2119         if (aiocbe) {
2120                 p = bp->b_caller1;
2121
2122                 aiocbe->jobstate = JOBST_JOBBFINISHED;
2123                 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2124                 aiocbe->uaiocb._aiocb_private.error = 0;
2125                 aiocbe->jobflags |= AIOCBLIST_DONE;
2126
2127                 if (bp->b_ioflags & BIO_ERROR)
2128                         aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2129
2130                 lj = aiocbe->lio;
2131                 if (lj) {
2132                         lj->lioj_buffer_finished_count++;
2133                         
2134                         /*
2135                          * Wakeup/signal if all of the interrupt jobs are done.
2136                          */
2137                         if (lj->lioj_buffer_finished_count ==
2138                             lj->lioj_buffer_count) {
2139                                 /*
2140                                  * Post a signal if it is called for.
2141                                  */
2142                                 if ((lj->lioj_flags &
2143                                     (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2144                                     LIOJ_SIGNAL) {
2145                                         lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2146                                         aiocbe->timeouthandle =
2147                                                 timeout(process_signal,
2148                                                         aiocbe, 0);
2149                                 }
2150                         }
2151                 }
2152
2153                 ki = p->p_aioinfo;
2154                 if (ki) {
2155                         ki->kaio_buffer_finished_count++;
2156                         TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2157                         TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2158                         TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2159
2160                         KNOTE(&aiocbe->klist, 0);
2161                         /* Do the wakeup. */
2162                         if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2163                                 ki->kaio_flags &= ~KAIO_WAKEUP;
2164                                 wakeup(p);
2165                         }
2166                 }
2167
2168                 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2169                         aiocbe->timeouthandle =
2170                                 timeout(process_signal, aiocbe, 0);
2171         }
2172 }
2173 #endif /* VFS_AIO */
2174
2175 int
2176 aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap)
2177 {
2178 #ifndef VFS_AIO
2179         return ENOSYS;
2180 #else
2181         struct timeval atv;
2182         struct timespec ts;
2183         struct aiocb **cbptr;
2184         struct kaioinfo *ki;
2185         struct aiocblist *cb = NULL;
2186         int error, s, timo;
2187         
2188         suword(uap->aiocbp, (long)NULL);
2189
2190         timo = 0;
2191         if (uap->timeout) {
2192                 /* Get timespec struct. */
2193                 error = copyin(uap->timeout, &ts, sizeof(ts));
2194                 if (error)
2195                         return error;
2196
2197                 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2198                         return (EINVAL);
2199
2200                 TIMESPEC_TO_TIMEVAL(&atv, &ts);
2201                 if (itimerfix(&atv))
2202                         return (EINVAL);
2203                 timo = tvtohz(&atv);
2204         }
2205
2206         ki = p->p_aioinfo;
2207         if (ki == NULL)
2208                 return EAGAIN;
2209
2210         cbptr = uap->aiocbp;
2211
2212         for (;;) {
2213                 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2214                         suword(uap->aiocbp, (long)cb->uuaiocb);
2215                         p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2216                         /* Fetch the error before the entry is freed. */
2217                         error = cb->uaiocb._aiocb_private.error;
2218                         if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2219                                 curproc->p_stats->p_ru.ru_oublock +=
2220                                     cb->outputcharge;
2221                                 cb->outputcharge = 0;
2222                         } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2223                                 curproc->p_stats->p_ru.ru_inblock +=
2224                                     cb->inputcharge;
2225                                 cb->inputcharge = 0;
2226                         }
2227                         aio_free_entry(cb);
2228                         return error;
2227                 }
2228
2229                 s = splbio();
2230                 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0) {
2231                         splx(s);
2232                         suword(uap->aiocbp, (long)cb->uuaiocb);
2233                         p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2234                         /* Fetch the error before the entry is freed. */
2235                         error = cb->uaiocb._aiocb_private.error;
2236                         aio_free_entry(cb);
2237                         return error;
2238                 }
2237
2238                 ki->kaio_flags |= KAIO_WAKEUP;
2239                 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2240                 splx(s);
2241
2242                 if (error == ERESTART || error == EINTR)
2243                         return EINTR;
2244                 else if (error == EWOULDBLOCK)
2245                         return EAGAIN;
2250         }
2251 #endif /* VFS_AIO */
2252 }
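
/*
 * Usage sketch (illustrative only): aio_waitcomplete() rolls the
 * suspend and return steps into one call and hands back whichever
 * request finished first:
 *
 *	struct aiocb *done;
 *	ssize_t n;
 *
 *	if ((n = aio_waitcomplete(&done, NULL)) == -1)
 *		err(1, "aio_waitcomplete");
 *
 * On return "done" points at the caller's completed control block, n is
 * the transfer count, and no separate aio_return() call is needed.
 */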
2253
2254
2255 #ifndef VFS_AIO
2256 static int
2257 filt_aioattach(struct knote *kn)
2258 {
2259
2260         return (ENXIO);
2261 }
2262
2263 struct filterops aio_filtops =
2264         { 0, filt_aioattach, NULL, NULL };
2265
2266 #else
2267 static int
2268 filt_aioattach(struct knote *kn)
2269 {
2270         struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2271
2272         /*
2273          * The aiocbe pointer must be validated before using it, so
2274          * registration is restricted to the kernel; the user cannot
2275          * set EV_FLAG1.
2276          */
2277         if ((kn->kn_flags & EV_FLAG1) == 0)
2278                 return (EPERM);
2279         kn->kn_flags &= ~EV_FLAG1;
2280
2281         SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2282
2283         return (0);
2284 }
2285
2286 static void
2287 filt_aiodetach(struct knote *kn)
2288 {
2289         struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2290         int s = splhigh();       /* XXX no clue, so overkill */
2291
2292         SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2293         splx(s);
2294 }
2295
2296 /*ARGSUSED*/
2297 static int
2298 filt_aio(struct knote *kn, long hint)
2299 {
2300         struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2301
2302         kn->kn_data = 0;                /* XXX data returned? */
2303         if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2304             aiocbe->jobstate != JOBST_JOBBFINISHED)
2305                 return (0);
2306         kn->kn_flags |= EV_EOF; 
2307         return (1);
2308 }
2309
2310 struct filterops aio_filtops =
2311         { 0, filt_aioattach, filt_aiodetach, filt_aio };
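
/*
 * Registration sketch (an assumption about the call site, which lives
 * in _aio_aqueue() and is not shown here): because filt_aioattach()
 * rejects knotes without EV_FLAG1, only the kernel can hook an aiocb up
 * to EVFILT_AIO, roughly:
 *
 *	struct kevent kev;
 *
 *	kev.ident = (uintptr_t)aiocbe;
 *	kev.filter = EVFILT_AIO;
 *	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 *	error = kqueue_register(kq, &kev, p);
 */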
2312 #endif /* VFS_AIO */