Index: sys/conf/files =================================================================== --- sys/conf/files (revision 226066) +++ sys/conf/files (working copy) @@ -2397,6 +2397,7 @@ kern/subr_msgbuf.c standard kern/subr_param.c standard kern/subr_pcpu.c standard +kern/subr_pipe.c standard kern/subr_power.c standard kern/subr_prf.c standard kern/subr_prof.c standard Index: sys/kern/subr_pipe.c =================================================================== --- sys/kern/subr_pipe.c (revision 0) +++ sys/kern/subr_pipe.c (revision 0) @@ -0,0 +1,1530 @@ +/*- + * Copyright (c) 1996 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + */ + +/* + * This file contains a high-performance replacement for the socket-based + * pipes scheme originally used in FreeBSD/4.4Lite. It does not support + * all features of sockets, but does do everything that pipes normally + * do. + */ + +/* + * This code has two modes of operation, a small write mode and a large + * write mode. The small write mode acts like conventional pipes with + * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the + * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT + * and PIPE_SIZE in size, the sending process pins the underlying pages in + * memory, and the receiving process copies directly from these pinned pages + * in the sending process. + * + * If the sending process receives a signal, it is possible that it will + * go away, and certainly its address space can change, because control + * is returned back to the user-mode side. In that case, the pipe code + * arranges to copy the buffer supplied by the user process, to a pageable + * kernel buffer, and the receiving process will grab the data from the + * pageable kernel buffer. Since signals don't happen all that often, + * the copy operation is normally eliminated. + * + * The constant PIPE_MINDIRECT is chosen to make sure that buffering will + * happen for small transfers so that the system will not spend all of + * its time context switching. + * + * In order to limit the resource use of pipes, two sysctls exist: + * + * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable + * address space available to us in pipe_map. This value is normally + * autotuned, but may also be loader tuned. + * + * kern.ipc.pipekva - This read-only sysctl tracks the current amount of + * memory in use by pipes. + * + * Based on how large pipekva is relative to maxpipekva, the following + * will happen: + * + * 0% - 50%: + * New pipes are given 16K of memory backing, pipes may dynamically + * grow to as large as 64K where needed. + * 50% - 75%: + * New pipes are given 4K (or PAGE_SIZE) of memory backing, + * existing pipes may NOT grow. + * 75% - 100%: + * New pipes are given 4K (or PAGE_SIZE) of memory backing, + * existing pipes will be shrunk down to 4K whenever possible. + * + * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If + * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE + * resize which MUST occur for reverse-direction pipes when they are + * first used. + * + * Additional information about the current state of pipes may be obtained + * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, + * and kern.ipc.piperesizefail. + * + * Locking rules: There are two locks present here: A mutex, used via + * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via + * the flag, as mutexes can not persist over uiomove. The mutex + * exists only to guard access to the flag, and is not in itself a + * locking mechanism. Also note that there is only a single mutex for + * both directions of a pipe. + * + * As pipelock() may have to sleep before it can acquire the flag, it + * is important to reread all data after a call to pipelock(); everything + * in the structure may have changed. + */ + +#include +__FBSDID("$FreeBSD"); + +#include "opt_mac.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Use this define if you want to disable *fancy* VM things. Expect an + * approx 30% decrease in transfer rate. This could be useful for + * NetBSD or OpenBSD. + */ +/* #define PIPE_NODIRECT */ + +static void filt_pipedetach(struct knote *kn); +static int filt_piperead(struct knote *kn, long hint); +static int filt_pipewrite(struct knote *kn, long hint); + +static struct filterops pipe_rfiltops = { + .f_isfd = 1, + .f_detach = filt_pipedetach, + .f_event = filt_piperead +}; +static struct filterops pipe_wfiltops = { + .f_isfd = 1, + .f_detach = filt_pipedetach, + .f_event = filt_pipewrite +}; + +/* + * Default pipe buffer size(s), this can be kind-of large now because pipe + * space is pageable. The pipe code will try to maintain locality of + * reference for performance reasons, so small amounts of outstanding I/O + * will not wipe the cache. + */ +#define MINPIPESIZE (PIPE_SIZE/3) +#define MAXPIPESIZE (2*PIPE_SIZE/3) + +static long amountpipekva; +static int pipefragretry; +static int pipeallocfail; +static int piperesizefail; +static int piperesizeallowed = 1; + +SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN, + &maxpipekva, 0, "Pipe KVA limit"); +SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, + &amountpipekva, 0, "Pipe KVA usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, + &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); +SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, + &pipeallocfail, 0, "Pipe allocation failures"); +SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, + &piperesizefail, 0, "Pipe resize failures"); +SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, + &piperesizeallowed, 0, "Pipe resizing allowed"); + +static void pipeinit(void *dummy __unused); +static void pipe_free_kmem(struct pipe *cpipe); +static int pipe_create(struct pipe *pipe, int backing); +static __inline int pipelock(struct pipe *cpipe, int catch); +static __inline void pipeunlock(struct pipe *cpipe); +static __inline void pipeselwakeup(struct pipe *cpipe); +#ifndef PIPE_NODIRECT +static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); +static void pipe_destroy_write_buffer(struct pipe *wpipe); +static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); +static void pipe_clone_write_buffer(struct pipe *wpipe); +#endif +static int pipespace(struct pipe *cpipe, int size); +static int pipespace_new(struct pipe *cpipe, int size); + +static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); +static int pipe_zone_init(void *mem, int size, int flags); +static void pipe_zone_fini(void *mem, int size); + +static uma_zone_t pipe_zone; +static struct unrhdr *pipeino_unr; +static dev_t pipedev_ino; + +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); + +static void +pipeinit(void *dummy __unused) +{ + + pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), + pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, + UMA_ALIGN_PTR, 0); + KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); + pipeino_unr = new_unrhdr(1, INT32_MAX, NULL); + KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized")); + pipedev_ino = devfs_alloc_cdp_inode(); + KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); +} + +static int +pipe_zone_ctor(void *mem, int size, void *arg, int flags) +{ + struct pipepair *pp; + struct pipe *rpipe, *wpipe; + + KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); + + pp = (struct pipepair *)mem; + + /* + * We zero both pipe endpoints to make sure all the kmem pointers + * are NULL, flag fields are zero'd, etc. We timestamp both + * endpoints with the same time. + */ + rpipe = &pp->pp_rpipe; + bzero(rpipe, sizeof(*rpipe)); + vfs_timestamp(&rpipe->pipe_ctime); + rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; + + wpipe = &pp->pp_wpipe; + bzero(wpipe, sizeof(*wpipe)); + wpipe->pipe_ctime = rpipe->pipe_ctime; + wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; + + rpipe->pipe_peer = wpipe; + rpipe->pipe_pair = pp; + wpipe->pipe_peer = rpipe; + wpipe->pipe_pair = pp; + + /* + * Mark both endpoints as present; they will later get free'd + * one at a time. When both are free'd, then the whole pair + * is released. + */ + rpipe->pipe_present = PIPE_ACTIVE; + wpipe->pipe_present = PIPE_ACTIVE; + + /* + * Eventually, the MAC Framework may initialize the label + * in ctor or init, but for now we do it elswhere to avoid + * blocking in ctor or init. + */ + pp->pp_label = NULL; + + return (0); +} + +static int +pipe_zone_init(void *mem, int size, int flags) +{ + struct pipepair *pp; + + KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); + + pp = (struct pipepair *)mem; + + mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE); + return (0); +} + +static void +pipe_zone_fini(void *mem, int size) +{ + struct pipepair *pp; + + KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); + + pp = (struct pipepair *)mem; + + mtx_destroy(&pp->pp_mtx); +} + +int +pipepair_create(struct thread *td, struct pipe **p_rpipe, struct pipe **p_wpipe) +{ + struct pipepair *pp; + struct pipe *rpipe, *wpipe; + int error; + + pp = uma_zalloc(pipe_zone, M_WAITOK); +#ifdef MAC + /* + * The MAC label is shared between the connected endpoints. As a + * result mac_pipe_init() and mac_pipe_create() are called once + * for the pair, and not on the endpoints. + */ + mac_pipe_init(pp); + mac_pipe_create(td->td_ucred, pp); +#endif + rpipe = &pp->pp_rpipe; + wpipe = &pp->pp_wpipe; + *p_rpipe = rpipe; + *p_wpipe = wpipe; + + knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); + knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); + + if ((error = pipe_create(rpipe, 1)) != 0 || + (error = pipe_create(wpipe, 0)) != 0) { + pipe_close(rpipe); + pipe_close(wpipe); + return (error); + } + + rpipe->pipe_state |= PIPE_DIRECTOK; + wpipe->pipe_state |= PIPE_DIRECTOK; + + return (0); +} + +/* + * Allocate kva for pipe circular buffer, the space is pageable + * This routine will 'realloc' the size of a pipe safely, if it fails + * it will retain the old buffer. + * If it fails it will return ENOMEM. + */ +static int +pipespace_new(cpipe, size) + struct pipe *cpipe; + int size; +{ + caddr_t buffer; + int error, cnt, firstseg; + static int curfail = 0; + static struct timeval lastfail; + + KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); + KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), + ("pipespace: resize of direct writes not allowed")); +retry: + cnt = cpipe->pipe_buffer.cnt; + if (cnt > size) + size = cnt; + + size = round_page(size); + buffer = (caddr_t) vm_map_min(pipe_map); + + error = vm_map_find(pipe_map, NULL, 0, + (vm_offset_t *) &buffer, size, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error != KERN_SUCCESS) { + if ((cpipe->pipe_buffer.buffer == NULL) && + (size > SMALL_PIPE_SIZE)) { + size = SMALL_PIPE_SIZE; + pipefragretry++; + goto retry; + } + if (cpipe->pipe_buffer.buffer == NULL) { + pipeallocfail++; + if (ppsratecheck(&lastfail, &curfail, 1)) + printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); + } else { + piperesizefail++; + } + return (ENOMEM); + } + + /* copy data, then free old resources if we're resizing */ + if (cnt > 0) { + if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { + firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; + bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], + buffer, firstseg); + if ((cnt - firstseg) > 0) + bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], + cpipe->pipe_buffer.in); + } else { + bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], + buffer, cnt); + } + } + pipe_free_kmem(cpipe); + cpipe->pipe_buffer.buffer = buffer; + cpipe->pipe_buffer.size = size; + cpipe->pipe_buffer.in = cnt; + cpipe->pipe_buffer.out = 0; + cpipe->pipe_buffer.cnt = cnt; + atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); + return (0); +} + +/* + * Wrapper for pipespace_new() that performs locking assertions. + */ +static int +pipespace(cpipe, size) + struct pipe *cpipe; + int size; +{ + + KASSERT(cpipe->pipe_state & PIPE_LOCKFL, + ("Unlocked pipe passed to pipespace")); + return (pipespace_new(cpipe, size)); +} + +/* + * lock a pipe for I/O, blocking other access + */ +static __inline int +pipelock(cpipe, catch) + struct pipe *cpipe; + int catch; +{ + int error; + + PIPE_LOCK_ASSERT(cpipe, MA_OWNED); + while (cpipe->pipe_state & PIPE_LOCKFL) { + cpipe->pipe_state |= PIPE_LWANT; + error = msleep(cpipe, PIPE_MTX(cpipe), + catch ? (PRIBIO | PCATCH) : PRIBIO, + "pipelk", 0); + if (error != 0) + return (error); + } + cpipe->pipe_state |= PIPE_LOCKFL; + return (0); +} + +/* + * unlock a pipe I/O lock + */ +static __inline void +pipeunlock(cpipe) + struct pipe *cpipe; +{ + + PIPE_LOCK_ASSERT(cpipe, MA_OWNED); + KASSERT(cpipe->pipe_state & PIPE_LOCKFL, + ("Unlocked pipe passed to pipeunlock")); + cpipe->pipe_state &= ~PIPE_LOCKFL; + if (cpipe->pipe_state & PIPE_LWANT) { + cpipe->pipe_state &= ~PIPE_LWANT; + wakeup(cpipe); + } +} + +static __inline void +pipeselwakeup(cpipe) + struct pipe *cpipe; +{ + + PIPE_LOCK_ASSERT(cpipe, MA_OWNED); + if (cpipe->pipe_state & PIPE_SEL) { + selwakeuppri(&cpipe->pipe_sel, PSOCK); + if (!SEL_WAITING(&cpipe->pipe_sel)) + cpipe->pipe_state &= ~PIPE_SEL; + } + if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) + pgsigio(&cpipe->pipe_sigio, SIGIO, 0); + KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); +} + +/* + * Initialize and allocate VM and memory for pipe. The structure + * will start out zero'd from the ctor, so we just manage the kmem. + */ +static int +pipe_create(pipe, backing) + struct pipe *pipe; + int backing; +{ + int error; + + if (backing) { + if (amountpipekva > maxpipekva / 2) + error = pipespace_new(pipe, SMALL_PIPE_SIZE); + else + error = pipespace_new(pipe, PIPE_SIZE); + } else { + /* If we're not backing this pipe, no need to do anything. */ + error = 0; + } + if (error == 0) { + pipe->pipe_ino = alloc_unr(pipeino_unr); + if (pipe->pipe_ino == -1) + /* pipeclose will clear allocated kva */ + error = ENOMEM; + } + return (error); +} + +int +pipe_read(struct pipe *pipe, struct uio *uio, struct ucred *active_cred, + int f_flags, struct thread *td) +{ + struct pipe *rpipe = pipe; + int error; + int nread = 0; + u_int size; + + PIPE_LOCK(rpipe); + ++rpipe->pipe_busy; + error = pipelock(rpipe, 1); + if (error) + goto unlocked_error; + +#ifdef MAC + error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); + if (error) + goto locked_error; +#endif + if (amountpipekva > (3 * maxpipekva) / 4) { + if (!(rpipe->pipe_state & PIPE_DIRECTW) && + (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && + (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && + (piperesizeallowed == 1)) { + PIPE_UNLOCK(rpipe); + pipespace(rpipe, SMALL_PIPE_SIZE); + PIPE_LOCK(rpipe); + } + } + + while (uio->uio_resid) { + /* + * normal pipe buffer receive + */ + if (rpipe->pipe_buffer.cnt > 0) { + size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; + if (size > rpipe->pipe_buffer.cnt) + size = rpipe->pipe_buffer.cnt; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + + PIPE_UNLOCK(rpipe); + error = uiomove( + &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], + size, uio); + PIPE_LOCK(rpipe); + if (error) + break; + + rpipe->pipe_buffer.out += size; + if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) + rpipe->pipe_buffer.out = 0; + + rpipe->pipe_buffer.cnt -= size; + + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + if (rpipe->pipe_buffer.cnt == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + } + nread += size; +#ifndef PIPE_NODIRECT + /* + * Direct copy, bypassing a kernel buffer. + */ + } else if ((size = rpipe->pipe_map.cnt) && + (rpipe->pipe_state & PIPE_DIRECTW)) { + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + + PIPE_UNLOCK(rpipe); + error = uiomove_fromphys(rpipe->pipe_map.ms, + rpipe->pipe_map.pos, size, uio); + PIPE_LOCK(rpipe); + if (error) + break; + nread += size; + rpipe->pipe_map.pos += size; + rpipe->pipe_map.cnt -= size; + if (rpipe->pipe_map.cnt == 0) { + rpipe->pipe_state &= ~PIPE_DIRECTW; + wakeup(rpipe); + } +#endif + } else { + /* + * detect EOF condition + * read returns 0 on EOF, no need to set error + */ + if (rpipe->pipe_state & PIPE_EOF) + break; + + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + + /* + * Break if some data was read. + */ + if (nread > 0) + break; + + /* + * Unlock the pipe buffer for our remaining processing. + * We will either break out with an error or we will + * sleep and relock to loop. + */ + pipeunlock(rpipe); + + /* + * Handle non-blocking mode operation or + * wait for more data. + */ + if (f_flags & FNONBLOCK) { + error = EAGAIN; + } else { + rpipe->pipe_state |= PIPE_WANTR; + if ((error = msleep(rpipe, PIPE_MTX(rpipe), + PRIBIO | PCATCH, + "piperd", 0)) == 0) + error = pipelock(rpipe, 1); + } + if (error) + goto unlocked_error; + } + } +#ifdef MAC +locked_error: +#endif + pipeunlock(rpipe); + + /* XXX: should probably do this before getting any locks. */ + if (error == 0) + vfs_timestamp(&rpipe->pipe_atime); +unlocked_error: + --rpipe->pipe_busy; + + /* + * PIPE_WANT processing only makes sense if pipe_busy is 0. + */ + if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { + rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); + wakeup(rpipe); + } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + /* + * Handle write blocking hysteresis. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + } + + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + pipeselwakeup(rpipe); + + PIPE_UNLOCK(rpipe); + return (error); +} + +#ifndef PIPE_NODIRECT +/* + * Map the sending processes' buffer into kernel space and wire it. + * This is similar to a physical write operation. + */ +static int +pipe_build_write_buffer(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + u_int size; + int i; + + PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); + KASSERT(wpipe->pipe_state & PIPE_DIRECTW, + ("Clone attempt on non-direct write pipe!")); + + size = (u_int) uio->uio_iov->iov_len; + if (size > wpipe->pipe_buffer.size) + size = wpipe->pipe_buffer.size; + + if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, + wpipe->pipe_map.ms, PIPENPAGES)) < 0) + return (EFAULT); + +/* + * set up the control block + */ + wpipe->pipe_map.npages = i; + wpipe->pipe_map.pos = + ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; + wpipe->pipe_map.cnt = size; + +/* + * and update the uio data + */ + + uio->uio_iov->iov_len -= size; + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; + if (uio->uio_iov->iov_len == 0) + uio->uio_iov++; + uio->uio_resid -= size; + uio->uio_offset += size; + return (0); +} + +/* + * unmap and unwire the process buffer + */ +static void +pipe_destroy_write_buffer(wpipe) + struct pipe *wpipe; +{ + + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); + wpipe->pipe_map.npages = 0; +} + +/* + * In the case of a signal, the writing process might go away. This + * code copies the data into the circular buffer so that the source + * pages can be freed without loss of data. + */ +static void +pipe_clone_write_buffer(wpipe) + struct pipe *wpipe; +{ + struct uio uio; + struct iovec iov; + int size; + int pos; + + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + size = wpipe->pipe_map.cnt; + pos = wpipe->pipe_map.pos; + + wpipe->pipe_buffer.in = size; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = size; + wpipe->pipe_state &= ~PIPE_DIRECTW; + + PIPE_UNLOCK(wpipe); + iov.iov_base = wpipe->pipe_buffer.buffer; + iov.iov_len = size; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = size; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_rw = UIO_READ; + uio.uio_td = curthread; + uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); + PIPE_LOCK(wpipe); + pipe_destroy_write_buffer(wpipe); +} + +/* + * This implements the pipe buffer write mechanism. Note that only + * a direct write OR a normal pipe write can be pending at any given time. + * If there are any characters in the pipe buffer, the direct write will + * be deferred until the receiving process grabs all of the bytes from + * the pipe buffer. Then the direct mapping write is set-up. + */ +static int +pipe_direct_write(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + int error; + +retry: + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + error = pipelock(wpipe, 1); + if (wpipe->pipe_state & PIPE_EOF) + error = EPIPE; + if (error) { + pipeunlock(wpipe); + goto error1; + } + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + wpipe->pipe_state |= PIPE_WANTW; + pipeunlock(wpipe); + error = msleep(wpipe, PIPE_MTX(wpipe), + PRIBIO | PCATCH, "pipdww", 0); + if (error) + goto error1; + else + goto retry; + } + wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ + if (wpipe->pipe_buffer.cnt > 0) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + wpipe->pipe_state |= PIPE_WANTW; + pipeunlock(wpipe); + error = msleep(wpipe, PIPE_MTX(wpipe), + PRIBIO | PCATCH, "pipdwc", 0); + if (error) + goto error1; + else + goto retry; + } + + wpipe->pipe_state |= PIPE_DIRECTW; + + PIPE_UNLOCK(wpipe); + error = pipe_build_write_buffer(wpipe, uio); + PIPE_LOCK(wpipe); + if (error) { + wpipe->pipe_state &= ~PIPE_DIRECTW; + pipeunlock(wpipe); + goto error1; + } + + error = 0; + while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + if (wpipe->pipe_state & PIPE_EOF) { + pipe_destroy_write_buffer(wpipe); + pipeselwakeup(wpipe); + pipeunlock(wpipe); + error = EPIPE; + goto error1; + } + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + pipeunlock(wpipe); + error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, + "pipdwt", 0); + pipelock(wpipe, 0); + } + + if (wpipe->pipe_state & PIPE_EOF) + error = EPIPE; + if (wpipe->pipe_state & PIPE_DIRECTW) { + /* + * this bit of trickery substitutes a kernel buffer for + * the process that might be going away. + */ + pipe_clone_write_buffer(wpipe); + } else { + pipe_destroy_write_buffer(wpipe); + } + pipeunlock(wpipe); + return (error); + +error1: + wakeup(wpipe); + return (error); +} +#endif + +int +pipe_write(struct pipe *pipe, struct uio *uio, struct ucred *active_cred, + int f_flags, struct thread *td) +{ + int error = 0; + int desiredsize, orig_resid; + struct pipe *rpipe, *wpipe; + + rpipe = pipe; + wpipe = rpipe->pipe_peer; + + PIPE_LOCK(rpipe); + error = pipelock(wpipe, 1); + if (error) { + PIPE_UNLOCK(rpipe); + return (error); + } + + /* + * detect loss of pipe read side, issue SIGPIPE if lost. + */ + if (wpipe->pipe_present != PIPE_ACTIVE || + (wpipe->pipe_state & PIPE_EOF)) { + pipeunlock(wpipe); + PIPE_UNLOCK(rpipe); + return (EPIPE); + } +#ifdef MAC + error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); + if (error) { + pipeunlock(wpipe); + PIPE_UNLOCK(rpipe); + return (error); + } +#endif + ++wpipe->pipe_busy; + + /* Choose a larger size if it's advantageous */ + desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); + while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { + if (piperesizeallowed != 1) + break; + if (amountpipekva > maxpipekva / 2) + break; + if (desiredsize == BIG_PIPE_SIZE) + break; + desiredsize = desiredsize * 2; + } + + /* Choose a smaller size if we're in a OOM situation */ + if ((amountpipekva > (3 * maxpipekva) / 4) && + (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && + (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && + (piperesizeallowed == 1)) + desiredsize = SMALL_PIPE_SIZE; + + /* Resize if the above determined that a new size was necessary */ + if ((desiredsize != wpipe->pipe_buffer.size) && + ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { + PIPE_UNLOCK(wpipe); + pipespace(wpipe, desiredsize); + PIPE_LOCK(wpipe); + } + if (wpipe->pipe_buffer.size == 0) { + /* + * This can only happen for reverse direction use of pipes + * in a complete OOM situation. + */ + error = ENOMEM; + --wpipe->pipe_busy; + pipeunlock(wpipe); + PIPE_UNLOCK(wpipe); + return (error); + } + + pipeunlock(wpipe); + + orig_resid = uio->uio_resid; + + while (uio->uio_resid) { + int space; + + pipelock(wpipe, 0); + if (wpipe->pipe_state & PIPE_EOF) { + pipeunlock(wpipe); + error = EPIPE; + break; + } +#ifndef PIPE_NODIRECT + /* + * If the transfer is large, we can gain performance if + * we do process-to-process copies directly. + * If the write is non-blocking, we don't use the + * direct write mechanism. + * + * The direct write mechanism will detect the reader going + * away on us. + */ + if (uio->uio_segflg == UIO_USERSPACE && + uio->uio_iov->iov_len >= PIPE_MINDIRECT && + wpipe->pipe_buffer.size >= PIPE_MINDIRECT && + (f_flags & FNONBLOCK) == 0) { + pipeunlock(wpipe); + error = pipe_direct_write(wpipe, uio); + if (error) + break; + continue; + } +#endif + + /* + * Pipe buffered writes cannot be coincidental with + * direct writes. We wait until the currently executing + * direct write is completed before we start filling the + * pipe buffer. We break out if a signal occurs or the + * reader goes away. + */ + if (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + wpipe->pipe_state |= PIPE_WANTW; + pipeunlock(wpipe); + error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, + "pipbww", 0); + if (error) + break; + else + continue; + } + + space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + + /* Writes of size <= PIPE_BUF must be atomic. */ + if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) + space = 0; + + if (space > 0) { + int size; /* Transfer size */ + int segsize; /* first segment to transfer */ + + /* + * Transfer size is minimum of uio transfer + * and free space in pipe buffer. + */ + if (space > uio->uio_resid) + size = uio->uio_resid; + else + size = space; + /* + * First segment to transfer is minimum of + * transfer size and contiguous space in + * pipe buffer. If first segment to transfer + * is less than the transfer size, we've got + * a wraparound in the buffer. + */ + segsize = wpipe->pipe_buffer.size - + wpipe->pipe_buffer.in; + if (segsize > size) + segsize = size; + + /* Transfer first segment */ + + PIPE_UNLOCK(rpipe); + error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], + segsize, uio); + PIPE_LOCK(rpipe); + + if (error == 0 && segsize < size) { + KASSERT(wpipe->pipe_buffer.in + segsize == + wpipe->pipe_buffer.size, + ("Pipe buffer wraparound disappeared")); + + /* + * Transfer remaining part now, to + * support atomic writes. Wraparound + * happened. + */ + PIPE_UNLOCK(rpipe); + error = uiomove( + &wpipe->pipe_buffer.buffer[0], + size - segsize, uio); + PIPE_LOCK(rpipe); + } + if (error == 0) { + wpipe->pipe_buffer.in += size; + if (wpipe->pipe_buffer.in >= + wpipe->pipe_buffer.size) { + KASSERT(wpipe->pipe_buffer.in == + size - segsize + + wpipe->pipe_buffer.size, + ("Expected wraparound bad")); + wpipe->pipe_buffer.in = size - segsize; + } + + wpipe->pipe_buffer.cnt += size; + KASSERT(wpipe->pipe_buffer.cnt <= + wpipe->pipe_buffer.size, + ("Pipe buffer overflow")); + } + pipeunlock(wpipe); + if (error != 0) + break; + } else { + + /* + * If the "read-side" has been blocked, wake it up now. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + /* + * don't block on non-blocking I/O + */ + if (f_flags & FNONBLOCK) { + error = EAGAIN; + pipeunlock(wpipe); + break; + } + + /* + * We have no more space and have something to offer, + * wake up select/poll. + */ + pipeselwakeup(wpipe); + + wpipe->pipe_state |= PIPE_WANTW; + pipeunlock(wpipe); + error = msleep(wpipe, PIPE_MTX(rpipe), + PRIBIO | PCATCH, "pipewr", 0); + if (error != 0) + break; + } + } + + pipelock(wpipe, 0); + --wpipe->pipe_busy; + + if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); + wakeup(wpipe); + } else if (wpipe->pipe_buffer.cnt > 0) { + /* + * If we have put any characters in the buffer, we wake up + * the reader. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + } + + /* + * Don't return EPIPE if I/O was successful + */ + if ((wpipe->pipe_buffer.cnt == 0) && + (uio->uio_resid == 0) && + (error == EPIPE)) { + error = 0; + } + + if (error == 0) + vfs_timestamp(&wpipe->pipe_mtime); + + /* + * We have something to offer, + * wake up select/poll. + */ + if (wpipe->pipe_buffer.cnt) + pipeselwakeup(wpipe); + + pipeunlock(wpipe); + PIPE_UNLOCK(rpipe); + return (error); +} + +int +pipe_truncate(struct pipe *pipe, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +/* + * we implement a very minimal set of ioctls for compatibility with sockets. + */ +int +pipe_ioctl(struct pipe *pipe, u_long cmd, void *data, struct ucred *active_cred, + struct thread *td) +{ + struct pipe *mpipe = pipe; + int error; + + PIPE_LOCK(mpipe); + +#ifdef MAC + error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); + if (error) { + PIPE_UNLOCK(mpipe); + return (error); + } +#endif + + error = 0; + switch (cmd) { + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *)data) { + mpipe->pipe_state |= PIPE_ASYNC; + } else { + mpipe->pipe_state &= ~PIPE_ASYNC; + } + break; + + case FIONREAD: + if (mpipe->pipe_state & PIPE_DIRECTW) + *(int *)data = mpipe->pipe_map.cnt; + else + *(int *)data = mpipe->pipe_buffer.cnt; + break; + + case FIOSETOWN: + PIPE_UNLOCK(mpipe); + error = fsetown(*(int *)data, &mpipe->pipe_sigio); + goto out_unlocked; + + case FIOGETOWN: + *(int *)data = fgetown(&mpipe->pipe_sigio); + break; + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + PIPE_UNLOCK(mpipe); + error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); + goto out_unlocked; + + /* This is deprecated, FIOGETOWN should be used instead. */ + case TIOCGPGRP: + *(int *)data = -fgetown(&mpipe->pipe_sigio); + break; + + default: + error = ENOTTY; + break; + } + PIPE_UNLOCK(mpipe); +out_unlocked: + return (error); +} + +int +pipe_poll(struct pipe *pipe, int events, struct ucred *active_cred, + struct thread *td) +{ + struct pipe *rpipe = pipe; + struct pipe *wpipe; + int revents = 0; +#ifdef MAC + int error; +#endif + + wpipe = rpipe->pipe_peer; + PIPE_LOCK(rpipe); +#ifdef MAC + error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); + if (error) + goto locked_error; +#endif + if (events & (POLLIN | POLLRDNORM)) + if ((rpipe->pipe_state & PIPE_DIRECTW) || + (rpipe->pipe_buffer.cnt > 0)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (wpipe->pipe_present != PIPE_ACTIVE || + (wpipe->pipe_state & PIPE_EOF) || + (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) + revents |= events & (POLLOUT | POLLWRNORM); + + if ((events & POLLINIGNEOF) == 0) { + if (rpipe->pipe_state & PIPE_EOF) { + revents |= (events & (POLLIN | POLLRDNORM)); + if (wpipe->pipe_present != PIPE_ACTIVE || + (wpipe->pipe_state & PIPE_EOF)) + revents |= POLLHUP; + } + } + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) { + selrecord(td, &rpipe->pipe_sel); + if (SEL_WAITING(&rpipe->pipe_sel)) + rpipe->pipe_state |= PIPE_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &wpipe->pipe_sel); + if (SEL_WAITING(&wpipe->pipe_sel)) + wpipe->pipe_state |= PIPE_SEL; + } + } +#ifdef MAC +locked_error: +#endif + PIPE_UNLOCK(rpipe); + + return (revents); +} + +/* + * We shouldn't need locks here as we're doing a read and this should + * be a natural race. + */ +int +pipe_stat(struct pipe *pipe, struct stat *ub, struct ucred *active_cred, + struct thread *td) +{ +#ifdef MAC + int error; + + PIPE_LOCK(pipe); + error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); + PIPE_UNLOCK(pipe); + if (error) + return (error); +#endif + bzero(ub, sizeof(*ub)); + ub->st_mode = S_IFIFO; + ub->st_blksize = PAGE_SIZE; + if (pipe->pipe_state & PIPE_DIRECTW) + ub->st_size = pipe->pipe_map.cnt; + else + ub->st_size = pipe->pipe_buffer.cnt; + ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; + ub->st_atim = pipe->pipe_atime; + ub->st_mtim = pipe->pipe_mtime; + ub->st_ctim = pipe->pipe_ctime; + ub->st_dev = pipedev_ino; + ub->st_ino = pipe->pipe_ino; + /* + * Left as 0: st_nlink, st_rdev, st_flags, st_gen. + */ + return (0); +} + +static void +pipe_free_kmem(cpipe) + struct pipe *cpipe; +{ + + KASSERT(!mtx_owned(PIPE_MTX(cpipe)), + ("pipe_free_kmem: pipe mutex locked")); + + if (cpipe->pipe_buffer.buffer != NULL) { + atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); + vm_map_remove(pipe_map, + (vm_offset_t)cpipe->pipe_buffer.buffer, + (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); + cpipe->pipe_buffer.buffer = NULL; + } +#ifndef PIPE_NODIRECT + { + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + } +#endif +} + +/* + * shutdown the pipe + */ +void +pipe_close(struct pipe *cpipe) +{ + struct pipepair *pp; + struct pipe *ppipe; + ino_t ino; + + KASSERT(cpipe != NULL, ("pipe_close: cpipe == NULL")); + + funsetown(&cpipe->pipe_sigio); + + PIPE_LOCK(cpipe); + pipelock(cpipe, 0); + pp = cpipe->pipe_pair; + + pipeselwakeup(cpipe); + + /* + * If the other side is blocked, wake it up saying that + * we want to close it down. + */ + cpipe->pipe_state |= PIPE_EOF; + while (cpipe->pipe_busy) { + wakeup(cpipe); + cpipe->pipe_state |= PIPE_WANT; + pipeunlock(cpipe); + msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); + pipelock(cpipe, 0); + } + + /* + * Disconnect from peer, if any. + */ + ppipe = cpipe->pipe_peer; + if (ppipe->pipe_present == PIPE_ACTIVE) { + pipeselwakeup(ppipe); + + ppipe->pipe_state |= PIPE_EOF; + wakeup(ppipe); + KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); + } + + /* + * Mark this endpoint as free. Release kmem resources. We + * don't mark this endpoint as unused until we've finished + * doing that, or the pipe might disappear out from under + * us. + */ + PIPE_UNLOCK(cpipe); + pipe_free_kmem(cpipe); + PIPE_LOCK(cpipe); + cpipe->pipe_present = PIPE_CLOSING; + pipeunlock(cpipe); + + /* + * knlist_clear() may sleep dropping the PIPE_MTX. Set the + * PIPE_FINALIZED, that allows other end to free the + * pipe_pair, only after the knotes are completely dismantled. + */ + knlist_clear(&cpipe->pipe_sel.si_note, 1); + cpipe->pipe_present = PIPE_FINALIZED; + seldrain(&cpipe->pipe_sel); + knlist_destroy(&cpipe->pipe_sel.si_note); + + /* + * Postpone the destroy of the fake inode number allocated for + * our end, until pipe mtx is unlocked. + */ + ino = cpipe->pipe_ino; + + /* + * If both endpoints are now closed, release the memory for the + * pipe pair. If not, unlock. + */ + if (ppipe->pipe_present == PIPE_FINALIZED) { + PIPE_UNLOCK(cpipe); +#ifdef MAC + mac_pipe_destroy(pp); +#endif + uma_zfree(pipe_zone, cpipe->pipe_pair); + } else + PIPE_UNLOCK(cpipe); + + if (ino > 0) + free_unr(pipeino_unr, cpipe->pipe_ino); +} + +int +pipe_kqfilter(struct pipe *cpipe, struct knote *kn) +{ + + PIPE_LOCK(cpipe); + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &pipe_rfiltops; + break; + case EVFILT_WRITE: + kn->kn_fop = &pipe_wfiltops; + if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { + /* other end of pipe has been closed */ + PIPE_UNLOCK(cpipe); + return (EPIPE); + } + cpipe = cpipe->pipe_peer; + break; + default: + PIPE_UNLOCK(cpipe); + return (EINVAL); + } + + knlist_add(&cpipe->pipe_sel.si_note, kn, 1); + PIPE_UNLOCK(cpipe); + return (0); +} + +static void +filt_pipedetach(struct knote *kn) +{ + struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; + + PIPE_LOCK(cpipe); + if (kn->kn_filter == EVFILT_WRITE) + cpipe = cpipe->pipe_peer; + knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); + PIPE_UNLOCK(cpipe); +} + +/*ARGSUSED*/ +static int +filt_piperead(struct knote *kn, long hint) +{ + struct pipe *rpipe = kn->kn_fp->f_data; + struct pipe *wpipe = rpipe->pipe_peer; + int ret; + + PIPE_LOCK(rpipe); + kn->kn_data = rpipe->pipe_buffer.cnt; + if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) + kn->kn_data = rpipe->pipe_map.cnt; + + if ((rpipe->pipe_state & PIPE_EOF) || + wpipe->pipe_present != PIPE_ACTIVE || + (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_flags |= EV_EOF; + PIPE_UNLOCK(rpipe); + return (1); + } + ret = kn->kn_data > 0; + PIPE_UNLOCK(rpipe); + return ret; +} + +/*ARGSUSED*/ +static int +filt_pipewrite(struct knote *kn, long hint) +{ + struct pipe *rpipe = kn->kn_fp->f_data; + struct pipe *wpipe = rpipe->pipe_peer; + + PIPE_LOCK(rpipe); + if (wpipe->pipe_present != PIPE_ACTIVE || + (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_data = 0; + kn->kn_flags |= EV_EOF; + PIPE_UNLOCK(rpipe); + return (1); + } + kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + if (wpipe->pipe_state & PIPE_DIRECTW) + kn->kn_data = 0; + + PIPE_UNLOCK(rpipe); + return (kn->kn_data >= PIPE_BUF); +} Index: sys/kern/sys_pipe.c =================================================================== --- sys/kern/sys_pipe.c (revision 226066) +++ sys/kern/sys_pipe.c (working copy) @@ -1,91 +1,28 @@ /*- - * Copyright (c) 1996 John S. Dyson + * Copyright (c) 2009 Zhao Shuai + * Google Summer of Code project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice immediately at the beginning of the file, without modification, - * this list of conditions, and the following disclaimer. + * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. Absolutely no warranty of function or purpose is made by the author - * John S. Dyson. - * 4. Modifications may be freely made to this file if the above conditions - * are met. - */ - -/* - * This file contains a high-performance replacement for the socket-based - * pipes scheme originally used in FreeBSD/4.4Lite. It does not support - * all features of sockets, but does do everything that pipes normally - * do. - */ - -/* - * This code has two modes of operation, a small write mode and a large - * write mode. The small write mode acts like conventional pipes with - * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the - * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT - * and PIPE_SIZE in size, the sending process pins the underlying pages in - * memory, and the receiving process copies directly from these pinned pages - * in the sending process. * - * If the sending process receives a signal, it is possible that it will - * go away, and certainly its address space can change, because control - * is returned back to the user-mode side. In that case, the pipe code - * arranges to copy the buffer supplied by the user process, to a pageable - * kernel buffer, and the receiving process will grab the data from the - * pageable kernel buffer. Since signals don't happen all that often, - * the copy operation is normally eliminated. - * - * The constant PIPE_MINDIRECT is chosen to make sure that buffering will - * happen for small transfers so that the system will not spend all of - * its time context switching. - * - * In order to limit the resource use of pipes, two sysctls exist: - * - * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable - * address space available to us in pipe_map. This value is normally - * autotuned, but may also be loader tuned. - * - * kern.ipc.pipekva - This read-only sysctl tracks the current amount of - * memory in use by pipes. - * - * Based on how large pipekva is relative to maxpipekva, the following - * will happen: - * - * 0% - 50%: - * New pipes are given 16K of memory backing, pipes may dynamically - * grow to as large as 64K where needed. - * 50% - 75%: - * New pipes are given 4K (or PAGE_SIZE) of memory backing, - * existing pipes may NOT grow. - * 75% - 100%: - * New pipes are given 4K (or PAGE_SIZE) of memory backing, - * existing pipes will be shrunk down to 4K whenever possible. - * - * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If - * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE - * resize which MUST occur for reverse-direction pipes when they are - * first used. - * - * Additional information about the current state of pipes may be obtained - * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, - * and kern.ipc.piperesizefail. - * - * Locking rules: There are two locks present here: A mutex, used via - * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via - * the flag, as mutexes can not persist over uiomove. The mutex - * exists only to guard access to the flag, and is not in itself a - * locking mechanism. Also note that there is only a single mutex for - * both directions of a pipe. - * - * As pipelock() may have to sleep before it can acquire the flag, it - * is important to reread all data after a call to pipelock(); everything - * in the structure may have changed. + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include @@ -95,20 +32,13 @@ #include #include #include -#include #include #include #include -#include -#include -#include #include #include -#include #include -#include #include -#include #include #include #include @@ -116,253 +46,55 @@ #include #include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - /* - * Use this define if you want to disable *fancy* VM things. Expect an - * approx 30% decrease in transfer rate. This could be useful for - * NetBSD or OpenBSD. - */ -/* #define PIPE_NODIRECT */ - -/* * interfaces to the outside world */ -static fo_rdwr_t pipe_read; -static fo_rdwr_t pipe_write; -static fo_truncate_t pipe_truncate; -static fo_ioctl_t pipe_ioctl; -static fo_poll_t pipe_poll; -static fo_kqfilter_t pipe_kqfilter; -static fo_stat_t pipe_stat; -static fo_close_t pipe_close; +static fo_rdwr_t pipe_read_f; +static fo_rdwr_t pipe_write_f; +static fo_truncate_t pipe_truncate_f; +static fo_ioctl_t pipe_ioctl_f; +static fo_poll_t pipe_poll_f; +static fo_kqfilter_t pipe_kqfilter_f; +static fo_stat_t pipe_stat_f; +static fo_close_t pipe_close_f; static struct fileops pipeops = { - .fo_read = pipe_read, - .fo_write = pipe_write, - .fo_truncate = pipe_truncate, - .fo_ioctl = pipe_ioctl, - .fo_poll = pipe_poll, - .fo_kqfilter = pipe_kqfilter, - .fo_stat = pipe_stat, - .fo_close = pipe_close, + .fo_read = pipe_read_f, + .fo_write = pipe_write_f, + .fo_truncate = pipe_truncate_f, + .fo_ioctl = pipe_ioctl_f, + .fo_poll = pipe_poll_f, + .fo_kqfilter = pipe_kqfilter_f, + .fo_stat = pipe_stat_f, + .fo_close = pipe_close_f, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, - .fo_flags = DFLAG_PASSABLE + .fo_flags = DFLAG_PASSABLE }; -static void filt_pipedetach(struct knote *kn); -static int filt_piperead(struct knote *kn, long hint); -static int filt_pipewrite(struct knote *kn, long hint); - -static struct filterops pipe_rfiltops = { - .f_isfd = 1, - .f_detach = filt_pipedetach, - .f_event = filt_piperead -}; -static struct filterops pipe_wfiltops = { - .f_isfd = 1, - .f_detach = filt_pipedetach, - .f_event = filt_pipewrite -}; - /* - * Default pipe buffer size(s), this can be kind-of large now because pipe - * space is pageable. The pipe code will try to maintain locality of - * reference for performance reasons, so small amounts of outstanding I/O - * will not wipe the cache. - */ -#define MINPIPESIZE (PIPE_SIZE/3) -#define MAXPIPESIZE (2*PIPE_SIZE/3) - -static long amountpipekva; -static int pipefragretry; -static int pipeallocfail; -static int piperesizefail; -static int piperesizeallowed = 1; - -SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN, - &maxpipekva, 0, "Pipe KVA limit"); -SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, - &amountpipekva, 0, "Pipe KVA usage"); -SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, - &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); -SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, - &pipeallocfail, 0, "Pipe allocation failures"); -SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, - &piperesizefail, 0, "Pipe resize failures"); -SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, - &piperesizeallowed, 0, "Pipe resizing allowed"); - -static void pipeinit(void *dummy __unused); -static void pipeclose(struct pipe *cpipe); -static void pipe_free_kmem(struct pipe *cpipe); -static int pipe_create(struct pipe *pipe, int backing); -static __inline int pipelock(struct pipe *cpipe, int catch); -static __inline void pipeunlock(struct pipe *cpipe); -static __inline void pipeselwakeup(struct pipe *cpipe); -#ifndef PIPE_NODIRECT -static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); -static void pipe_destroy_write_buffer(struct pipe *wpipe); -static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); -static void pipe_clone_write_buffer(struct pipe *wpipe); -#endif -static int pipespace(struct pipe *cpipe, int size); -static int pipespace_new(struct pipe *cpipe, int size); - -static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); -static int pipe_zone_init(void *mem, int size, int flags); -static void pipe_zone_fini(void *mem, int size); - -static uma_zone_t pipe_zone; -static struct unrhdr *pipeino_unr; -static dev_t pipedev_ino; - -SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); - -static void -pipeinit(void *dummy __unused) -{ - - pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), - pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, - UMA_ALIGN_PTR, 0); - KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); - pipeino_unr = new_unrhdr(1, INT32_MAX, NULL); - KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized")); - pipedev_ino = devfs_alloc_cdp_inode(); - KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); -} - -static int -pipe_zone_ctor(void *mem, int size, void *arg, int flags) -{ - struct pipepair *pp; - struct pipe *rpipe, *wpipe; - - KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); - - pp = (struct pipepair *)mem; - - /* - * We zero both pipe endpoints to make sure all the kmem pointers - * are NULL, flag fields are zero'd, etc. We timestamp both - * endpoints with the same time. - */ - rpipe = &pp->pp_rpipe; - bzero(rpipe, sizeof(*rpipe)); - vfs_timestamp(&rpipe->pipe_ctime); - rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; - - wpipe = &pp->pp_wpipe; - bzero(wpipe, sizeof(*wpipe)); - wpipe->pipe_ctime = rpipe->pipe_ctime; - wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; - - rpipe->pipe_peer = wpipe; - rpipe->pipe_pair = pp; - wpipe->pipe_peer = rpipe; - wpipe->pipe_pair = pp; - - /* - * Mark both endpoints as present; they will later get free'd - * one at a time. When both are free'd, then the whole pair - * is released. - */ - rpipe->pipe_present = PIPE_ACTIVE; - wpipe->pipe_present = PIPE_ACTIVE; - - /* - * Eventually, the MAC Framework may initialize the label - * in ctor or init, but for now we do it elswhere to avoid - * blocking in ctor or init. - */ - pp->pp_label = NULL; - - return (0); -} - -static int -pipe_zone_init(void *mem, int size, int flags) -{ - struct pipepair *pp; - - KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); - - pp = (struct pipepair *)mem; - - mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE); - return (0); -} - -static void -pipe_zone_fini(void *mem, int size) -{ - struct pipepair *pp; - - KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); - - pp = (struct pipepair *)mem; - - mtx_destroy(&pp->pp_mtx); -} - -/* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let - * the zone pick up the pieces via pipeclose(). + * the zone pick up the pieces via pipe_close(). */ int kern_pipe(struct thread *td, int fildes[2]) { struct filedesc *fdp = td->td_proc->p_fd; struct file *rf, *wf; - struct pipepair *pp; struct pipe *rpipe, *wpipe; int fd, error; - pp = uma_zalloc(pipe_zone, M_WAITOK); -#ifdef MAC - /* - * The MAC label is shared between the connected endpoints. As a - * result mac_pipe_init() and mac_pipe_create() are called once - * for the pair, and not on the endpoints. - */ - mac_pipe_init(pp); - mac_pipe_create(td->td_ucred, pp); -#endif - rpipe = &pp->pp_rpipe; - wpipe = &pp->pp_wpipe; - - knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); - knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); - - /* Only the forward direction pipe is backed by default */ - if ((error = pipe_create(rpipe, 1)) != 0 || - (error = pipe_create(wpipe, 0)) != 0) { - pipeclose(rpipe); - pipeclose(wpipe); + error = pipepair_create(td, &rpipe, &wpipe); + if (error) return (error); - } - rpipe->pipe_state |= PIPE_DIRECTOK; - wpipe->pipe_state |= PIPE_DIRECTOK; - error = falloc(td, &rf, &fd, 0); if (error) { - pipeclose(rpipe); - pipeclose(wpipe); + pipe_close(rpipe); + pipe_close(wpipe); return (error); } + /* An extra reference on `rf' has been held for us by falloc(). */ fildes[0] = fd; @@ -377,10 +109,12 @@ if (error) { fdclose(fdp, rf, fildes[0], td); fdrop(rf, td); + /* rpipe has been closed by fdrop(). */ - pipeclose(wpipe); + pipe_close(wpipe); return (error); } + /* An extra reference on `wf' has been held for us by falloc(). */ finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); @@ -407,1244 +141,87 @@ return (0); } -/* - * Allocate kva for pipe circular buffer, the space is pageable - * This routine will 'realloc' the size of a pipe safely, if it fails - * it will retain the old buffer. - * If it fails it will return ENOMEM. - */ static int -pipespace_new(cpipe, size) - struct pipe *cpipe; - int size; +pipe_read_f(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) { - caddr_t buffer; - int error, cnt, firstseg; - static int curfail = 0; - static struct timeval lastfail; - KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); - KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), - ("pipespace: resize of direct writes not allowed")); -retry: - cnt = cpipe->pipe_buffer.cnt; - if (cnt > size) - size = cnt; + struct pipe *pipe = fp->f_data; - size = round_page(size); - buffer = (caddr_t) vm_map_min(pipe_map); - - error = vm_map_find(pipe_map, NULL, 0, - (vm_offset_t *) &buffer, size, 1, - VM_PROT_ALL, VM_PROT_ALL, 0); - if (error != KERN_SUCCESS) { - if ((cpipe->pipe_buffer.buffer == NULL) && - (size > SMALL_PIPE_SIZE)) { - size = SMALL_PIPE_SIZE; - pipefragretry++; - goto retry; - } - if (cpipe->pipe_buffer.buffer == NULL) { - pipeallocfail++; - if (ppsratecheck(&lastfail, &curfail, 1)) - printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); - } else { - piperesizefail++; - } - return (ENOMEM); - } - - /* copy data, then free old resources if we're resizing */ - if (cnt > 0) { - if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { - firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; - bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], - buffer, firstseg); - if ((cnt - firstseg) > 0) - bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], - cpipe->pipe_buffer.in); - } else { - bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], - buffer, cnt); - } - } - pipe_free_kmem(cpipe); - cpipe->pipe_buffer.buffer = buffer; - cpipe->pipe_buffer.size = size; - cpipe->pipe_buffer.in = cnt; - cpipe->pipe_buffer.out = 0; - cpipe->pipe_buffer.cnt = cnt; - atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); - return (0); + /* The 4th arguments of pipe_read is file flag */ + return pipe_read(pipe, uio, active_cred, fp->f_flag, td); } -/* - * Wrapper for pipespace_new() that performs locking assertions. - */ static int -pipespace(cpipe, size) - struct pipe *cpipe; - int size; +pipe_write_f(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { + struct pipe *pipe = fp->f_data; - KASSERT(cpipe->pipe_state & PIPE_LOCKFL, - ("Unlocked pipe passed to pipespace")); - return (pipespace_new(cpipe, size)); + /* The 4th arguments of pipe_write is file flag */ + return pipe_write(pipe, uio, active_cred, fp->f_flag, td); } -/* - * lock a pipe for I/O, blocking other access - */ -static __inline int -pipelock(cpipe, catch) - struct pipe *cpipe; - int catch; -{ - int error; - - PIPE_LOCK_ASSERT(cpipe, MA_OWNED); - while (cpipe->pipe_state & PIPE_LOCKFL) { - cpipe->pipe_state |= PIPE_LWANT; - error = msleep(cpipe, PIPE_MTX(cpipe), - catch ? (PRIBIO | PCATCH) : PRIBIO, - "pipelk", 0); - if (error != 0) - return (error); - } - cpipe->pipe_state |= PIPE_LOCKFL; - return (0); -} - -/* - * unlock a pipe I/O lock - */ -static __inline void -pipeunlock(cpipe) - struct pipe *cpipe; -{ - - PIPE_LOCK_ASSERT(cpipe, MA_OWNED); - KASSERT(cpipe->pipe_state & PIPE_LOCKFL, - ("Unlocked pipe passed to pipeunlock")); - cpipe->pipe_state &= ~PIPE_LOCKFL; - if (cpipe->pipe_state & PIPE_LWANT) { - cpipe->pipe_state &= ~PIPE_LWANT; - wakeup(cpipe); - } -} - -static __inline void -pipeselwakeup(cpipe) - struct pipe *cpipe; -{ - - PIPE_LOCK_ASSERT(cpipe, MA_OWNED); - if (cpipe->pipe_state & PIPE_SEL) { - selwakeuppri(&cpipe->pipe_sel, PSOCK); - if (!SEL_WAITING(&cpipe->pipe_sel)) - cpipe->pipe_state &= ~PIPE_SEL; - } - if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) - pgsigio(&cpipe->pipe_sigio, SIGIO, 0); - KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); -} - -/* - * Initialize and allocate VM and memory for pipe. The structure - * will start out zero'd from the ctor, so we just manage the kmem. - */ static int -pipe_create(pipe, backing) - struct pipe *pipe; - int backing; +pipe_truncate_f(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { - int error; - if (backing) { - if (amountpipekva > maxpipekva / 2) - error = pipespace_new(pipe, SMALL_PIPE_SIZE); - else - error = pipespace_new(pipe, PIPE_SIZE); - } else { - /* If we're not backing this pipe, no need to do anything. */ - error = 0; - } - if (error == 0) { - pipe->pipe_ino = alloc_unr(pipeino_unr); - if (pipe->pipe_ino == -1) - /* pipeclose will clear allocated kva */ - error = ENOMEM; - } - return (error); -} + struct pipe *pipe = fp->f_data; -/* ARGSUSED */ -static int -pipe_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; -{ - struct pipe *rpipe = fp->f_data; - int error; - int nread = 0; - u_int size; - - PIPE_LOCK(rpipe); - ++rpipe->pipe_busy; - error = pipelock(rpipe, 1); - if (error) - goto unlocked_error; - -#ifdef MAC - error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); - if (error) - goto locked_error; -#endif - if (amountpipekva > (3 * maxpipekva) / 4) { - if (!(rpipe->pipe_state & PIPE_DIRECTW) && - (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && - (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && - (piperesizeallowed == 1)) { - PIPE_UNLOCK(rpipe); - pipespace(rpipe, SMALL_PIPE_SIZE); - PIPE_LOCK(rpipe); - } - } - - while (uio->uio_resid) { - /* - * normal pipe buffer receive - */ - if (rpipe->pipe_buffer.cnt > 0) { - size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; - if (size > rpipe->pipe_buffer.cnt) - size = rpipe->pipe_buffer.cnt; - if (size > (u_int) uio->uio_resid) - size = (u_int) uio->uio_resid; - - PIPE_UNLOCK(rpipe); - error = uiomove( - &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], - size, uio); - PIPE_LOCK(rpipe); - if (error) - break; - - rpipe->pipe_buffer.out += size; - if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) - rpipe->pipe_buffer.out = 0; - - rpipe->pipe_buffer.cnt -= size; - - /* - * If there is no more to read in the pipe, reset - * its pointers to the beginning. This improves - * cache hit stats. - */ - if (rpipe->pipe_buffer.cnt == 0) { - rpipe->pipe_buffer.in = 0; - rpipe->pipe_buffer.out = 0; - } - nread += size; -#ifndef PIPE_NODIRECT - /* - * Direct copy, bypassing a kernel buffer. - */ - } else if ((size = rpipe->pipe_map.cnt) && - (rpipe->pipe_state & PIPE_DIRECTW)) { - if (size > (u_int) uio->uio_resid) - size = (u_int) uio->uio_resid; - - PIPE_UNLOCK(rpipe); - error = uiomove_fromphys(rpipe->pipe_map.ms, - rpipe->pipe_map.pos, size, uio); - PIPE_LOCK(rpipe); - if (error) - break; - nread += size; - rpipe->pipe_map.pos += size; - rpipe->pipe_map.cnt -= size; - if (rpipe->pipe_map.cnt == 0) { - rpipe->pipe_state &= ~PIPE_DIRECTW; - wakeup(rpipe); - } -#endif - } else { - /* - * detect EOF condition - * read returns 0 on EOF, no need to set error - */ - if (rpipe->pipe_state & PIPE_EOF) - break; - - /* - * If the "write-side" has been blocked, wake it up now. - */ - if (rpipe->pipe_state & PIPE_WANTW) { - rpipe->pipe_state &= ~PIPE_WANTW; - wakeup(rpipe); - } - - /* - * Break if some data was read. - */ - if (nread > 0) - break; - - /* - * Unlock the pipe buffer for our remaining processing. - * We will either break out with an error or we will - * sleep and relock to loop. - */ - pipeunlock(rpipe); - - /* - * Handle non-blocking mode operation or - * wait for more data. - */ - if (fp->f_flag & FNONBLOCK) { - error = EAGAIN; - } else { - rpipe->pipe_state |= PIPE_WANTR; - if ((error = msleep(rpipe, PIPE_MTX(rpipe), - PRIBIO | PCATCH, - "piperd", 0)) == 0) - error = pipelock(rpipe, 1); - } - if (error) - goto unlocked_error; - } - } -#ifdef MAC -locked_error: -#endif - pipeunlock(rpipe); - - /* XXX: should probably do this before getting any locks. */ - if (error == 0) - vfs_timestamp(&rpipe->pipe_atime); -unlocked_error: - --rpipe->pipe_busy; - - /* - * PIPE_WANT processing only makes sense if pipe_busy is 0. - */ - if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { - rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); - wakeup(rpipe); - } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { - /* - * Handle write blocking hysteresis. - */ - if (rpipe->pipe_state & PIPE_WANTW) { - rpipe->pipe_state &= ~PIPE_WANTW; - wakeup(rpipe); - } - } - - if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) - pipeselwakeup(rpipe); - - PIPE_UNLOCK(rpipe); - return (error); + return pipe_truncate(pipe, length, active_cred, td); } -#ifndef PIPE_NODIRECT -/* - * Map the sending processes' buffer into kernel space and wire it. - * This is similar to a physical write operation. - */ static int -pipe_build_write_buffer(wpipe, uio) - struct pipe *wpipe; - struct uio *uio; +pipe_ioctl_f(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, + struct thread *td) { - u_int size; - int i; + + struct pipe *pipe = fp->f_data; - PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); - KASSERT(wpipe->pipe_state & PIPE_DIRECTW, - ("Clone attempt on non-direct write pipe!")); - - size = (u_int) uio->uio_iov->iov_len; - if (size > wpipe->pipe_buffer.size) - size = wpipe->pipe_buffer.size; - - if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, - (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, - wpipe->pipe_map.ms, PIPENPAGES)) < 0) - return (EFAULT); - -/* - * set up the control block - */ - wpipe->pipe_map.npages = i; - wpipe->pipe_map.pos = - ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; - wpipe->pipe_map.cnt = size; - -/* - * and update the uio data - */ - - uio->uio_iov->iov_len -= size; - uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; - if (uio->uio_iov->iov_len == 0) - uio->uio_iov++; - uio->uio_resid -= size; - uio->uio_offset += size; - return (0); + return pipe_ioctl(pipe, cmd, data, active_cred, td); } -/* - * unmap and unwire the process buffer - */ -static void -pipe_destroy_write_buffer(wpipe) - struct pipe *wpipe; -{ - - PIPE_LOCK_ASSERT(wpipe, MA_OWNED); - vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); - wpipe->pipe_map.npages = 0; -} - -/* - * In the case of a signal, the writing process might go away. This - * code copies the data into the circular buffer so that the source - * pages can be freed without loss of data. - */ -static void -pipe_clone_write_buffer(wpipe) - struct pipe *wpipe; -{ - struct uio uio; - struct iovec iov; - int size; - int pos; - - PIPE_LOCK_ASSERT(wpipe, MA_OWNED); - size = wpipe->pipe_map.cnt; - pos = wpipe->pipe_map.pos; - - wpipe->pipe_buffer.in = size; - wpipe->pipe_buffer.out = 0; - wpipe->pipe_buffer.cnt = size; - wpipe->pipe_state &= ~PIPE_DIRECTW; - - PIPE_UNLOCK(wpipe); - iov.iov_base = wpipe->pipe_buffer.buffer; - iov.iov_len = size; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = 0; - uio.uio_resid = size; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_READ; - uio.uio_td = curthread; - uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); - PIPE_LOCK(wpipe); - pipe_destroy_write_buffer(wpipe); -} - -/* - * This implements the pipe buffer write mechanism. Note that only - * a direct write OR a normal pipe write can be pending at any given time. - * If there are any characters in the pipe buffer, the direct write will - * be deferred until the receiving process grabs all of the bytes from - * the pipe buffer. Then the direct mapping write is set-up. - */ static int -pipe_direct_write(wpipe, uio) - struct pipe *wpipe; - struct uio *uio; +pipe_poll_f(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { - int error; -retry: - PIPE_LOCK_ASSERT(wpipe, MA_OWNED); - error = pipelock(wpipe, 1); - if (wpipe->pipe_state & PIPE_EOF) - error = EPIPE; - if (error) { - pipeunlock(wpipe); - goto error1; - } - while (wpipe->pipe_state & PIPE_DIRECTW) { - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - pipeselwakeup(wpipe); - wpipe->pipe_state |= PIPE_WANTW; - pipeunlock(wpipe); - error = msleep(wpipe, PIPE_MTX(wpipe), - PRIBIO | PCATCH, "pipdww", 0); - if (error) - goto error1; - else - goto retry; - } - wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ - if (wpipe->pipe_buffer.cnt > 0) { - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - pipeselwakeup(wpipe); - wpipe->pipe_state |= PIPE_WANTW; - pipeunlock(wpipe); - error = msleep(wpipe, PIPE_MTX(wpipe), - PRIBIO | PCATCH, "pipdwc", 0); - if (error) - goto error1; - else - goto retry; - } + struct pipe *pipe = fp->f_data; - wpipe->pipe_state |= PIPE_DIRECTW; - - PIPE_UNLOCK(wpipe); - error = pipe_build_write_buffer(wpipe, uio); - PIPE_LOCK(wpipe); - if (error) { - wpipe->pipe_state &= ~PIPE_DIRECTW; - pipeunlock(wpipe); - goto error1; - } - - error = 0; - while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { - if (wpipe->pipe_state & PIPE_EOF) { - pipe_destroy_write_buffer(wpipe); - pipeselwakeup(wpipe); - pipeunlock(wpipe); - error = EPIPE; - goto error1; - } - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - pipeselwakeup(wpipe); - pipeunlock(wpipe); - error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, - "pipdwt", 0); - pipelock(wpipe, 0); - } - - if (wpipe->pipe_state & PIPE_EOF) - error = EPIPE; - if (wpipe->pipe_state & PIPE_DIRECTW) { - /* - * this bit of trickery substitutes a kernel buffer for - * the process that might be going away. - */ - pipe_clone_write_buffer(wpipe); - } else { - pipe_destroy_write_buffer(wpipe); - } - pipeunlock(wpipe); - return (error); - -error1: - wakeup(wpipe); - return (error); + return pipe_poll(pipe, events, active_cred, td); } -#endif static int -pipe_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +pipe_stat_f(struct file *fp, struct stat *ub, struct ucred *active_cred, struct thread *td) { - int error = 0; - int desiredsize, orig_resid; - struct pipe *wpipe, *rpipe; - - rpipe = fp->f_data; - wpipe = rpipe->pipe_peer; - - PIPE_LOCK(rpipe); - error = pipelock(wpipe, 1); - if (error) { - PIPE_UNLOCK(rpipe); - return (error); - } - /* - * detect loss of pipe read side, issue SIGPIPE if lost. - */ - if (wpipe->pipe_present != PIPE_ACTIVE || - (wpipe->pipe_state & PIPE_EOF)) { - pipeunlock(wpipe); - PIPE_UNLOCK(rpipe); - return (EPIPE); - } -#ifdef MAC - error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); - if (error) { - pipeunlock(wpipe); - PIPE_UNLOCK(rpipe); - return (error); - } -#endif - ++wpipe->pipe_busy; - - /* Choose a larger size if it's advantageous */ - desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); - while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { - if (piperesizeallowed != 1) - break; - if (amountpipekva > maxpipekva / 2) - break; - if (desiredsize == BIG_PIPE_SIZE) - break; - desiredsize = desiredsize * 2; - } - - /* Choose a smaller size if we're in a OOM situation */ - if ((amountpipekva > (3 * maxpipekva) / 4) && - (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && - (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && - (piperesizeallowed == 1)) - desiredsize = SMALL_PIPE_SIZE; - - /* Resize if the above determined that a new size was necessary */ - if ((desiredsize != wpipe->pipe_buffer.size) && - ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { - PIPE_UNLOCK(wpipe); - pipespace(wpipe, desiredsize); - PIPE_LOCK(wpipe); - } - if (wpipe->pipe_buffer.size == 0) { - /* - * This can only happen for reverse direction use of pipes - * in a complete OOM situation. - */ - error = ENOMEM; - --wpipe->pipe_busy; - pipeunlock(wpipe); - PIPE_UNLOCK(wpipe); - return (error); - } - - pipeunlock(wpipe); - - orig_resid = uio->uio_resid; - - while (uio->uio_resid) { - int space; - - pipelock(wpipe, 0); - if (wpipe->pipe_state & PIPE_EOF) { - pipeunlock(wpipe); - error = EPIPE; - break; - } -#ifndef PIPE_NODIRECT - /* - * If the transfer is large, we can gain performance if - * we do process-to-process copies directly. - * If the write is non-blocking, we don't use the - * direct write mechanism. - * - * The direct write mechanism will detect the reader going - * away on us. - */ - if (uio->uio_segflg == UIO_USERSPACE && - uio->uio_iov->iov_len >= PIPE_MINDIRECT && - wpipe->pipe_buffer.size >= PIPE_MINDIRECT && - (fp->f_flag & FNONBLOCK) == 0) { - pipeunlock(wpipe); - error = pipe_direct_write(wpipe, uio); - if (error) - break; - continue; - } -#endif - - /* - * Pipe buffered writes cannot be coincidental with - * direct writes. We wait until the currently executing - * direct write is completed before we start filling the - * pipe buffer. We break out if a signal occurs or the - * reader goes away. - */ - if (wpipe->pipe_state & PIPE_DIRECTW) { - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - pipeselwakeup(wpipe); - wpipe->pipe_state |= PIPE_WANTW; - pipeunlock(wpipe); - error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, - "pipbww", 0); - if (error) - break; - else - continue; - } - - space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; - - /* Writes of size <= PIPE_BUF must be atomic. */ - if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) - space = 0; - - if (space > 0) { - int size; /* Transfer size */ - int segsize; /* first segment to transfer */ - - /* - * Transfer size is minimum of uio transfer - * and free space in pipe buffer. - */ - if (space > uio->uio_resid) - size = uio->uio_resid; - else - size = space; - /* - * First segment to transfer is minimum of - * transfer size and contiguous space in - * pipe buffer. If first segment to transfer - * is less than the transfer size, we've got - * a wraparound in the buffer. - */ - segsize = wpipe->pipe_buffer.size - - wpipe->pipe_buffer.in; - if (segsize > size) - segsize = size; - - /* Transfer first segment */ - - PIPE_UNLOCK(rpipe); - error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], - segsize, uio); - PIPE_LOCK(rpipe); - - if (error == 0 && segsize < size) { - KASSERT(wpipe->pipe_buffer.in + segsize == - wpipe->pipe_buffer.size, - ("Pipe buffer wraparound disappeared")); - /* - * Transfer remaining part now, to - * support atomic writes. Wraparound - * happened. - */ - - PIPE_UNLOCK(rpipe); - error = uiomove( - &wpipe->pipe_buffer.buffer[0], - size - segsize, uio); - PIPE_LOCK(rpipe); - } - if (error == 0) { - wpipe->pipe_buffer.in += size; - if (wpipe->pipe_buffer.in >= - wpipe->pipe_buffer.size) { - KASSERT(wpipe->pipe_buffer.in == - size - segsize + - wpipe->pipe_buffer.size, - ("Expected wraparound bad")); - wpipe->pipe_buffer.in = size - segsize; - } - - wpipe->pipe_buffer.cnt += size; - KASSERT(wpipe->pipe_buffer.cnt <= - wpipe->pipe_buffer.size, - ("Pipe buffer overflow")); - } - pipeunlock(wpipe); - if (error != 0) - break; - } else { - /* - * If the "read-side" has been blocked, wake it up now. - */ - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - - /* - * don't block on non-blocking I/O - */ - if (fp->f_flag & FNONBLOCK) { - error = EAGAIN; - pipeunlock(wpipe); - break; - } - - /* - * We have no more space and have something to offer, - * wake up select/poll. - */ - pipeselwakeup(wpipe); - - wpipe->pipe_state |= PIPE_WANTW; - pipeunlock(wpipe); - error = msleep(wpipe, PIPE_MTX(rpipe), - PRIBIO | PCATCH, "pipewr", 0); - if (error != 0) - break; - } - } - - pipelock(wpipe, 0); - --wpipe->pipe_busy; - - if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { - wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); - wakeup(wpipe); - } else if (wpipe->pipe_buffer.cnt > 0) { - /* - * If we have put any characters in the buffer, we wake up - * the reader. - */ - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - } - - /* - * Don't return EPIPE if I/O was successful - */ - if ((wpipe->pipe_buffer.cnt == 0) && - (uio->uio_resid == 0) && - (error == EPIPE)) { - error = 0; - } - - if (error == 0) - vfs_timestamp(&wpipe->pipe_mtime); - - /* - * We have something to offer, - * wake up select/poll. - */ - if (wpipe->pipe_buffer.cnt) - pipeselwakeup(wpipe); - - pipeunlock(wpipe); - PIPE_UNLOCK(rpipe); - return (error); -} - -/* ARGSUSED */ -static int -pipe_truncate(fp, length, active_cred, td) - struct file *fp; - off_t length; - struct ucred *active_cred; - struct thread *td; -{ - - return (EINVAL); -} - -/* - * we implement a very minimal set of ioctls for compatibility with sockets. - */ -static int -pipe_ioctl(fp, cmd, data, active_cred, td) - struct file *fp; - u_long cmd; - void *data; - struct ucred *active_cred; - struct thread *td; -{ - struct pipe *mpipe = fp->f_data; - int error; - - PIPE_LOCK(mpipe); - -#ifdef MAC - error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); - if (error) { - PIPE_UNLOCK(mpipe); - return (error); - } -#endif - - error = 0; - switch (cmd) { - - case FIONBIO: - break; - - case FIOASYNC: - if (*(int *)data) { - mpipe->pipe_state |= PIPE_ASYNC; - } else { - mpipe->pipe_state &= ~PIPE_ASYNC; - } - break; - - case FIONREAD: - if (mpipe->pipe_state & PIPE_DIRECTW) - *(int *)data = mpipe->pipe_map.cnt; - else - *(int *)data = mpipe->pipe_buffer.cnt; - break; - - case FIOSETOWN: - PIPE_UNLOCK(mpipe); - error = fsetown(*(int *)data, &mpipe->pipe_sigio); - goto out_unlocked; - - case FIOGETOWN: - *(int *)data = fgetown(&mpipe->pipe_sigio); - break; - - /* This is deprecated, FIOSETOWN should be used instead. */ - case TIOCSPGRP: - PIPE_UNLOCK(mpipe); - error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); - goto out_unlocked; - - /* This is deprecated, FIOGETOWN should be used instead. */ - case TIOCGPGRP: - *(int *)data = -fgetown(&mpipe->pipe_sigio); - break; - - default: - error = ENOTTY; - break; - } - PIPE_UNLOCK(mpipe); -out_unlocked: - return (error); -} - -static int -pipe_poll(fp, events, active_cred, td) - struct file *fp; - int events; - struct ucred *active_cred; - struct thread *td; -{ - struct pipe *rpipe = fp->f_data; - struct pipe *wpipe; - int revents = 0; -#ifdef MAC - int error; -#endif - - wpipe = rpipe->pipe_peer; - PIPE_LOCK(rpipe); -#ifdef MAC - error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); - if (error) - goto locked_error; -#endif - if (events & (POLLIN | POLLRDNORM)) - if ((rpipe->pipe_state & PIPE_DIRECTW) || - (rpipe->pipe_buffer.cnt > 0)) - revents |= events & (POLLIN | POLLRDNORM); - - if (events & (POLLOUT | POLLWRNORM)) - if (wpipe->pipe_present != PIPE_ACTIVE || - (wpipe->pipe_state & PIPE_EOF) || - (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && - (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) - revents |= events & (POLLOUT | POLLWRNORM); - - if ((events & POLLINIGNEOF) == 0) { - if (rpipe->pipe_state & PIPE_EOF) { - revents |= (events & (POLLIN | POLLRDNORM)); - if (wpipe->pipe_present != PIPE_ACTIVE || - (wpipe->pipe_state & PIPE_EOF)) - revents |= POLLHUP; - } - } - - if (revents == 0) { - if (events & (POLLIN | POLLRDNORM)) { - selrecord(td, &rpipe->pipe_sel); - if (SEL_WAITING(&rpipe->pipe_sel)) - rpipe->pipe_state |= PIPE_SEL; - } - - if (events & (POLLOUT | POLLWRNORM)) { - selrecord(td, &wpipe->pipe_sel); - if (SEL_WAITING(&wpipe->pipe_sel)) - wpipe->pipe_state |= PIPE_SEL; - } - } -#ifdef MAC -locked_error: -#endif - PIPE_UNLOCK(rpipe); - - return (revents); -} - -/* - * We shouldn't need locks here as we're doing a read and this should - * be a natural race. - */ -static int -pipe_stat(fp, ub, active_cred, td) - struct file *fp; - struct stat *ub; - struct ucred *active_cred; - struct thread *td; -{ + struct pipe *pipe = fp->f_data; -#ifdef MAC int error; - PIPE_LOCK(pipe); - error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); - PIPE_UNLOCK(pipe); + error = pipe_stat(pipe, ub, active_cred, td); if (error) return (error); -#endif - bzero(ub, sizeof(*ub)); - ub->st_mode = S_IFIFO; - ub->st_blksize = PAGE_SIZE; - if (pipe->pipe_state & PIPE_DIRECTW) - ub->st_size = pipe->pipe_map.cnt; - else - ub->st_size = pipe->pipe_buffer.cnt; - ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; - ub->st_atim = pipe->pipe_atime; - ub->st_mtim = pipe->pipe_mtime; - ub->st_ctim = pipe->pipe_ctime; + + /* pipe_stat zeros all fields of *ub, following assignment should be done after it */ ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; - ub->st_dev = pipedev_ino; - ub->st_ino = pipe->pipe_ino; - /* - * Left as 0: st_nlink, st_rdev, st_flags, st_gen. - */ return (0); } -/* ARGSUSED */ static int -pipe_close(fp, td) - struct file *fp; - struct thread *td; +pipe_close_f(struct file *fp, struct thread *td) { - struct pipe *cpipe = fp->f_data; + struct pipe *pipe = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; - funsetown(&cpipe->pipe_sigio); - pipeclose(cpipe); + pipe_close(pipe); return (0); } -static void -pipe_free_kmem(cpipe) - struct pipe *cpipe; -{ - - KASSERT(!mtx_owned(PIPE_MTX(cpipe)), - ("pipe_free_kmem: pipe mutex locked")); - - if (cpipe->pipe_buffer.buffer != NULL) { - atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); - vm_map_remove(pipe_map, - (vm_offset_t)cpipe->pipe_buffer.buffer, - (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); - cpipe->pipe_buffer.buffer = NULL; - } -#ifndef PIPE_NODIRECT - { - cpipe->pipe_map.cnt = 0; - cpipe->pipe_map.pos = 0; - cpipe->pipe_map.npages = 0; - } -#endif -} - -/* - * shutdown the pipe - */ -static void -pipeclose(cpipe) - struct pipe *cpipe; -{ - struct pipepair *pp; - struct pipe *ppipe; - ino_t ino; - - KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); - - PIPE_LOCK(cpipe); - pipelock(cpipe, 0); - pp = cpipe->pipe_pair; - - pipeselwakeup(cpipe); - - /* - * If the other side is blocked, wake it up saying that - * we want to close it down. - */ - cpipe->pipe_state |= PIPE_EOF; - while (cpipe->pipe_busy) { - wakeup(cpipe); - cpipe->pipe_state |= PIPE_WANT; - pipeunlock(cpipe); - msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); - pipelock(cpipe, 0); - } - - - /* - * Disconnect from peer, if any. - */ - ppipe = cpipe->pipe_peer; - if (ppipe->pipe_present == PIPE_ACTIVE) { - pipeselwakeup(ppipe); - - ppipe->pipe_state |= PIPE_EOF; - wakeup(ppipe); - KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); - } - - /* - * Mark this endpoint as free. Release kmem resources. We - * don't mark this endpoint as unused until we've finished - * doing that, or the pipe might disappear out from under - * us. - */ - PIPE_UNLOCK(cpipe); - pipe_free_kmem(cpipe); - PIPE_LOCK(cpipe); - cpipe->pipe_present = PIPE_CLOSING; - pipeunlock(cpipe); - - /* - * knlist_clear() may sleep dropping the PIPE_MTX. Set the - * PIPE_FINALIZED, that allows other end to free the - * pipe_pair, only after the knotes are completely dismantled. - */ - knlist_clear(&cpipe->pipe_sel.si_note, 1); - cpipe->pipe_present = PIPE_FINALIZED; - seldrain(&cpipe->pipe_sel); - knlist_destroy(&cpipe->pipe_sel.si_note); - - /* - * Postpone the destroy of the fake inode number allocated for - * our end, until pipe mtx is unlocked. - */ - ino = cpipe->pipe_ino; - - /* - * If both endpoints are now closed, release the memory for the - * pipe pair. If not, unlock. - */ - if (ppipe->pipe_present == PIPE_FINALIZED) { - PIPE_UNLOCK(cpipe); -#ifdef MAC - mac_pipe_destroy(pp); -#endif - uma_zfree(pipe_zone, cpipe->pipe_pair); - } else - PIPE_UNLOCK(cpipe); - - if (ino > 0) - free_unr(pipeino_unr, cpipe->pipe_ino); -} - -/*ARGSUSED*/ static int -pipe_kqfilter(struct file *fp, struct knote *kn) +pipe_kqfilter_f(struct file *fp, struct knote *kn) { - struct pipe *cpipe; + struct pipe *pipe = kn->kn_fp->f_data; - cpipe = kn->kn_fp->f_data; - PIPE_LOCK(cpipe); - switch (kn->kn_filter) { - case EVFILT_READ: - kn->kn_fop = &pipe_rfiltops; - break; - case EVFILT_WRITE: - kn->kn_fop = &pipe_wfiltops; - if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { - /* other end of pipe has been closed */ - PIPE_UNLOCK(cpipe); - return (EPIPE); - } - cpipe = cpipe->pipe_peer; - break; - default: - PIPE_UNLOCK(cpipe); - return (EINVAL); - } - - knlist_add(&cpipe->pipe_sel.si_note, kn, 1); - PIPE_UNLOCK(cpipe); - return (0); + return pipe_kqfilter(pipe, kn); } - -static void -filt_pipedetach(struct knote *kn) -{ - struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; - - PIPE_LOCK(cpipe); - if (kn->kn_filter == EVFILT_WRITE) - cpipe = cpipe->pipe_peer; - knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); - PIPE_UNLOCK(cpipe); -} - -/*ARGSUSED*/ -static int -filt_piperead(struct knote *kn, long hint) -{ - struct pipe *rpipe = kn->kn_fp->f_data; - struct pipe *wpipe = rpipe->pipe_peer; - int ret; - - PIPE_LOCK(rpipe); - kn->kn_data = rpipe->pipe_buffer.cnt; - if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) - kn->kn_data = rpipe->pipe_map.cnt; - - if ((rpipe->pipe_state & PIPE_EOF) || - wpipe->pipe_present != PIPE_ACTIVE || - (wpipe->pipe_state & PIPE_EOF)) { - kn->kn_flags |= EV_EOF; - PIPE_UNLOCK(rpipe); - return (1); - } - ret = kn->kn_data > 0; - PIPE_UNLOCK(rpipe); - return ret; -} - -/*ARGSUSED*/ -static int -filt_pipewrite(struct knote *kn, long hint) -{ - struct pipe *rpipe = kn->kn_fp->f_data; - struct pipe *wpipe = rpipe->pipe_peer; - - PIPE_LOCK(rpipe); - if (wpipe->pipe_present != PIPE_ACTIVE || - (wpipe->pipe_state & PIPE_EOF)) { - kn->kn_data = 0; - kn->kn_flags |= EV_EOF; - PIPE_UNLOCK(rpipe); - return (1); - } - kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; - if (wpipe->pipe_state & PIPE_DIRECTW) - kn->kn_data = 0; - - PIPE_UNLOCK(rpipe); - return (kn->kn_data >= PIPE_BUF); -} Index: sys/fs/fifofs/fifo_vnops.c =================================================================== --- sys/fs/fifofs/fifo_vnops.c (revision 226066) +++ sys/fs/fifofs/fifo_vnops.c (working copy) @@ -3,6 +3,9 @@ * The Regents of the University of California. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. + * + * Copyright (c) 2009 Zhao Shuai + * Google Summer of Code project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -33,35 +36,28 @@ */ #include -#include -#include #include #include #include #include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include #include #include +#include +#include #include -static fo_rdwr_t fifo_read_f; -static fo_rdwr_t fifo_write_f; -static fo_ioctl_t fifo_ioctl_f; -static fo_poll_t fifo_poll_f; -static fo_kqfilter_t fifo_kqfilter_f; -static fo_stat_t fifo_stat_f; -static fo_close_t fifo_close_f; -static fo_truncate_t fifo_truncate_f; +static fo_rdwr_t fifo_read_f; +static fo_rdwr_t fifo_write_f; +static fo_ioctl_t fifo_ioctl_f; +static fo_poll_t fifo_poll_f; +static fo_kqfilter_t fifo_kqfilter_f; +static fo_stat_t fifo_stat_f; +static fo_close_t fifo_close_f; +static fo_truncate_t fifo_truncate_f; struct fileops fifo_ops_f = { .fo_read = fifo_read_f, @@ -72,8 +68,8 @@ .fo_kqfilter = fifo_kqfilter_f, .fo_stat = fifo_stat_f, .fo_close = fifo_close_f, - .fo_chmod = vn_chmod, - .fo_chown = vn_chown, + .fo_chmod = vn_chmod, + .fo_chown = vn_chown, .fo_flags = DFLAG_PASSABLE }; @@ -86,34 +82,34 @@ * - fi_wgen is fif_mtx lock protected. */ struct fifoinfo { - struct socket *fi_readsock; - struct socket *fi_writesock; + struct pipe *fi_rpipe; + struct pipe *fi_wpipe; long fi_readers; long fi_writers; - int fi_wgen; }; static vop_print_t fifo_print; static vop_open_t fifo_open; static vop_close_t fifo_close; +static vop_ioctl_t fifo_ioctl; +static vop_kqfilter_t fifo_kqfilter; static vop_pathconf_t fifo_pathconf; static vop_advlock_t fifo_advlock; -static void filt_fifordetach(struct knote *kn); +static void filt_fifodetach(struct knote *kn); static int filt_fiforead(struct knote *kn, long hint); -static void filt_fifowdetach(struct knote *kn); static int filt_fifowrite(struct knote *kn, long hint); static void filt_fifodetach_notsup(struct knote *kn); static int filt_fifo_notsup(struct knote *kn, long hint); static struct filterops fiforead_filtops = { .f_isfd = 1, - .f_detach = filt_fifordetach, + .f_detach = filt_fifodetach, .f_event = filt_fiforead, }; static struct filterops fifowrite_filtops = { .f_isfd = 1, - .f_detach = filt_fifowdetach, + .f_detach = filt_fifodetach, .f_event = filt_fifowrite, }; static struct filterops fifo_notsup_filtops = { @@ -129,8 +125,8 @@ .vop_close = fifo_close, .vop_create = VOP_PANIC, .vop_getattr = VOP_EBADF, - .vop_ioctl = VOP_PANIC, - .vop_kqfilter = VOP_PANIC, + .vop_ioctl = fifo_ioctl, + .vop_kqfilter = fifo_kqfilter, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, @@ -154,23 +150,6 @@ MTX_SYSINIT(fifo, &fifo_mtx, "fifo mutex", MTX_DEF); /* - * Dispose of fifo resources. - */ -static void -fifo_cleanup(struct vnode *vp) -{ - struct fifoinfo *fip = vp->v_fifoinfo; - - ASSERT_VOP_ELOCKED(vp, "fifo_cleanup"); - if (fip->fi_readers == 0 && fip->fi_writers == 0) { - vp->v_fifoinfo = NULL; - (void)soclose(fip->fi_readsock); - (void)soclose(fip->fi_writesock); - free(fip, M_VNODE); - } -} - -/* * Open called to set up a new instance of a fifo or * to find an active instance of a fifo. */ @@ -182,15 +161,14 @@ int a_mode; struct ucred *a_cred; struct thread *a_td; - struct file *a_fp; + int a_fdidx; } */ *ap; { struct vnode *vp = ap->a_vp; - struct fifoinfo *fip; - struct thread *td = ap->a_td; - struct ucred *cred = ap->a_cred; struct file *fp = ap->a_fp; - struct socket *rso, *wso; + struct thread *td = ap->a_td; + struct fifoinfo *fip; + struct pipe *rpipe, *wpipe; int error; ASSERT_VOP_ELOCKED(vp, "fifo_open"); @@ -198,33 +176,15 @@ return (EINVAL); if ((fip = vp->v_fifoinfo) == NULL) { fip = malloc(sizeof(*fip), M_VNODE, M_WAITOK); - error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, cred, td); - if (error) - goto fail1; - fip->fi_readsock = rso; - error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, cred, td); - if (error) - goto fail2; - fip->fi_writesock = wso; - error = soconnect2(wso, rso); - /* Close the direction we do not use, so we can get POLLHUP. */ - if (error == 0) - error = soshutdown(rso, SHUT_WR); + error = pipepair_create(td, &rpipe, &wpipe); if (error) { - (void)soclose(wso); -fail2: - (void)soclose(rso); -fail1: free(fip, M_VNODE); return (error); } + fip->fi_rpipe = rpipe; + fip->fi_wpipe = wpipe; fip->fi_readers = fip->fi_writers = 0; - wso->so_snd.sb_lowat = PIPE_BUF; - SOCKBUF_LOCK(&rso->so_rcv); - rso->so_rcv.sb_state |= SBS_CANTRCVMORE; - SOCKBUF_UNLOCK(&rso->so_rcv); - KASSERT(vp->v_fifoinfo == NULL, - ("fifo_open: v_fifoinfo race")); + KASSERT(vp->v_fifoinfo == NULL, ("fifo_open: v_fifoinfo race")); vp->v_fifoinfo = fip; } @@ -236,46 +196,34 @@ mtx_lock(&fifo_mtx); if (ap->a_mode & FREAD) { fip->fi_readers++; - if (fip->fi_readers == 1) { - SOCKBUF_LOCK(&fip->fi_writesock->so_snd); - fip->fi_writesock->so_snd.sb_state &= ~SBS_CANTSENDMORE; - SOCKBUF_UNLOCK(&fip->fi_writesock->so_snd); - if (fip->fi_writers > 0) { - wakeup(&fip->fi_writers); - sowwakeup(fip->fi_writesock); - } - } - fp->f_seqcount = fip->fi_wgen - fip->fi_writers; + if (fip->fi_readers == 1 && fip->fi_writers > 0) + wakeup(&fip->fi_writers); } if (ap->a_mode & FWRITE) { if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) { mtx_unlock(&fifo_mtx); - if (fip->fi_writers == 0) - fifo_cleanup(vp); + /* XXX release all resources here? */ return (ENXIO); } - fip->fi_writers++; - if (fip->fi_writers == 1) { - SOCKBUF_LOCK(&fip->fi_readsock->so_rcv); - fip->fi_readsock->so_rcv.sb_state &= ~SBS_CANTRCVMORE; - SOCKBUF_UNLOCK(&fip->fi_readsock->so_rcv); - if (fip->fi_readers > 0) { - wakeup(&fip->fi_readers); - sorwakeup(fip->fi_readsock); - } - } + fip->fi_writers++; + if (fip->fi_writers == 1 && fip->fi_readers > 0) + wakeup(&fip->fi_readers); } if ((ap->a_mode & O_NONBLOCK) == 0) { if ((ap->a_mode & FREAD) && fip->fi_writers == 0) { VOP_UNLOCK(vp, 0); - error = msleep(&fip->fi_readers, &fifo_mtx, - PDROP | PCATCH | PSOCK, "fifoor", 0); + error = msleep(&fip->fi_readers, &fifo_mtx, + PDROP | PCATCH, "fifoor", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) { fip->fi_readers--; if (fip->fi_readers == 0) { - socantsendmore(fip->fi_writesock); - fifo_cleanup(vp); + pipe_close(fip->fi_rpipe); + if (fip->fi_writers == 0) { + pipe_close(fip->fi_wpipe); + vp->v_fifoinfo = NULL; + free(fip, M_VNODE); + } } return (error); } @@ -284,21 +232,22 @@ * We must have got woken up because we had a writer. * That (and not still having one) is the condition * that we must wait for. - */ + */ } if ((ap->a_mode & FWRITE) && fip->fi_readers == 0) { VOP_UNLOCK(vp, 0); error = msleep(&fip->fi_writers, &fifo_mtx, - PDROP | PCATCH | PSOCK, "fifoow", 0); + PDROP | PCATCH, "fifoow", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) { fip->fi_writers--; if (fip->fi_writers == 0) { - socantrcvmore(fip->fi_readsock); - mtx_lock(&fifo_mtx); - fip->fi_wgen++; - mtx_unlock(&fifo_mtx); - fifo_cleanup(vp); + pipe_close(fip->fi_wpipe); + if (fip->fi_readers == 0) { + pipe_close(fip->fi_rpipe); + vp->v_fifoinfo = NULL; + free(fip, M_VNODE); + } } return (error); } @@ -317,60 +266,95 @@ return (0); } -static void -filt_fifordetach(struct knote *kn) +/* + * Now unused vnode ioctl routine. + */ +/* ARGSUSED */ +static int +fifo_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + u_long a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; { - struct socket *so = (struct socket *)kn->kn_hook; - SOCKBUF_LOCK(&so->so_rcv); - knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_rcv.sb_sel.si_note)) - so->so_rcv.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_rcv); + printf("WARNING: fifo_ioctl called unexpectedly\n"); + return (ENOTTY); } +/* + * Now unused vnode kqfilter routine. + */ +/* ARGSUSED */ static int -filt_fiforead(struct knote *kn, long hint) +fifo_kqfilter(ap) + struct vop_kqfilter_args /* { + struct vnode *a_vp; + struct knote *a_kn; + } */ *ap; { - struct socket *so = (struct socket *)kn->kn_hook; - SOCKBUF_LOCK_ASSERT(&so->so_rcv); - kn->kn_data = so->so_rcv.sb_cc; - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - kn->kn_flags |= EV_EOF; - return (1); - } else { - kn->kn_flags &= ~EV_EOF; - return (kn->kn_data > 0); - } + printf("WARNING: fifo_kqfilter called unexpectedly\n"); + return (EINVAL); } static void -filt_fifowdetach(struct knote *kn) +filt_fifodetach(struct knote *kn) { - struct socket *so = (struct socket *)kn->kn_hook; + struct pipe *pipe = (struct pipe *)kn->kn_hook; - SOCKBUF_LOCK(&so->so_snd); - knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_snd.sb_sel.si_note)) - so->so_snd.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_snd); + PIPE_LOCK(pipe); + knlist_remove(&pipe->pipe_sel.si_note, kn, 1); + PIPE_UNLOCK(pipe); } static int +filt_fiforead(struct knote *kn, long hint) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_hook; + struct pipe *wpipe = rpipe->pipe_peer; + int ret; + + PIPE_LOCK(rpipe); + kn->kn_data = rpipe->pipe_buffer.cnt; + if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) + kn->kn_data = rpipe->pipe_map.cnt; + + if ((rpipe->pipe_state & PIPE_EOF) || + wpipe->pipe_present != PIPE_ACTIVE || + (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_flags |= EV_EOF; + PIPE_UNLOCK(rpipe); + return (1); + } + ret = kn->kn_data > 0; + PIPE_UNLOCK(rpipe); + return ret; +} + +static int filt_fifowrite(struct knote *kn, long hint) { - struct socket *so = (struct socket *)kn->kn_hook; + struct pipe *wpipe = (struct pipe *)kn->kn_hook; - SOCKBUF_LOCK_ASSERT(&so->so_snd); - kn->kn_data = sbspace(&so->so_snd); - if (so->so_snd.sb_state & SBS_CANTSENDMORE) { + PIPE_LOCK(wpipe); + if (wpipe->pipe_present != PIPE_ACTIVE || + (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_data = 0; kn->kn_flags |= EV_EOF; + PIPE_UNLOCK(wpipe); return (1); - } else { - kn->kn_flags &= ~EV_EOF; - return (kn->kn_data >= so->so_snd.sb_lowat); } + kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + if (wpipe->pipe_state & PIPE_DIRECTW) + kn->kn_data = 0; + + PIPE_UNLOCK(wpipe); + return (kn->kn_data >= PIPE_BUF); } static void @@ -403,25 +387,20 @@ struct fifoinfo *fip = vp->v_fifoinfo; ASSERT_VOP_ELOCKED(vp, "fifo_close"); - if (fip == NULL) { - printf("fifo_close: no v_fifoinfo %p\n", vp); - return (0); - } if (ap->a_fflag & FREAD) { fip->fi_readers--; - if (fip->fi_readers == 0) - socantsendmore(fip->fi_writesock); + if (fip->fi_readers == 0) + pipe_close(fip->fi_rpipe); } if (ap->a_fflag & FWRITE) { fip->fi_writers--; - if (fip->fi_writers == 0) { - socantrcvmore(fip->fi_readsock); - mtx_lock(&fifo_mtx); - fip->fi_wgen++; - mtx_unlock(&fifo_mtx); - } + if (fip->fi_writers == 0) + pipe_close(fip->fi_wpipe); } - fifo_cleanup(vp); + if (fip->fi_readers == 0 && fip->fi_writers == 0) { + vp->v_fifoinfo = NULL; + free(fip, M_VNODE); + } return (0); } @@ -429,12 +408,11 @@ * Print out internal contents of a fifo vnode. */ int -fifo_printinfo(vp) - struct vnode *vp; +fifo_printinfo(struct vnode *vp) { - register struct fifoinfo *fip = vp->v_fifoinfo; + struct fifoinfo *fip = vp->v_fifoinfo; - if (fip == NULL){ + if (fip == NULL) { printf(", NULL v_fifoinfo"); return (0); } @@ -487,14 +465,13 @@ } /* - * Fifo advisory byte-level locks. + * FIFO advisory byte-level locks. */ -/* ARGSUSED */ static int fifo_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; - caddr_t a_id; + caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; @@ -514,62 +491,32 @@ /* * The implementation of ioctl() for named fifos is complicated by the fact * that we permit O_RDWR fifo file descriptors, meaning that the actions of - * ioctls may have to be applied to both the underlying sockets rather than - * just one. The original implementation simply forward the ioctl to one - * or both sockets based on fp->f_flag. We now consider each ioctl - * separately, as the composition effect requires careful ordering. + * ioctls may have to be applied to both the underlying pipes rather than + * just one. * - * We do not blindly pass all ioctls through to the socket in order to avoid - * providing unnecessary ioctls that might be improperly depended on by - * applications (such as socket-specific, routing, and interface ioctls). - * * Unlike sys_pipe.c, fifos do not implement the deprecated TIOCSPGRP and * TIOCGPGRP ioctls. Earlier implementations of fifos did forward SIOCSPGRP * and SIOCGPGRP ioctls, so we might need to re-add those here. */ static int -fifo_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, +fifo_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { - struct fifoinfo *fi; - struct file filetmp; /* Local, so need not be locked. */ - int error; + int error = ENOTTY; + struct fifoinfo *fip = fp->f_data; - error = ENOTTY; - fi = fp->f_data; - switch (com) { case FIONBIO: - /* - * Non-blocking I/O is implemented at the fifo layer using - * MSG_NBIO, so does not need to be forwarded down the stack. - */ - return (0); - case FIOASYNC: case FIOSETOWN: case FIOGETOWN: - /* - * These socket ioctls don't have any ordering requirements, - * so are called in an arbitrary order, and only on the - * sockets indicated by the file descriptor rights. - * - * XXXRW: If O_RDWR and the read socket accepts an ioctl but - * the write socket doesn't, the socketpair is left in an - * inconsistent state. - */ if (fp->f_flag & FREAD) { - filetmp.f_data = fi->fi_readsock; - filetmp.f_cred = cred; - error = soo_ioctl(&filetmp, com, data, cred, td); + error = pipe_ioctl(fip->fi_rpipe, com, data, cred, td); if (error) return (error); } - if (fp->f_flag & FWRITE) { - filetmp.f_data = fi->fi_writesock; - filetmp.f_cred = cred; - error = soo_ioctl(&filetmp, com, data, cred, td); - } + if (fp->f_flag & FWRITE) + error = pipe_ioctl(fip->fi_wpipe, com, data, cred, td); return (error); case FIONREAD: @@ -582,12 +529,10 @@ *(int *)data = 0; return (0); } - filetmp.f_data = fi->fi_readsock; - filetmp.f_cred = cred; - return (soo_ioctl(&filetmp, com, data, cred, td)); + return (pipe_ioctl(fip->fi_rpipe, com, data, cred, td)); default: - return (ENOTTY); + return (ENOTTY); } } @@ -600,12 +545,9 @@ static int fifo_kqfilter_f(struct file *fp, struct knote *kn) { - struct fifoinfo *fi; - struct socket *so; - struct sockbuf *sb; + struct fifoinfo *fip = fp->f_data; + struct pipe *rpipe = fip->fi_rpipe; - fi = fp->f_data; - /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an @@ -615,7 +557,6 @@ kn->kn_fop = &fifo_notsup_filtops; return (0); } - if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &fifo_notsup_filtops; return (0); @@ -624,92 +565,83 @@ switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &fiforead_filtops; - so = fi->fi_readsock; - sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &fifowrite_filtops; - so = fi->fi_writesock; - sb = &so->so_snd; + PIPE_LOCK(rpipe); + if (rpipe->pipe_present != PIPE_ACTIVE) { + /* other end of pipe has been closed */ + PIPE_UNLOCK(rpipe); + return (EPIPE); + } + PIPE_UNLOCK(rpipe); break; default: return (EINVAL); } - kn->kn_hook = (caddr_t)so; + kn->kn_hook = (void *)rpipe; - SOCKBUF_LOCK(sb); - knlist_add(&sb->sb_sel.si_note, kn, 1); - sb->sb_flags |= SB_KNOTE; - SOCKBUF_UNLOCK(sb); + PIPE_LOCK(rpipe); + knlist_add(&rpipe->pipe_sel.si_note, kn, 1); + PIPE_UNLOCK(rpipe); return (0); } -static int +static int fifo_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { - struct fifoinfo *fip; - struct file filetmp; + struct fifoinfo *fip = fp->f_data; int levents, revents = 0; - fip = fp->f_data; - levents = events & - (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); - if ((fp->f_flag & FREAD) && levents) { - filetmp.f_data = fip->fi_readsock; - filetmp.f_cred = cred; - mtx_lock(&fifo_mtx); - if (fp->f_seqcount == fip->fi_wgen) - levents |= POLLINIGNEOF; - mtx_unlock(&fifo_mtx); - revents |= soo_poll(&filetmp, levents, cred, td); - } - levents = events & (POLLOUT | POLLWRNORM | POLLWRBAND); - if ((fp->f_flag & FWRITE) && levents) { - filetmp.f_data = fip->fi_writesock; - filetmp.f_cred = cred; - revents |= soo_poll(&filetmp, levents, cred, td); - } + levents = events & (POLLIN | POLLRDNORM); + if ((fp->f_flag & FREAD) && levents) + revents |= pipe_poll(fip->fi_rpipe, levents, cred, td); + levents = events & (POLLOUT | POLLWRNORM); + if ((fp->f_flag & FWRITE) && levents) + revents |= pipe_poll(fip->fi_wpipe, levents, cred, td); return (revents); } static int fifo_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { - struct fifoinfo *fip; - int sflags; - fip = fp->f_data; - KASSERT(uio->uio_rw == UIO_READ,("fifo_read mode")); - if (uio->uio_resid == 0) - return (0); - sflags = (fp->f_flag & FNONBLOCK) ? MSG_NBIO : 0; - return (soreceive(fip->fi_readsock, NULL, uio, NULL, NULL, &sflags)); + struct fifoinfo *fip = fp->f_data; + + /* The 4th argument of pipe_read is file flag */ + return (pipe_read(fip->fi_rpipe, uio, cred, fp->f_flag, td)); } static int fifo_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { + struct fifoinfo *fip = fp->f_data; + int error; - return (vnops.fo_stat(fp, sb, cred, td)); + error = pipe_stat(fip->fi_rpipe, sb, cred, td); + if (error) + return (error); + /* pipe_stat zeros all fields of *ub, following assignment should be done after it */ + sb->st_uid = fp->f_cred->cr_uid; + sb->st_gid = fp->f_cred->cr_gid; + return (0); } static int fifo_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { - + return (vnops.fo_truncate(fp, length, cred, td)); } static int fifo_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { - struct fifoinfo *fip; - int sflags; - fip = fp->f_data; - KASSERT(uio->uio_rw == UIO_WRITE,("fifo_write mode")); - sflags = (fp->f_flag & FNONBLOCK) ? MSG_NBIO : 0; - return (sosend(fip->fi_writesock, NULL, uio, 0, NULL, sflags, td)); + struct fifoinfo *fip = fp->f_data; + + /* The 4th argument of pipe_write is file flag */ + return (pipe_write(fip->fi_wpipe, uio, cred, fp->f_flag, td)); } Index: sys/sys/pipe.h =================================================================== --- sys/sys/pipe.h (revision 226066) +++ sys/sys/pipe.h (working copy) @@ -28,6 +28,8 @@ #error "no user-servicable parts inside" #endif +#include + /* * Pipe buffer size, keep moderate in value, pipes take kva space. */ @@ -54,7 +56,7 @@ #define PIPENPAGES (BIG_PIPE_SIZE / PAGE_SIZE + 1) /* - * See sys_pipe.c for info on what these limits mean. + * See subr_pipe.c for info on what these limits mean. */ extern long maxpipekva; @@ -138,5 +140,20 @@ #define PIPE_UNLOCK(pipe) mtx_unlock(PIPE_MTX(pipe)) #define PIPE_LOCK_ASSERT(pipe, type) mtx_assert(PIPE_MTX(pipe), (type)) +int pipe_read(struct pipe *pipe, struct uio *uio, struct ucred *active_cred, + int f_flags, struct thread *td); +int pipe_write(struct pipe *pipe, struct uio *uio, struct ucred *active_cred, + int f_flags, struct thread *td); +int pipe_truncate(struct pipe *pipe, off_t length, struct ucred *active_cred, + struct thread *td); +int pipe_ioctl(struct pipe *pipe, u_long cmd, void *data, struct ucred *active_cred, + struct thread *td); +int pipe_poll(struct pipe *pipe, int events, struct ucred *active_cred, + struct thread *td); +int pipe_stat(struct pipe *pipe, struct stat *ub, struct ucred *active_cred, + struct thread *td); +int pipe_kqfilter(struct pipe *pipe, struct knote *kn); +void pipe_close(struct pipe *pipe); +int pipepair_create(struct thread *td, struct pipe **p_rpipe, struct pipe **p_wpipe); #endif /* !_SYS_PIPE_H_ */