diff -urN xnu-344.49/EXTERNAL_HEADERS/bsd/i386/ansi.h xnu-517/EXTERNAL_HEADERS/bsd/i386/ansi.h --- xnu-344.49/EXTERNAL_HEADERS/bsd/i386/ansi.h Thu Sep 18 21:02:01 2003 +++ xnu-517/EXTERNAL_HEADERS/bsd/i386/ansi.h Sat Oct 25 00:24:25 2003 @@ -80,6 +80,7 @@ #define _BSD_SSIZE_T_ int /* byte count or error */ #define _BSD_TIME_T_ long /* time() */ #define _BSD_VA_LIST_ void * /* va_list */ +#define _BSD_SOCKLEN_T_ int32_t /* socklen_t (duh) */ /* * Runes (wchar_t) is declared to be an ``int'' instead of the more natural diff -urN xnu-344.49/EXTERNAL_HEADERS/bsd/ppc/ansi.h xnu-517/EXTERNAL_HEADERS/bsd/ppc/ansi.h --- xnu-344.49/EXTERNAL_HEADERS/bsd/ppc/ansi.h Thu Sep 18 21:02:01 2003 +++ xnu-517/EXTERNAL_HEADERS/bsd/ppc/ansi.h Sat Oct 25 00:24:25 2003 @@ -80,6 +80,7 @@ #define _BSD_SSIZE_T_ int /* byte count or error */ #define _BSD_TIME_T_ long /* time() */ #define _BSD_VA_LIST_ char * /* va_list */ +#define _BSD_SOCKLEN_T_ int32_t /* socklen_t (duh) */ /* * Runes (wchar_t) is declared to be an ``int'' instead of the more natural diff -urN xnu-344.49/EXTERNAL_HEADERS/mach-o/kld.h xnu-517/EXTERNAL_HEADERS/mach-o/kld.h --- xnu-344.49/EXTERNAL_HEADERS/mach-o/kld.h Thu Sep 18 21:02:01 2003 +++ xnu-517/EXTERNAL_HEADERS/mach-o/kld.h Sat Oct 25 00:24:25 2003 @@ -33,6 +33,9 @@ * These API's are in libkld. Both kmodload(8) and /mach_kernel should * link with -lkld and then ld(1) will expand -lkld to libkld.dylib or * libkld.a depending on if -dynamic or -static is in effect. + * + * Note: we are using the __DYNAMIC__ flag to indicate user space kernel + * linking and __STATIC__ as a synonym of KERNEL. */ /* @@ -42,7 +45,7 @@ extern void kld_error_vprintf(const char *format, va_list ap); /* - * This two are only in libkld.dylib for use by kmodload(8) (user code compiled + * These two are only in libkld.dylib for use by kmodload(8) (user code compiled * with the default -dynamic). 
*/ #ifdef __DYNAMIC__ @@ -54,6 +57,13 @@ struct mach_header **header_addr, const char *object_filename, const char *output_filename); + +__private_extern__ long kld_load_from_memory( + struct mach_header **header_addr, + const char *object_name, + char *object_addr, + long object_size, + const char *output_filename); #endif /* __DYNAMIC__ */ /* @@ -69,6 +79,11 @@ long object_size); #endif /* __STATIC__ */ +__private_extern__ long kld_load_basefile_from_memory( + const char *base_filename, + char *base_addr, + long base_size); + __private_extern__ long kld_unload_all( long deallocate_sets); @@ -81,5 +96,11 @@ __private_extern__ void kld_address_func( unsigned long (*func)(unsigned long size, unsigned long headers_size)); + +#define KLD_STRIP_ALL 0x00000000 +#define KLD_STRIP_NONE 0x00000001 + +__private_extern__ void kld_set_link_options( + unsigned long link_options); #endif /* _MACHO_KLD_H_ */ diff -urN xnu-344.49/EXTERNAL_HEADERS/mach-o/loader.h xnu-517/EXTERNAL_HEADERS/mach-o/loader.h --- xnu-344.49/EXTERNAL_HEADERS/mach-o/loader.h Thu Sep 18 21:02:01 2003 +++ xnu-517/EXTERNAL_HEADERS/mach-o/loader.h Sat Oct 25 00:24:25 2003 @@ -723,4 +723,4 @@ unsigned long header_addr; /* files virtual address */ }; -#endif _MACHO_LOADER_H_ +#endif /* _MACHO_LOADER_H_ */ diff -urN xnu-344.49/Makefile xnu-517/Makefile --- xnu-344.49/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/Makefile Tue Oct 21 21:24:55 2003 @@ -31,6 +31,8 @@ libkern \ libsa +CONFIG_SUBDIRS = config + INSTINC_SUBDIRS = $(ALL_SUBDIRS) INSTINC_SUBDIRS_PPC = $(INSTINC_SUBDIRS) @@ -44,6 +46,7 @@ EXPINC_SUBDIRS_I386 = $(EXPINC_SUBDIRS) COMP_SUBDIRS = $(ALL_SUBDIRS) + INST_SUBDIRS = \ libkern \ diff -urN xnu-344.49/bsd/conf/MASTER xnu-517/bsd/conf/MASTER --- xnu-344.49/bsd/conf/MASTER Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/MASTER Tue Oct 21 21:24:55 2003 @@ -144,6 +144,7 @@ options RANDOM_IP_ID # random (not sequential) ip ids # options TCP_DROP_SYNFIN # Drop TCP packets with SYN+FIN set # options 
ICMP_BANDLIM # ICMP bandwidth limiting sysctl +options AUDIT # Security event auditing # # @@ -152,6 +153,7 @@ options COMPAT_43 # 4.3 BSD compatibility # options DIAGNOSTIC # diagnostics # options KTRACE # ktrace support # +options GPROF # build profiling # # # 4.4 filesystems @@ -245,6 +247,10 @@ # # vnode device pseudo-device vndevice 4 init vndevice_init + +# +# memory device +pseudo-device mdevdevice 1 init mdevinit # # diff -urN xnu-344.49/bsd/conf/MASTER.ppc xnu-517/bsd/conf/MASTER.ppc --- xnu-344.49/bsd/conf/MASTER.ppc Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/MASTER.ppc Tue Oct 21 21:24:55 2003 @@ -47,7 +47,7 @@ # # RELEASE = [ppc mach medium vol pst gdb simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 revfs noprofiling hfs volfs devfs synthfs netat mrouting ipdivert ipfirewall ktrace inet6 ipsec tcpdrop_synfin gif stf] # RELEASE_TRACE = [RELEASE kdebug] -# PROFILE = [ppc mach medium vol pst gdb debug simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 revfs profile hfs volfs devfs synthfs netat mrouting ipdivert ipfirewall ktrace inet6 ipsec tcpdrop_synfin gif stf] +# PROFILE = [ppc mach medium vol pst gdb simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 revfs profile hfs volfs devfs synthfs netat mrouting ipdivert ipfirewall ktrace inet6 ipsec tcpdrop_synfin gif stf] # DEBUG = [ppc mach medium vol pst gdb debug simple_clock kernstack nfsclient nfsserver quota fifo fdesc union ffs cd9660 compat_43 revfs profiling hfs volfs devfs synthfs netat mrouting mach_assert ipdivert ipfirewall ktrace inet6 ipsec tcpdrop_synfin gif stf] # DEBUG_TRACE = [DEBUG kdebug] # diff -urN xnu-344.49/bsd/conf/Makefile xnu-517/bsd/conf/Makefile --- xnu-344.49/bsd/conf/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/Makefile Tue Oct 21 21:24:55 2003 @@ -18,7 +18,7 @@ export BSD_KERNEL_CONFIG = $(KERNEL_CONFIG) endif 
-COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) +export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/doconf: make build_setup @@ -53,6 +53,7 @@ SOURCE=$${next_source} \ TARGET=$(TARGET) \ INCL_MAKEDEP=FALSE \ + KERNEL_CONFIG=$(BSD_KERNEL_CONFIG) \ build_all; \ echo "[ $(SOURCE) ] Returning do_all $(COMPONENT) $(BSD_KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)"; diff -urN xnu-344.49/bsd/conf/Makefile.template xnu-517/bsd/conf/Makefile.template --- xnu-344.49/bsd/conf/Makefile.template Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/Makefile.template Tue Oct 21 21:24:55 2003 @@ -70,6 +70,13 @@ %MACHDEP # +# This rule insures that the subr_prof.c does NOT get compiled with +# profiling. It implements mcount() and profiling it leads to recursion. +# + +subr_prof.o_CFLAGS_RM = -pg + +# # OBJSDEPS is the set of files (defined in the machine dependent # template if necessary) which all objects depend on (such as an # in-line assembler expansion filter) @@ -84,7 +91,7 @@ $(COMPONENT).o: $(LDOBJS) @echo "[ creating $(COMPONENT).o ]" $(RM) $(RMFLAGS) vers.c - $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/newvers \ + $(COMPOBJROOT)/newvers \ `$(CAT) ${VERSION_FILES}` ${COPYRIGHT_FILES} ${KCC} $(CFLAGS) $(INCLUDES) -c vers.c @echo [ updating $(COMPONENT).o ${BSD_KERNEL_CONFIG} ] diff -urN xnu-344.49/bsd/conf/files xnu-517/bsd/conf/files --- xnu-344.49/bsd/conf/files Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/files Tue Oct 21 21:24:55 2003 @@ -60,6 +60,7 @@ OPTIONS/ktrace optional ktrace OPTIONS/profiling optional profiling OPTIONS/vndevice optional vndevice +OPTIONS/audit optional audit # # Network options @@ -115,6 +116,9 @@ bsd/dev/random/YarrowCoreLib/src/prng.c standard bsd/dev/random/YarrowCoreLib/src/sha1mod.c standard bsd/dev/random/YarrowCoreLib/src/yarrowUtils.c standard + +bsd/dev/memdev.c standard + bsd/dev/vn/vn.c optional vndevice bsd/dev/vn/shadow.c optional 
vndevice @@ -289,6 +293,7 @@ bsd/crypto/sha1.c optional crypto bsd/crypto/sha2/sha2.c optional crypto bsd/crypto/des/des_ecb.c optional crypto +bsd/crypto/des/des_enc.c optional crypto bsd/crypto/des/des_setkey.c optional crypto bsd/crypto/blowfish/bf_enc.c optional crypto bsd/crypto/blowfish/bf_skey.c optional crypto @@ -380,6 +385,7 @@ bsd/nfs/nfs_syscalls.c optional nfsclient nfsserver bsd/nfs/nfs_vfsops.c optional nfsclient bsd/nfs/nfs_vnops.c optional nfsclient +bsd/nfs/nfs_lock.c optional nfsclient bsd/kern/netboot.c optional nfsclient @@ -392,6 +398,7 @@ bsd/ufs/ffs/ffs_vnops.c standard bsd/ufs/mfs/mfs_vfsops.c optional mfs bsd/ufs/mfs/mfs_vnops.c optional mfs +bsd/ufs/ufs/ufs_attrlist.c standard bsd/ufs/ufs/ufs_bmap.c standard bsd/ufs/ufs/ufs_byte_order.c optional rev_endian_fs bsd/ufs/ufs/ufs_ihash.c standard @@ -410,9 +417,11 @@ bsd/hfs/hfs_encodinghint.c optional hfs bsd/hfs/hfs_encodings.c optional hfs bsd/hfs/hfs_endian.c optional hfs +bsd/hfs/hfs_hotfiles.c optional hfs bsd/hfs/hfs_link.c optional hfs bsd/hfs/hfs_lockf.c optional hfs bsd/hfs/hfs_lookup.c optional hfs +bsd/hfs/hfs_notification.c optional hfs bsd/hfs/hfs_quota.c optional quota bsd/hfs/hfs_readwrite.c optional hfs bsd/hfs/hfs_search.c optional hfs @@ -425,6 +434,7 @@ bsd/hfs/hfscommon/BTree/BTreeAllocate.c optional hfs bsd/hfs/hfscommon/BTree/BTreeMiscOps.c optional hfs bsd/hfs/hfscommon/BTree/BTreeNodeOps.c optional hfs +bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c optional hfs bsd/hfs/hfscommon/BTree/BTreeScanner.c optional hfs bsd/hfs/hfscommon/BTree/BTreeTreeOps.c optional hfs bsd/hfs/hfscommon/Catalog/Catalog.c optional hfs @@ -440,6 +450,11 @@ bsd/kern/init_sysent.c standard bsd/kern/kdebug.c standard bsd/kern/kern_acct.c standard +bsd/kern/kern_aio.c standard +bsd/kern/kern_audit.c standard +bsd/kern/kern_bsm_token.c standard +bsd/kern/kern_bsm_audit.c standard +bsd/kern/kern_bsm_klib.c standard bsd/kern/kern_clock.c standard bsd/kern/kern_core.c standard bsd/kern/kern_symfile.c 
standard diff -urN xnu-344.49/bsd/conf/files.i386 xnu-517/bsd/conf/files.i386 --- xnu-344.49/bsd/conf/files.i386 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/files.i386 Tue Oct 21 21:24:55 2003 @@ -11,6 +11,7 @@ bsd/dev/i386/memmove.c standard bsd/dev/i386/stubs.c standard bsd/dev/i386/lock_stubs.c standard +bsd/dev/i386/sysctl.c standard bsd/dev/i386/unix_signal.c standard bsd/dev/i386/unix_startup.c standard diff -urN xnu-344.49/bsd/conf/files.ppc xnu-517/bsd/conf/files.ppc --- xnu-344.49/bsd/conf/files.ppc Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/files.ppc Tue Oct 21 21:24:55 2003 @@ -17,6 +17,10 @@ bsd/dev/ppc/systemcalls.c standard bsd/dev/ppc/km.c standard bsd/dev/ppc/xsumas.s standard +bsd/dev/ppc/sysctl.c standard + +bsd/dev/ppc/chud/chud_bsd_callback.c standard +bsd/dev/ppc/chud/chud_process.c standard bsd/kern/bsd_stubs.c standard diff -urN xnu-344.49/bsd/conf/param.c xnu-517/bsd/conf/param.c --- xnu-344.49/bsd/conf/param.c Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/conf/param.c Sat Oct 25 00:25:25 2003 @@ -77,16 +77,19 @@ #include #include #include +#include struct timezone tz = { TIMEZONE, PST }; #define NPROC (20 + 16 * MAXUSERS) +#define HNPROC (20 + 64 * MAXUSERS) int maxproc = NPROC; +__private_extern__ int hard_maxproc = HNPROC; /* hardcoded limit */ int nprocs = 0; /* XXX */ #define NTEXT (80 + NPROC / 8) /* actually the object cache */ #define NVNODE (NPROC + NTEXT + 300) -int desiredvnodes = NVNODE + 350; +int desiredvnodes = NVNODE + 700; #define MAXFILES (OPEN_MAX + 2048) int maxfiles = MAXFILES; @@ -97,6 +100,16 @@ #define MAXSOCKETS NMBCLUSTERS int maxsockets = MAXSOCKETS; + +/* + * async IO (aio) configurable limits + */ +#define AIO_MAX 90 /* system wide limit of async IO requests */ +#define AIO_PROCESS_MAX AIO_LISTIO_MAX /* process limit of async IO requests */ +#define AIO_THREAD_COUNT 4 /* number of async IO worker threads created */ +int aio_max_requests = AIO_MAX; +int aio_max_requests_per_process = AIO_PROCESS_MAX; +int 
aio_worker_threads = AIO_THREAD_COUNT; /* * These have to be allocated somewhere; allocating diff -urN xnu-344.49/bsd/conf/version.major xnu-517/bsd/conf/version.major --- xnu-344.49/bsd/conf/version.major Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/version.major Tue Oct 21 21:24:55 2003 @@ -1 +1 @@ -6 +7 diff -urN xnu-344.49/bsd/conf/version.minor xnu-517/bsd/conf/version.minor --- xnu-344.49/bsd/conf/version.minor Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/conf/version.minor Tue Oct 21 21:24:55 2003 @@ -1 +1 @@ -8 +0 diff -urN xnu-344.49/bsd/conf/version.variant xnu-517/bsd/conf/version.variant --- xnu-344.49/bsd/conf/version.variant Thu Sep 18 03:15:10 2003 +++ xnu-517/bsd/conf/version.variant Tue Oct 21 21:24:55 2003 @@ -0,0 +1 @@ +0 diff -urN xnu-344.49/bsd/crypto/blowfish/bf_enc.c xnu-517/bsd/crypto/blowfish/bf_enc.c --- xnu-344.49/bsd/crypto/blowfish/bf_enc.c Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/blowfish/bf_enc.c Sat Oct 25 00:25:25 2003 @@ -1,12 +1,12 @@ -/* $FreeBSD: src/sys/crypto/blowfish/bf_enc.c,v 1.1.2.2 2001/07/03 11:01:28 ume Exp $ */ -/* $KAME: bf_enc.c,v 1.5 2000/09/18 21:21:19 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/blowfish/bf_enc.c,v 1.1.2.3 2002/03/26 10:12:23 ume Exp $ */ +/* $KAME: bf_enc.c,v 1.7 2002/02/27 01:33:59 itojun Exp $ */ /* crypto/bf/bf_enc.c */ -/* Copyright (C) 1995-1997 Eric Young (eay@mincom.oz.au) +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) * All rights reserved. * * This package is an SSL implementation written - * by Eric Young (eay@mincom.oz.au). + * by Eric Young (eay@cryptsoft.com). * The implementation was written so as to conform with Netscapes SSL. * * This library is free for commercial and non-commercial use as long as @@ -14,7 +14,7 @@ * apply to all code found in this distribution, be it the RC4, RSA, * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@mincom.oz.au). + * except that the holder is Tim Hudson (tjh@cryptsoft.com). * * Copyright remains Eric Young's, and as such any Copyright notices in * the code are not to be removed. @@ -34,12 +34,12 @@ * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * "This product includes cryptographic software written by - * Eric Young (eay@mincom.oz.au)" + * Eric Young (eay@cryptsoft.com)" * The word 'cryptographic' can be left out if the rouines from the library * being used are not cryptographic related :-). * 4. If you include any Windows specific code (or a derivative thereof) from * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@mincom.oz.au)" + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" * * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -75,10 +75,9 @@ /* XXX "data" is host endian */ void -BF_encrypt(data, key, encrypt) +BF_encrypt(data, key) BF_LONG *data; BF_KEY *key; - int encrypt; { register BF_LONG l, r, *p, *s; @@ -87,57 +86,73 @@ l = data[0]; r = data[1]; - if (encrypt) { - l^=p[0]; - BF_ENC(r, l, s, p[ 1]); - BF_ENC(l, r, s, p[ 2]); - BF_ENC(r, l, s, p[ 3]); - BF_ENC(l, r, s, p[ 4]); - BF_ENC(r, l, s, p[ 5]); - BF_ENC(l, r, s, p[ 6]); - BF_ENC(r, l, s, p[ 7]); - BF_ENC(l, r, s, p[ 8]); - BF_ENC(r, l, s, p[ 9]); - BF_ENC(l, r, s, p[10]); - BF_ENC(r, l, s, p[11]); - BF_ENC(l, r, s, p[12]); - BF_ENC(r, l, s, p[13]); - BF_ENC(l, r, s, p[14]); - BF_ENC(r, l, s, p[15]); - BF_ENC(l, r, s, p[16]); + l^=p[0]; + BF_ENC(r, l, s, p[ 1]); + BF_ENC(l, r, s, p[ 2]); + BF_ENC(r, l, s, p[ 3]); + BF_ENC(l, r, s, p[ 4]); + BF_ENC(r, l, s, p[ 5]); + BF_ENC(l, r, s, p[ 
6]); + BF_ENC(r, l, s, p[ 7]); + BF_ENC(l, r, s, p[ 8]); + BF_ENC(r, l, s, p[ 9]); + BF_ENC(l, r, s, p[10]); + BF_ENC(r, l, s, p[11]); + BF_ENC(l, r, s, p[12]); + BF_ENC(r, l, s, p[13]); + BF_ENC(l, r, s, p[14]); + BF_ENC(r, l, s, p[15]); + BF_ENC(l, r, s, p[16]); #if BF_ROUNDS == 20 - BF_ENC(r, l, s, p[17]); - BF_ENC(l, r, s, p[18]); - BF_ENC(r, l, s, p[19]); - BF_ENC(l, r, s, p[20]); + BF_ENC(r, l, s, p[17]); + BF_ENC(l, r, s, p[18]); + BF_ENC(r, l, s, p[19]); + BF_ENC(l, r, s, p[20]); #endif - r ^= p[BF_ROUNDS + 1]; - } else { - l ^= p[BF_ROUNDS + 1]; + r ^= p[BF_ROUNDS + 1]; + + data[1] = l & 0xffffffff; + data[0] = r & 0xffffffff; +} + +/* XXX "data" is host endian */ +void +BF_decrypt(data, key) + BF_LONG *data; + BF_KEY *key; +{ + register BF_LONG l, r, *p, *s; + + p = key->P; + s= &key->S[0]; + l = data[0]; + r = data[1]; + + l ^= p[BF_ROUNDS + 1]; #if BF_ROUNDS == 20 - BF_ENC(r, l, s, p[20]); - BF_ENC(l, r, s, p[19]); - BF_ENC(r, l, s, p[18]); - BF_ENC(l, r, s, p[17]); + BF_ENC(r, l, s, p[20]); + BF_ENC(l, r, s, p[19]); + BF_ENC(r, l, s, p[18]); + BF_ENC(l, r, s, p[17]); #endif - BF_ENC(r, l, s, p[16]); - BF_ENC(l, r, s, p[15]); - BF_ENC(r, l, s, p[14]); - BF_ENC(l, r, s, p[13]); - BF_ENC(r, l, s, p[12]); - BF_ENC(l, r, s, p[11]); - BF_ENC(r, l, s, p[10]); - BF_ENC(l, r, s, p[ 9]); - BF_ENC(r, l, s, p[ 8]); - BF_ENC(l, r, s, p[ 7]); - BF_ENC(r, l, s, p[ 6]); - BF_ENC(l, r, s, p[ 5]); - BF_ENC(r, l, s, p[ 4]); - BF_ENC(l, r, s, p[ 3]); - BF_ENC(r, l, s, p[ 2]); - BF_ENC(l, r, s, p[ 1]); - r ^= p[0]; - } + BF_ENC(r, l, s, p[16]); + BF_ENC(l, r, s, p[15]); + BF_ENC(r, l, s, p[14]); + BF_ENC(l, r, s, p[13]); + BF_ENC(r, l, s, p[12]); + BF_ENC(l, r, s, p[11]); + BF_ENC(r, l, s, p[10]); + BF_ENC(l, r, s, p[ 9]); + BF_ENC(r, l, s, p[ 8]); + BF_ENC(l, r, s, p[ 7]); + BF_ENC(r, l, s, p[ 6]); + BF_ENC(l, r, s, p[ 5]); + BF_ENC(r, l, s, p[ 4]); + BF_ENC(l, r, s, p[ 3]); + BF_ENC(r, l, s, p[ 2]); + BF_ENC(l, r, s, p[ 1]); + r ^= p[0]; + data[1] = l & 0xffffffff; 
data[0] = r & 0xffffffff; } diff -urN xnu-344.49/bsd/crypto/blowfish/bf_pi.h xnu-517/bsd/crypto/blowfish/bf_pi.h --- xnu-344.49/bsd/crypto/blowfish/bf_pi.h Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/blowfish/bf_pi.h Sat Oct 25 00:25:25 2003 @@ -59,7 +59,7 @@ * [including the GNU Public Licence.] */ -static BF_KEY bf_init= { +static const BF_KEY bf_init= { { 0x243f6a88L, 0x85a308d3L, 0x13198a2eL, 0x03707344L, 0xa4093822L, 0x299f31d0L, 0x082efa98L, 0xec4e6c89L, diff -urN xnu-344.49/bsd/crypto/blowfish/bf_skey.c xnu-517/bsd/crypto/blowfish/bf_skey.c --- xnu-344.49/bsd/crypto/blowfish/bf_skey.c Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/blowfish/bf_skey.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/crypto/blowfish/bf_skey.c,v 1.1.2.2 2001/07/03 11:01:28 ume Exp $ */ -/* $KAME: bf_skey.c,v 1.5 2000/11/06 13:58:08 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/blowfish/bf_skey.c,v 1.1.2.3 2002/03/26 10:12:23 ume Exp $ */ +/* $KAME: bf_skey.c,v 1.7 2002/02/27 01:33:59 itojun Exp $ */ /* crypto/bf/bf_skey.c */ /* Copyright (C) 1995-1997 Eric Young (eay@mincom.oz.au) @@ -76,7 +76,7 @@ BF_LONG *p, ri, in[2]; unsigned char *d, *end; - memcpy((char *)key, (char *)&bf_init, sizeof(BF_KEY)); + memcpy((char *)key, (const char *)&bf_init, sizeof(BF_KEY)); p = key->P; if (len > ((BF_ROUNDS + 2) * 4)) @@ -106,14 +106,14 @@ in[0] = 0L; in[1] = 0L; for (i = 0; i < BF_ROUNDS + 2; i += 2) { - BF_encrypt(in, key, BF_ENCRYPT); + BF_encrypt(in, key); p[i ] = in[0]; p[i+1] = in[1]; } p = key->S; for (i = 0; i < 4 * 256; i += 2) { - BF_encrypt(in, key, BF_ENCRYPT); + BF_encrypt(in, key); p[i ] = in[0]; p[i+1] = in[1]; } diff -urN xnu-344.49/bsd/crypto/blowfish/blowfish.h xnu-517/bsd/crypto/blowfish/blowfish.h --- xnu-344.49/bsd/crypto/blowfish/blowfish.h Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/blowfish/blowfish.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/crypto/blowfish/blowfish.h,v 1.1.2.2 2001/07/03 11:01:28 ume Exp $ */ -/* $KAME: 
blowfish.h,v 1.10 2000/09/18 21:21:20 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/blowfish/blowfish.h,v 1.1.2.3 2002/03/26 10:12:23 ume Exp $ */ +/* $KAME: blowfish.h,v 1.12 2002/02/27 01:33:59 itojun Exp $ */ /* crypto/bf/blowfish.h */ /* Copyright (C) 1995-1997 Eric Young (eay@mincom.oz.au) @@ -81,7 +81,11 @@ } BF_KEY; void BF_set_key __P((BF_KEY *, int, unsigned char *)); -void BF_encrypt __P((BF_LONG *, BF_KEY *, int)); +void BF_encrypt __P((BF_LONG *, BF_KEY *)); +void BF_decrypt __P((BF_LONG *, BF_KEY *)); +void BF_cbc_encrypt(const unsigned char *, unsigned char *, long, + const BF_KEY *, unsigned char *, int); + #ifdef __cplusplus } #endif diff -urN xnu-344.49/bsd/crypto/des/des.h xnu-517/bsd/crypto/des/des.h --- xnu-344.49/bsd/crypto/des/des.h Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/des/des.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/crypto/des/des.h,v 1.1.2.2 2001/07/03 11:01:31 ume Exp $ */ -/* $KAME: des.h,v 1.7 2000/09/18 20:59:21 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/des/des.h,v 1.1.2.3 2002/03/26 10:12:24 ume Exp $ */ +/* $KAME: des.h,v 1.8 2001/09/10 04:03:57 itojun Exp $ */ /* lib/des/des.h */ /* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) @@ -61,15 +61,14 @@ typedef unsigned char des_cblock[8]; typedef struct des_ks_struct { - union { - des_cblock _; - /* make sure things are correct size on machines with - * 8 byte longs */ - DES_LONG pad[2]; - } ks; -#undef _ -#define _ ks._ - } des_key_schedule[16]; + union { + des_cblock cblock; + /* make sure things are correct size on machines with + * 8 byte longs */ + DES_LONG deslong[2]; + } ks; + int weak_key; +} des_key_schedule[16]; #define DES_KEY_SZ (sizeof(des_cblock)) #define DES_SCHEDULE_SZ (sizeof(des_key_schedule)) @@ -85,13 +84,32 @@ char *des_options __P((void)); void des_ecb_encrypt __P((des_cblock *, des_cblock *, des_key_schedule, int)); -void des_encrypt __P((DES_LONG *, des_key_schedule, int)); + +void des_encrypt1 __P((DES_LONG *, 
des_key_schedule, int)); void des_encrypt2 __P((DES_LONG *, des_key_schedule, int)); +void des_encrypt3 __P((DES_LONG *, des_key_schedule, des_key_schedule, + des_key_schedule)); +void des_decrypt3 __P((DES_LONG *, des_key_schedule, des_key_schedule, + des_key_schedule)); + +void des_ecb3_encrypt __P((des_cblock *, des_cblock *, des_key_schedule, + des_key_schedule, des_key_schedule, int)); + +void des_ncbc_encrypt __P((const unsigned char *, unsigned char *, long, + des_key_schedule, des_cblock *, int)); + +void des_ede3_cbc_encrypt(const unsigned char *, unsigned char *, long, + des_key_schedule, des_key_schedule, + des_key_schedule, des_cblock *, int); void des_set_odd_parity __P((des_cblock *)); +void des_fixup_key_parity __P((des_cblock *)); int des_is_weak_key __P((des_cblock *)); int des_set_key __P((des_cblock *, des_key_schedule)); int des_key_sched __P((des_cblock *, des_key_schedule)); +int des_set_key_checked __P((des_cblock *, des_key_schedule)); +void des_set_key_unchecked __P((des_cblock *, des_key_schedule)); +int des_check_key_parity __P((des_cblock *)); #ifdef __cplusplus } diff -urN xnu-344.49/bsd/crypto/des/des_ecb.c xnu-517/bsd/crypto/des/des_ecb.c --- xnu-344.49/bsd/crypto/des/des_ecb.c Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/des/des_ecb.c Sat Oct 25 00:25:25 2003 @@ -1,8 +1,8 @@ -/* $FreeBSD: src/sys/crypto/des/des_ecb.c,v 1.1.2.2 2001/07/03 11:01:31 ume Exp $ */ -/* $KAME: des_ecb.c,v 1.5 2000/11/06 13:58:08 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/des/des_ecb.c,v 1.1.2.3 2002/03/26 10:12:24 ume Exp $ */ +/* $KAME: des_ecb.c,v 1.6 2001/09/10 04:03:58 itojun Exp $ */ /* crypto/des/ecb_enc.c */ -/* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) +/* Copyright (C) 1995-1998 Eric Young (eay@mincom.oz.au) * All rights reserved. 
* * This file is part of an SSL implementation written @@ -53,182 +53,84 @@ #include #include -char *libdes_version="libdes v 3.24 - 20-Apr-1996 - eay"; -char *DES_version="DES part of SSLeay 0.6.4 30-Aug-1996"; +/* char *libdes_version="libdes v 3.24 - 20-Apr-1996 - eay"; */ /* wrong */ +/* char *DES_version="DES part of SSLeay 0.6.4 30-Aug-1996"; */ + +char *des_options(void) + { + static int init=1; + static char buf[32]; + + if (init) + { + const char *ptr,*unroll,*risc,*size; -char *des_options() - { #ifdef DES_PTR - if (sizeof(DES_LONG) != sizeof(long)) - return("des(ptr,int)"); - else - return("des(ptr,long)"); + ptr="ptr"; #else - if (sizeof(DES_LONG) != sizeof(long)) - return("des(idx,int)"); - else - return("des(idx,long)"); + ptr="idx"; #endif - } - - -void des_ecb_encrypt(input, output, ks, encrypt) -des_cblock (*input); -des_cblock (*output); -des_key_schedule ks; -int encrypt; - { +#if defined(DES_RISC1) || defined(DES_RISC2) +#ifdef DES_RISC1 + risc="risc1"; +#endif +#ifdef DES_RISC2 + risc="risc2"; +#endif +#else + risc="cisc"; +#endif +#ifdef DES_UNROLL + unroll="16"; +#else + unroll="4"; +#endif + if (sizeof(DES_LONG) != sizeof(long)) + size="int"; + else + size="long"; + sprintf(buf,"des(%s,%s,%s,%s)",ptr,risc,unroll,size); + init=0; + } + return(buf); +} +void des_ecb_encrypt(des_cblock *input, des_cblock *output, + des_key_schedule ks, int enc) +{ register DES_LONG l; - register unsigned char *in,*out; DES_LONG ll[2]; + const unsigned char *in=&(*input)[0]; + unsigned char *out = &(*output)[0]; - in=(unsigned char *)input; - out=(unsigned char *)output; c2l(in,l); ll[0]=l; c2l(in,l); ll[1]=l; - des_encrypt(ll,ks,encrypt); + des_encrypt1(ll,ks,enc); l=ll[0]; l2c(l,out); l=ll[1]; l2c(l,out); l=ll[0]=ll[1]=0; - } +} -void des_encrypt(data, ks, encrypt) -DES_LONG *data; -des_key_schedule ks; -int encrypt; - { - register DES_LONG l,r,t,u; -#ifdef DES_PTR - register unsigned char *des_SP=(unsigned char *)des_SPtrans; -#endif -#ifdef undef - union 
fudge { - DES_LONG l; - unsigned short s[2]; - unsigned char c[4]; - } U,T; -#endif - register int i; - register DES_LONG *s; - - u=data[0]; - r=data[1]; +void des_ecb3_encrypt(des_cblock *input, des_cblock *output, + des_key_schedule ks1, des_key_schedule ks2, des_key_schedule ks3, + int enc) +{ + register DES_LONG l0,l1; + DES_LONG ll[2]; + const unsigned char *in = &(*input)[0]; + unsigned char *out = &(*output)[0]; + + c2l(in,l0); + c2l(in,l1); + ll[0]=l0; + ll[1]=l1; - IP(u,r); - /* Things have been modified so that the initial rotate is - * done outside the loop. This required the - * des_SPtrans values in sp.h to be rotated 1 bit to the right. - * One perl script later and things have a 5% speed up on a sparc2. - * Thanks to Richard Outerbridge <71755.204@CompuServe.COM> - * for pointing this out. */ - l=(r<<1)|(r>>31); - r=(u<<1)|(u>>31); - - /* clear the top bits on machines with 8byte longs */ - l&=0xffffffffL; - r&=0xffffffffL; - - s=(DES_LONG *)ks; - /* I don't know if it is worth the effort of loop unrolling the - * inner loop - */ - if (encrypt) - { - for (i=0; i<32; i+=8) - { - D_ENCRYPT(l,r,i+0); /* 1 */ - D_ENCRYPT(r,l,i+2); /* 2 */ - D_ENCRYPT(l,r,i+4); /* 3 */ - D_ENCRYPT(r,l,i+6); /* 4 */ - } - } + if (enc) + des_encrypt3(ll,ks1,ks2,ks3); else - { - for (i=30; i>0; i-=8) - { - D_ENCRYPT(l,r,i-0); /* 16 */ - D_ENCRYPT(r,l,i-2); /* 15 */ - D_ENCRYPT(l,r,i-4); /* 14 */ - D_ENCRYPT(r,l,i-6); /* 13 */ - } - } - l=(l>>1)|(l<<31); - r=(r>>1)|(r<<31); - /* clear the top bits on machines with 8byte longs */ - l&=0xffffffffL; - r&=0xffffffffL; - - FP(r,l); - data[0]=l; - data[1]=r; - l=r=t=u=0; - } - -void des_encrypt2(data, ks, encrypt) -DES_LONG *data; -des_key_schedule ks; -int encrypt; - { - register DES_LONG l,r,t,u; -#ifdef DES_PTR - register unsigned char *des_SP=(unsigned char *)des_SPtrans; -#endif -#ifdef undef - union fudge { - DES_LONG l; - unsigned short s[2]; - unsigned char c[4]; - } U,T; -#endif - register int i; - register DES_LONG *s; + 
des_decrypt3(ll,ks1,ks2,ks3); - u=data[0]; - r=data[1]; - - /* Things have been modified so that the initial rotate is - * done outside the loop. This required the - * des_SPtrans values in sp.h to be rotated 1 bit to the right. - * One perl script later and things have a 5% speed up on a sparc2. - * Thanks to Richard Outerbridge <71755.204@CompuServe.COM> - * for pointing this out. */ - l=(r<<1)|(r>>31); - r=(u<<1)|(u>>31); - - /* clear the top bits on machines with 8byte longs */ - l&=0xffffffffL; - r&=0xffffffffL; - - s=(DES_LONG *)ks; - /* I don't know if it is worth the effort of loop unrolling the - * inner loop */ - if (encrypt) - { - for (i=0; i<32; i+=8) - { - D_ENCRYPT(l,r,i+0); /* 1 */ - D_ENCRYPT(r,l,i+2); /* 2 */ - D_ENCRYPT(l,r,i+4); /* 3 */ - D_ENCRYPT(r,l,i+6); /* 4 */ - } - } - else - { - for (i=30; i>0; i-=8) - { - D_ENCRYPT(l,r,i-0); /* 16 */ - D_ENCRYPT(r,l,i-2); /* 15 */ - D_ENCRYPT(l,r,i-4); /* 14 */ - D_ENCRYPT(r,l,i-6); /* 13 */ - } - } - l=(l>>1)|(l<<31); - r=(r>>1)|(r<<31); - /* clear the top bits on machines with 8byte longs */ - l&=0xffffffffL; - r&=0xffffffffL; - - data[0]=l; - data[1]=r; - l=r=t=u=0; - } + l0=ll[0]; + l1=ll[1]; + l2c(l0,out); + l2c(l1,out); +} diff -urN xnu-344.49/bsd/crypto/des/des_enc.c xnu-517/bsd/crypto/des/des_enc.c --- xnu-344.49/bsd/crypto/des/des_enc.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/crypto/des/des_enc.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,294 @@ +/* $KAME: kame/kame/sys/crypto/des/des_enc.c,v 1.1 2001/09/10 04:03:58 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/des/des_enc.c,v 1.1.2.1 2002/03/26 10:12:24 ume Exp $ */ + +/* crypto/des/des_enc.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. 
+ * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. 
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include +#include + +extern const DES_LONG des_SPtrans[8][64]; + +void des_encrypt1(DES_LONG *data, des_key_schedule ks, int enc) +{ /* Single DES on one block, in place: data[0]/data[1] are loaded, passed through IP(), rotated, run through 16 D_ENCRYPT rounds over ks->ks.deslong (subkey index ascending when enc is non-zero, descending otherwise), rotated back, passed through FP() and stored back into data[]. */ + register DES_LONG l,r,t,u; +#ifdef DES_PTR + register const unsigned char *des_SP=(const unsigned char *)des_SPtrans; +#endif +#ifndef DES_UNROLL + register int i; +#endif + register DES_LONG *s; + + r=data[0]; + l=data[1]; + + IP(r,l); + /* Things have been modified so that the initial rotate is + * done outside the loop. This required the + * des_SPtrans values in sp.h to be rotated 1 bit to the right. + * One perl script later and things have a 5% speed up on a sparc2. + * Thanks to Richard Outerbridge <71755.204@CompuServe.COM> + * for pointing this out.
*/ + /* clear the top bits on machines with 8byte longs */ + /* shift left by 2 */ + r=ROTATE(r,29)&0xffffffffL; + l=ROTATE(l,29)&0xffffffffL; + + s=ks->ks.deslong; + /* I don't know if it is worth the effort of loop unrolling the + * inner loop */ + if (enc) + { /* subkey pairs are consumed in ascending order, 0 .. 30 */ +#ifdef DES_UNROLL + D_ENCRYPT(l,r, 0); /* 1 */ + D_ENCRYPT(r,l, 2); /* 2 */ + D_ENCRYPT(l,r, 4); /* 3 */ + D_ENCRYPT(r,l, 6); /* 4 */ + D_ENCRYPT(l,r, 8); /* 5 */ + D_ENCRYPT(r,l,10); /* 6 */ + D_ENCRYPT(l,r,12); /* 7 */ + D_ENCRYPT(r,l,14); /* 8 */ + D_ENCRYPT(l,r,16); /* 9 */ + D_ENCRYPT(r,l,18); /* 10 */ + D_ENCRYPT(l,r,20); /* 11 */ + D_ENCRYPT(r,l,22); /* 12 */ + D_ENCRYPT(l,r,24); /* 13 */ + D_ENCRYPT(r,l,26); /* 14 */ + D_ENCRYPT(l,r,28); /* 15 */ + D_ENCRYPT(r,l,30); /* 16 */ +#else + for (i=0; i<32; i+=8) + { + D_ENCRYPT(l,r,i+0); /* 1 */ + D_ENCRYPT(r,l,i+2); /* 2 */ + D_ENCRYPT(l,r,i+4); /* 3 */ + D_ENCRYPT(r,l,i+6); /* 4 */ + } +#endif + } + else + { /* same rounds with the subkey pairs taken in descending order, 30 .. 0 */ +#ifdef DES_UNROLL + D_ENCRYPT(l,r,30); /* 16 */ + D_ENCRYPT(r,l,28); /* 15 */ + D_ENCRYPT(l,r,26); /* 14 */ + D_ENCRYPT(r,l,24); /* 13 */ + D_ENCRYPT(l,r,22); /* 12 */ + D_ENCRYPT(r,l,20); /* 11 */ + D_ENCRYPT(l,r,18); /* 10 */ + D_ENCRYPT(r,l,16); /* 9 */ + D_ENCRYPT(l,r,14); /* 8 */ + D_ENCRYPT(r,l,12); /* 7 */ + D_ENCRYPT(l,r,10); /* 6 */ + D_ENCRYPT(r,l, 8); /* 5 */ + D_ENCRYPT(l,r, 6); /* 4 */ + D_ENCRYPT(r,l, 4); /* 3 */ + D_ENCRYPT(l,r, 2); /* 2 */ + D_ENCRYPT(r,l, 0); /* 1 */ +#else + for (i=30; i>0; i-=8) + { + D_ENCRYPT(l,r,i-0); /* 16 */ + D_ENCRYPT(r,l,i-2); /* 15 */ + D_ENCRYPT(l,r,i-4); /* 14 */ + D_ENCRYPT(r,l,i-6); /* 13 */ + } +#endif + } + + /* rotate and clear the top bits on machines with 8byte longs */ + l=ROTATE(l,3)&0xffffffffL; + r=ROTATE(r,3)&0xffffffffL; + + FP(r,l); + data[0]=l; + data[1]=r; + l=r=t=u=0; /* NOTE(review): presumably wipes the working values before returning - confirm intent */ +} + +void des_encrypt2(DES_LONG *data, des_key_schedule ks, int enc) +{ /* Same 16-round core as des_encrypt1 but without the IP()/FP() calls; the callers visible here (des_encrypt3/des_decrypt3) apply IP() before and FP() after their three passes. */ + register DES_LONG l,r,t,u; +#ifdef DES_PTR + register const unsigned char *des_SP=(const unsigned char *)des_SPtrans; +#endif +#ifndef
DES_UNROLL + register int i; +#endif + register DES_LONG *s; + + r=data[0]; + l=data[1]; + + /* Things have been modified so that the initial rotate is + * done outside the loop. This required the + * des_SPtrans values in sp.h to be rotated 1 bit to the right. + * One perl script later and things have a 5% speed up on a sparc2. + * Thanks to Richard Outerbridge <71755.204@CompuServe.COM> + * for pointing this out. */ + /* clear the top bits on machines with 8byte longs */ + r=ROTATE(r,29)&0xffffffffL; + l=ROTATE(l,29)&0xffffffffL; + + s=ks->ks.deslong; + /* I don't know if it is worth the effort of loop unrolling the + * inner loop */ + if (enc) + { /* subkey pairs are consumed in ascending order, 0 .. 30 */ +#ifdef DES_UNROLL + D_ENCRYPT(l,r, 0); /* 1 */ + D_ENCRYPT(r,l, 2); /* 2 */ + D_ENCRYPT(l,r, 4); /* 3 */ + D_ENCRYPT(r,l, 6); /* 4 */ + D_ENCRYPT(l,r, 8); /* 5 */ + D_ENCRYPT(r,l,10); /* 6 */ + D_ENCRYPT(l,r,12); /* 7 */ + D_ENCRYPT(r,l,14); /* 8 */ + D_ENCRYPT(l,r,16); /* 9 */ + D_ENCRYPT(r,l,18); /* 10 */ + D_ENCRYPT(l,r,20); /* 11 */ + D_ENCRYPT(r,l,22); /* 12 */ + D_ENCRYPT(l,r,24); /* 13 */ + D_ENCRYPT(r,l,26); /* 14 */ + D_ENCRYPT(l,r,28); /* 15 */ + D_ENCRYPT(r,l,30); /* 16 */ +#else + for (i=0; i<32; i+=8) + { + D_ENCRYPT(l,r,i+0); /* 1 */ + D_ENCRYPT(r,l,i+2); /* 2 */ + D_ENCRYPT(l,r,i+4); /* 3 */ + D_ENCRYPT(r,l,i+6); /* 4 */ + } +#endif + } + else + { /* same rounds with the subkey pairs taken in descending order, 30 .. 0 */ +#ifdef DES_UNROLL + D_ENCRYPT(l,r,30); /* 16 */ + D_ENCRYPT(r,l,28); /* 15 */ + D_ENCRYPT(l,r,26); /* 14 */ + D_ENCRYPT(r,l,24); /* 13 */ + D_ENCRYPT(l,r,22); /* 12 */ + D_ENCRYPT(r,l,20); /* 11 */ + D_ENCRYPT(l,r,18); /* 10 */ + D_ENCRYPT(r,l,16); /* 9 */ + D_ENCRYPT(l,r,14); /* 8 */ + D_ENCRYPT(r,l,12); /* 7 */ + D_ENCRYPT(l,r,10); /* 6 */ + D_ENCRYPT(r,l, 8); /* 5 */ + D_ENCRYPT(l,r, 6); /* 4 */ + D_ENCRYPT(r,l, 4); /* 3 */ + D_ENCRYPT(l,r, 2); /* 2 */ + D_ENCRYPT(r,l, 0); /* 1 */ +#else + for (i=30; i>0; i-=8) + { + D_ENCRYPT(l,r,i-0); /* 16 */ + D_ENCRYPT(r,l,i-2); /* 15 */ + D_ENCRYPT(l,r,i-4); /* 14 */ + D_ENCRYPT(r,l,i-6); /* 13 */ + } +#endif + } +
/* rotate and clear the top bits on machines with 8byte longs */ + data[0]=ROTATE(l,3)&0xffffffffL; + data[1]=ROTATE(r,3)&0xffffffffL; + l=r=t=u=0; /* NOTE(review): presumably wipes the working values before returning - confirm intent */ +} + +void des_encrypt3(DES_LONG *data, des_key_schedule ks1, des_key_schedule ks2, + des_key_schedule ks3) +{ /* Three-key pass over one block, in place: IP() once, then des_encrypt2 with ks1 (DES_ENCRYPT), ks2 (DES_DECRYPT) and ks3 (DES_ENCRYPT), then FP() once and the result is stored back into data[]. */ + register DES_LONG l,r; + + l=data[0]; + r=data[1]; + IP(l,r); + data[0]=l; + data[1]=r; + des_encrypt2((DES_LONG *)data,ks1,DES_ENCRYPT); + des_encrypt2((DES_LONG *)data,ks2,DES_DECRYPT); + des_encrypt2((DES_LONG *)data,ks3,DES_ENCRYPT); + l=data[0]; + r=data[1]; + FP(r,l); + data[0]=l; + data[1]=r; +} + +void des_decrypt3(DES_LONG *data, des_key_schedule ks1, des_key_schedule ks2, + des_key_schedule ks3) +{ /* Mirror of des_encrypt3: the schedules are applied in the order ks3 (DES_DECRYPT), ks2 (DES_ENCRYPT), ks1 (DES_DECRYPT), again wrapped in a single IP()/FP() pair. */ + register DES_LONG l,r; + + l=data[0]; + r=data[1]; + IP(l,r); + data[0]=l; + data[1]=r; + des_encrypt2((DES_LONG *)data,ks3,DES_DECRYPT); + des_encrypt2((DES_LONG *)data,ks2,DES_ENCRYPT); + des_encrypt2((DES_LONG *)data,ks1,DES_DECRYPT); + l=data[0]; + r=data[1]; + FP(r,l); + data[0]=l; + data[1]=r; +} diff -urN xnu-344.49/bsd/crypto/des/des_locl.h xnu-517/bsd/crypto/des/des_locl.h --- xnu-344.49/bsd/crypto/des/des_locl.h Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/des/des_locl.h Sat Oct 25 00:25:25 2003 @@ -1,8 +1,8 @@ -/* $FreeBSD: src/sys/crypto/des/des_locl.h,v 1.2.2.2 2001/07/03 11:01:31 ume Exp $ */ -/* $KAME: des_locl.h,v 1.6 2000/11/06 13:58:09 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/des/des_locl.h,v 1.2.2.3 2002/03/26 10:12:25 ume Exp $ */ +/* $KAME: des_locl.h,v 1.7 2001/09/10 04:03:58 itojun Exp $ */ -/* lib/des/des_locl.h */ -/* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) +/* crypto/des/des_locl.h */ +/* Copyright (C) 1995-1997 Eric Young (eay@mincom.oz.au) * All rights reserved. * * This file is part of an SSL implementation written @@ -47,13 +47,6 @@ * copied and put under another distribution licence * [including the GNU Public Licence.]
*/ -/* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING - * - * Always modify des_locl.org since des_locl.h is automatically generated from - * it during SSLeay configuration. - * - * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING - */ #ifndef HEADER_DES_LOCL_H #define HEADER_DES_LOCL_H @@ -130,6 +123,11 @@ #define ROTATE(a,n) (((a)>>(n))+((a)<<(32-(n)))) +#define LOAD_DATA_tmp(a,b,c,d,e,f) LOAD_DATA(a,b,c,d,e,f,g) /* the 7th argument is never expanded by LOAD_DATA, so the placeholder g is harmless */ +#define LOAD_DATA(R,S,u,t,E0,E1,tmp) \ + u=R^s[S ]; \ + t=R^s[S+1] + /* The changes to this macro may help or hinder, depending on the * compiler and the achitecture. gcc2 always seems to do well :-). * Inspired by Dana How @@ -138,49 +136,159 @@ * bytes, probably an issue of accessing non-word aligned objects :-( */ #ifdef DES_PTR -#define D_ENCRYPT(L,R,S) { \ - u=((R^s[S ])<<2); \ - t= R^s[S+1]; \ - t=ROTATE(t,2); \ - L^= (\ - *(DES_LONG *)((unsigned char *)des_SP+0x100+((t )&0xfc))+ \ - *(DES_LONG *)((unsigned char *)des_SP+0x300+((t>> 8)&0xfc))+ \ - *(DES_LONG *)((unsigned char *)des_SP+0x500+((t>>16)&0xfc))+ \ - *(DES_LONG *)((unsigned char *)des_SP+0x700+((t>>24)&0xfc))+ \ - *(DES_LONG *)((unsigned char *)des_SP +((u )&0xfc))+ \ - *(DES_LONG *)((unsigned char *)des_SP+0x200+((u>> 8)&0xfc))+ \ - *(DES_LONG *)((unsigned char *)des_SP+0x400+((u>>16)&0xfc))+ \ - *(DES_LONG *)((unsigned char *)des_SP+0x600+((u>>24)&0xfc))); } +/* It recently occurred to me that 0^0^0^0^0^0^0 == 0, so there + * is no reason to not xor all the sub items together.
This potentially + * saves a register since things can be xored directly into L */ + +#if defined(DES_RISC1) || defined(DES_RISC2) +#ifdef DES_RISC1 /* one XOR per statement; u and t are shifted down by 16 half way so the same >>8 / &0xfc extraction serves both halves */ +#define D_ENCRYPT(LL,R,S) { \ + unsigned int u1,u2,u3; \ + LOAD_DATA(R,S,u,t,E0,E1,u1); \ + u2=(int)u>>8L; \ + u1=(int)u&0xfc; \ + u2&=0xfc; \ + t=ROTATE(t,4); \ + u>>=16L; \ + LL^= *(const DES_LONG *)(des_SP +u1); \ + LL^= *(const DES_LONG *)(des_SP+0x200+u2); \ + u3=(int)(u>>8L); \ + u1=(int)u&0xfc; \ + u3&=0xfc; \ + LL^= *(const DES_LONG *)(des_SP+0x400+u1); \ + LL^= *(const DES_LONG *)(des_SP+0x600+u3); \ + u2=(int)t>>8L; \ + u1=(int)t&0xfc; \ + u2&=0xfc; \ + t>>=16L; \ + LL^= *(const DES_LONG *)(des_SP+0x100+u1); \ + LL^= *(const DES_LONG *)(des_SP+0x300+u2); \ + u3=(int)t>>8L; \ + u1=(int)t&0xfc; \ + u3&=0xfc; \ + LL^= *(const DES_LONG *)(des_SP+0x500+u1); \ + LL^= *(const DES_LONG *)(des_SP+0x700+u3); } +#endif /* DES_RISC1 */ +#ifdef DES_RISC2 /* u indexes the slices at 0x000/0x200/0x400/0x600 and t those at 0x100/0x300/0x500/0x700, exactly one lookup each, matching the other D_ENCRYPT variants; a pasted second copy of the t stanza that XORed t-indexed words from 0x400/0x600 has been dropped */ +#define D_ENCRYPT(LL,R,S) { \ + unsigned int u1,u2,s1,s2; \ + LOAD_DATA(R,S,u,t,E0,E1,u1); \ + u2=(int)u>>8L; \ + u1=(int)u&0xfc; \ + u2&=0xfc; \ + t=ROTATE(t,4); \ + LL^= *(const DES_LONG *)(des_SP +u1); \ + LL^= *(const DES_LONG *)(des_SP+0x200+u2); \ + s1=(int)(u>>16L); \ + s2=(int)(u>>24L); \ + s1&=0xfc; \ + s2&=0xfc; \ + LL^= *(const DES_LONG *)(des_SP+0x400+s1); \ + LL^= *(const DES_LONG *)(des_SP+0x600+s2); \ + u2=(int)t>>8L; \ + u1=(int)t&0xfc; \ + u2&=0xfc; \ + LL^= *(const DES_LONG *)(des_SP+0x100+u1); \ + LL^= *(const DES_LONG *)(des_SP+0x300+u2); \ + s1=(int)(t>>16L); \ + s2=(int)(t>>24L); \ + s1&=0xfc; \ + s2&=0xfc; \ + LL^= *(const DES_LONG *)(des_SP+0x500+s1); \ + LL^= *(const DES_LONG *)(des_SP+0x700+s2); } +#endif /* DES_RISC2 */ +#else /*
DES_RISC1 || DES_RISC2 */ +#define D_ENCRYPT(LL,R,S) { /* generic DES_PTR form: u=R^s[S], t=R^s[S+1] rotated by 4; each 0xfc-masked byte of u and t is a byte offset into one 0x100-spaced slice of des_SP, and the eight words are XORed into LL */ \ + LOAD_DATA_tmp(R,S,u,t,E0,E1); \ + t=ROTATE(t,4); \ + LL^= \ + *(const DES_LONG *)(des_SP +((u )&0xfc))^ \ + *(const DES_LONG *)(des_SP+0x200+((u>> 8L)&0xfc))^ \ + *(const DES_LONG *)(des_SP+0x400+((u>>16L)&0xfc))^ \ + *(const DES_LONG *)(des_SP+0x600+((u>>24L)&0xfc))^ \ + *(const DES_LONG *)(des_SP+0x100+((t )&0xfc))^ \ + *(const DES_LONG *)(des_SP+0x300+((t>> 8L)&0xfc))^ \ + *(const DES_LONG *)(des_SP+0x500+((t>>16L)&0xfc))^ \ + *(const DES_LONG *)(des_SP+0x700+((t>>24L)&0xfc)); } +#endif /* DES_RISC1 || DES_RISC2 */ #else /* original version */ -#ifdef undef -#define D_ENCRYPT(L,R,S) \ - U.l=R^s[S+1]; \ - T.s[0]=((U.s[0]>>4)|(U.s[1]<<12))&0x3f3f; \ - T.s[1]=((U.s[1]>>4)|(U.s[0]<<12))&0x3f3f; \ - U.l=(R^s[S ])&0x3f3f3f3fL; \ - L^= des_SPtrans[1][(T.c[0])]| \ - des_SPtrans[3][(T.c[1])]| \ - des_SPtrans[5][(T.c[2])]| \ - des_SPtrans[7][(T.c[3])]| \ - des_SPtrans[0][(U.c[0])]| \ - des_SPtrans[2][(U.c[1])]| \ - des_SPtrans[4][(U.c[2])]| \ - des_SPtrans[6][(U.c[3])]; -#else -#define D_ENCRYPT(Q,R,S) {\ - u=(R^s[S ]); \ - t=R^s[S+1]; \ + +#if defined(DES_RISC1) || defined(DES_RISC2) /* table-indexed variants: des_SPtrans[k][6-bit index] is used instead of byte offsets into des_SP */ +#ifdef DES_RISC1 +#define D_ENCRYPT(LL,R,S) {\ + unsigned int u1,u2,u3; \ + LOAD_DATA(R,S,u,t,E0,E1,u1); \ + u>>=2L; \ + t=ROTATE(t,6); \ + u2=(int)u>>8L; \ + u1=(int)u&0x3f; \ + u2&=0x3f; \ + u>>=16L; \ + LL^=des_SPtrans[0][u1]; \ + LL^=des_SPtrans[2][u2]; \ + u3=(int)u>>8L; \ + u1=(int)u&0x3f; \ + u3&=0x3f; \ + LL^=des_SPtrans[4][u1]; \ + LL^=des_SPtrans[6][u3]; \ + u2=(int)t>>8L; \ + u1=(int)t&0x3f; \ + u2&=0x3f; \ + t>>=16L; \ + LL^=des_SPtrans[1][u1]; \ + LL^=des_SPtrans[3][u2]; \ + u3=(int)t>>8L; \ + u1=(int)t&0x3f; \ + u3&=0x3f; \ + LL^=des_SPtrans[5][u1]; \ + LL^=des_SPtrans[7][u3]; } +#endif /* DES_RISC1 */ +#ifdef DES_RISC2 /* u feeds tables 0,2,4,6 and t feeds tables 1,3,5,7, one lookup each */ +#define D_ENCRYPT(LL,R,S) {\ + unsigned int u1,u2,s1,s2; \ + LOAD_DATA(R,S,u,t,E0,E1,u1); \ + u>>=2L; \ + t=ROTATE(t,6); \ + u2=(int)u>>8L; \ + u1=(int)u&0x3f; \ + u2&=0x3f; \ + LL^=des_SPtrans[0][u1];
\ + LL^=des_SPtrans[2][u2]; \ + s1=(int)u>>16L; \ + s2=(int)u>>24L; \ + s1&=0x3f; \ + s2&=0x3f; \ + LL^=des_SPtrans[4][s1]; \ + LL^=des_SPtrans[6][s2]; \ + u2=(int)t>>8L; \ + u1=(int)t&0x3f; \ + u2&=0x3f; \ + LL^=des_SPtrans[1][u1]; \ + LL^=des_SPtrans[3][u2]; \ + s1=(int)t>>16; \ + s2=(int)t>>24L; \ + s1&=0x3f; \ + s2&=0x3f; \ + LL^=des_SPtrans[5][s1]; \ + LL^=des_SPtrans[7][s2]; } +#endif /* DES_RISC2 */ + +#else /* DES_RISC1 || DES_RISC2 */ /* generic table-indexed form: t is rotated by 4, then eight 6-bit fields (shifts 2/10/18/26 of u and t) index des_SPtrans and are XORed into LL */ + +#define D_ENCRYPT(LL,R,S) {\ + LOAD_DATA_tmp(R,S,u,t,E0,E1); \ t=ROTATE(t,4); \ - Q^= des_SPtrans[1][(t )&0x3f]| \ - des_SPtrans[3][(t>> 8L)&0x3f]| \ - des_SPtrans[5][(t>>16L)&0x3f]| \ - des_SPtrans[7][(t>>24L)&0x3f]| \ - des_SPtrans[0][(u )&0x3f]| \ - des_SPtrans[2][(u>> 8L)&0x3f]| \ - des_SPtrans[4][(u>>16L)&0x3f]| \ - des_SPtrans[6][(u>>24L)&0x3f]; } -#endif -#endif + LL^=\ + des_SPtrans[0][(u>> 2L)&0x3f]^ \ + des_SPtrans[2][(u>>10L)&0x3f]^ \ + des_SPtrans[4][(u>>18L)&0x3f]^ \ + des_SPtrans[6][(u>>26L)&0x3f]^ \ + des_SPtrans[1][(t>> 2L)&0x3f]^ \ + des_SPtrans[3][(t>>10L)&0x3f]^ \ + des_SPtrans[5][(t>>18L)&0x3f]^ \ + des_SPtrans[7][(t>>26L)&0x3f]; } +#endif /* DES_RISC1 || DES_RISC2 */ +#endif /* DES_PTR */ /* IP and FP * The problem is more of a geometric problem that random bit fiddling.
diff -urN xnu-344.49/bsd/crypto/des/des_setkey.c xnu-517/bsd/crypto/des/des_setkey.c --- xnu-344.49/bsd/crypto/des/des_setkey.c Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/des/des_setkey.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/crypto/des/des_setkey.c,v 1.1.2.3 2001/07/10 09:46:35 ume Exp $ */ -/* $KAME: des_setkey.c,v 1.6 2001/07/03 14:27:53 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/des/des_setkey.c,v 1.1.2.4 2002/03/26 10:12:25 ume Exp $ */ +/* $KAME: des_setkey.c,v 1.7 2001/09/10 04:03:58 itojun Exp $ */ /* crypto/des/set_key.c */ /* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) @@ -61,22 +61,18 @@ #include #include -static int check_parity __P((des_cblock (*))); - int des_check_key=0; -void des_set_odd_parity(key) -des_cblock (*key); - { +void des_set_odd_parity(des_cblock *key) +{ int i; for (i=0; i>(n))^(b))&(m)),\ * (b)^=(t),\ * (a)=((a)^((t)<<(n)))) @@ -141,49 +138,48 @@ #define HPERM_OP(a,t,n,m) ((t)=((((a)<<(16-(n)))^(a))&(m)),\ (a)=(a)^(t)^(t>>(16-(n)))) +int des_set_key(des_cblock *key, des_key_schedule schedule) +{ + if (des_check_key) + { + return des_set_key_checked(key, schedule); + } + else + { + des_set_key_unchecked(key, schedule); + return 0; + } +} + /* return 0 if key parity is odd (correct), * return -1 if key parity error, * return -2 if illegal weak key. 
*/ -int des_set_key(key, schedule) -des_cblock (*key); -des_key_schedule schedule; - { +int des_set_key_checked(des_cblock *key, des_key_schedule schedule) +{ + if (!des_check_key_parity(key)) + return(-1); + if (des_is_weak_key(key)) + return(-2); + des_set_key_unchecked(key, schedule); + return 0; +} + +void des_set_key_unchecked(des_cblock *key, des_key_schedule schedule) +{ static int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0}; - register DES_LONG c,d,t,s; - register unsigned char *in; + register DES_LONG c,d,t,s,t2; + register const unsigned char *in; register DES_LONG *k; register int i; - if (des_check_key) - { - if (!check_parity(key)) - return(-1); - - if (des_is_weak_key(key)) - return(-2); - } - - k=(DES_LONG *)schedule; - in=(unsigned char *)key; + k = &schedule->ks.deslong[0]; + in = &(*key)[0]; c2l(in,c); c2l(in,d); - /* do PC1 in 60 simple operations */ -/* PERM_OP(d,c,t,4,0x0f0f0f0fL); - HPERM_OP(c,t,-2, 0xcccc0000L); - HPERM_OP(c,t,-1, 0xaaaa0000L); - HPERM_OP(c,t, 8, 0x00ff0000L); - HPERM_OP(c,t,-1, 0xaaaa0000L); - HPERM_OP(d,t,-8, 0xff000000L); - HPERM_OP(d,t, 8, 0x00ff0000L); - HPERM_OP(d,t, 2, 0x33330000L); - d=((d&0x00aa00aaL)<<7L)|((d&0x55005500L)>>7L)|(d&0xaa55aa55L); - d=(d>>8)|((c&0xf0000000L)>>4); - c&=0x0fffffffL; */ - - /* I now do it in 47 simple operations :-) + /* do PC1 in 47 simple operations :-) * Thanks to John Fletcher (john_fletcher@lccmail.ocf.llnl.gov) * for the inspiration. 
:-) */ PERM_OP (d,c,t,4,0x0f0f0f0fL); @@ -197,7 +193,7 @@ c&=0x0fffffffL; for (i=0; i>2L)|(c<<26L)); d=((d>>2L)|(d<<26L)); } else @@ -205,30 +201,32 @@ c&=0x0fffffffL; d&=0x0fffffffL; /* could be a few less shifts but I am to lazy at this - * point in time to investigate */ + * point in time to investigate */ s= des_skb[0][ (c )&0x3f ]| - des_skb[1][((c>> 6)&0x03)|((c>> 7L)&0x3c)]| - des_skb[2][((c>>13)&0x0f)|((c>>14L)&0x30)]| - des_skb[3][((c>>20)&0x01)|((c>>21L)&0x06) | - ((c>>22L)&0x38)]; + des_skb[1][((c>> 6L)&0x03)|((c>> 7L)&0x3c)]| + des_skb[2][((c>>13L)&0x0f)|((c>>14L)&0x30)]| + des_skb[3][((c>>20L)&0x01)|((c>>21L)&0x06) | + ((c>>22L)&0x38)]; t= des_skb[4][ (d )&0x3f ]| des_skb[5][((d>> 7L)&0x03)|((d>> 8L)&0x3c)]| des_skb[6][ (d>>15L)&0x3f ]| des_skb[7][((d>>21L)&0x0f)|((d>>22L)&0x30)]; /* table contained 0213 4657 */ - *(k++)=((t<<16L)|(s&0x0000ffffL))&0xffffffffL; - s= ((s>>16L)|(t&0xffff0000L)); - - s=(s<<4L)|(s>>28L); - *(k++)=s&0xffffffffL; - } - return(0); + t2=((t<<16L)|(s&0x0000ffffL))&0xffffffffL; + *(k++)=ROTATE(t2,30)&0xffffffffL; + + t2=((s>>16L)|(t&0xffff0000L)); + *(k++)=ROTATE(t2,26)&0xffffffffL; } +} -int des_key_sched(key, schedule) -des_cblock (*key); -des_key_schedule schedule; - { +int des_key_sched(des_cblock *key, des_key_schedule schedule) +{ return(des_set_key(key,schedule)); - } +} + +void des_fixup_key_parity(des_cblock *key) +{ + des_set_odd_parity(key); +} diff -urN xnu-344.49/bsd/crypto/des/spr.h xnu-517/bsd/crypto/des/spr.h --- xnu-344.49/bsd/crypto/des/spr.h Thu Sep 18 21:00:31 2003 +++ xnu-517/bsd/crypto/des/spr.h Sat Oct 25 00:25:25 2003 @@ -1,23 +1,28 @@ -/* $FreeBSD: src/sys/crypto/des/spr.h,v 1.1.2.1 2000/07/15 07:14:22 kris Exp $ */ -/* $KAME: spr.h,v 1.3 2000/03/27 04:36:35 sumikawa Exp $ */ +/* $FreeBSD: src/sys/crypto/des/spr.h,v 1.1.2.2 2002/03/26 10:12:25 ume Exp $ */ +/* $KAME: spr.h,v 1.4 2001/09/10 04:03:58 itojun Exp $ */ /* crypto/des/spr.h */ -/* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) +/* 
Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) * All rights reserved. * - * This file is part of an SSL implementation written - * by Eric Young (eay@mincom.oz.au). - * The implementation was written so as to conform with Netscapes SSL - * specification. This library and applications are - * FREE FOR COMMERCIAL AND NON-COMMERCIAL USE - * as long as the following conditions are aheared to. - * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. If this code is used in a product, - * Eric Young should be given attribution as the author of the parts used. + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. * This can be in the form of a textual message at program startup or * in documentation (online or textual) provided with the package. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -28,8 +33,14 @@ * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by Eric Young (eay@mincom.oz.au) - * + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -41,156 +52,156 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * + * * The licence and distribution terms for any publically available version or * derivative of this code cannot be changed. i.e. this code cannot simply be * copied and put under another distribution licence * [including the GNU Public Licence.] 
*/ -static const DES_LONG des_SPtrans[8][64]={ +const DES_LONG des_SPtrans[8][64]={ { /* nibble 0 */ -0x00820200L, 0x00020000L, 0x80800000L, 0x80820200L, -0x00800000L, 0x80020200L, 0x80020000L, 0x80800000L, -0x80020200L, 0x00820200L, 0x00820000L, 0x80000200L, -0x80800200L, 0x00800000L, 0x00000000L, 0x80020000L, -0x00020000L, 0x80000000L, 0x00800200L, 0x00020200L, -0x80820200L, 0x00820000L, 0x80000200L, 0x00800200L, -0x80000000L, 0x00000200L, 0x00020200L, 0x80820000L, -0x00000200L, 0x80800200L, 0x80820000L, 0x00000000L, -0x00000000L, 0x80820200L, 0x00800200L, 0x80020000L, -0x00820200L, 0x00020000L, 0x80000200L, 0x00800200L, -0x80820000L, 0x00000200L, 0x00020200L, 0x80800000L, -0x80020200L, 0x80000000L, 0x80800000L, 0x00820000L, -0x80820200L, 0x00020200L, 0x00820000L, 0x80800200L, -0x00800000L, 0x80000200L, 0x80020000L, 0x00000000L, -0x00020000L, 0x00800000L, 0x80800200L, 0x00820200L, -0x80000000L, 0x80820000L, 0x00000200L, 0x80020200L, +0x02080800L, 0x00080000L, 0x02000002L, 0x02080802L, +0x02000000L, 0x00080802L, 0x00080002L, 0x02000002L, +0x00080802L, 0x02080800L, 0x02080000L, 0x00000802L, +0x02000802L, 0x02000000L, 0x00000000L, 0x00080002L, +0x00080000L, 0x00000002L, 0x02000800L, 0x00080800L, +0x02080802L, 0x02080000L, 0x00000802L, 0x02000800L, +0x00000002L, 0x00000800L, 0x00080800L, 0x02080002L, +0x00000800L, 0x02000802L, 0x02080002L, 0x00000000L, +0x00000000L, 0x02080802L, 0x02000800L, 0x00080002L, +0x02080800L, 0x00080000L, 0x00000802L, 0x02000800L, +0x02080002L, 0x00000800L, 0x00080800L, 0x02000002L, +0x00080802L, 0x00000002L, 0x02000002L, 0x02080000L, +0x02080802L, 0x00080800L, 0x02080000L, 0x02000802L, +0x02000000L, 0x00000802L, 0x00080002L, 0x00000000L, +0x00080000L, 0x02000000L, 0x02000802L, 0x02080800L, +0x00000002L, 0x02080002L, 0x00000800L, 0x00080802L, },{ /* nibble 1 */ -0x10042004L, 0x00000000L, 0x00042000L, 0x10040000L, -0x10000004L, 0x00002004L, 0x10002000L, 0x00042000L, -0x00002000L, 0x10040004L, 0x00000004L, 0x10002000L, -0x00040004L, 
0x10042000L, 0x10040000L, 0x00000004L, -0x00040000L, 0x10002004L, 0x10040004L, 0x00002000L, -0x00042004L, 0x10000000L, 0x00000000L, 0x00040004L, -0x10002004L, 0x00042004L, 0x10042000L, 0x10000004L, -0x10000000L, 0x00040000L, 0x00002004L, 0x10042004L, -0x00040004L, 0x10042000L, 0x10002000L, 0x00042004L, -0x10042004L, 0x00040004L, 0x10000004L, 0x00000000L, -0x10000000L, 0x00002004L, 0x00040000L, 0x10040004L, -0x00002000L, 0x10000000L, 0x00042004L, 0x10002004L, -0x10042000L, 0x00002000L, 0x00000000L, 0x10000004L, -0x00000004L, 0x10042004L, 0x00042000L, 0x10040000L, -0x10040004L, 0x00040000L, 0x00002004L, 0x10002000L, -0x10002004L, 0x00000004L, 0x10040000L, 0x00042000L, +0x40108010L, 0x00000000L, 0x00108000L, 0x40100000L, +0x40000010L, 0x00008010L, 0x40008000L, 0x00108000L, +0x00008000L, 0x40100010L, 0x00000010L, 0x40008000L, +0x00100010L, 0x40108000L, 0x40100000L, 0x00000010L, +0x00100000L, 0x40008010L, 0x40100010L, 0x00008000L, +0x00108010L, 0x40000000L, 0x00000000L, 0x00100010L, +0x40008010L, 0x00108010L, 0x40108000L, 0x40000010L, +0x40000000L, 0x00100000L, 0x00008010L, 0x40108010L, +0x00100010L, 0x40108000L, 0x40008000L, 0x00108010L, +0x40108010L, 0x00100010L, 0x40000010L, 0x00000000L, +0x40000000L, 0x00008010L, 0x00100000L, 0x40100010L, +0x00008000L, 0x40000000L, 0x00108010L, 0x40008010L, +0x40108000L, 0x00008000L, 0x00000000L, 0x40000010L, +0x00000010L, 0x40108010L, 0x00108000L, 0x40100000L, +0x40100010L, 0x00100000L, 0x00008010L, 0x40008000L, +0x40008010L, 0x00000010L, 0x40100000L, 0x00108000L, },{ /* nibble 2 */ -0x41000000L, 0x01010040L, 0x00000040L, 0x41000040L, -0x40010000L, 0x01000000L, 0x41000040L, 0x00010040L, -0x01000040L, 0x00010000L, 0x01010000L, 0x40000000L, -0x41010040L, 0x40000040L, 0x40000000L, 0x41010000L, -0x00000000L, 0x40010000L, 0x01010040L, 0x00000040L, -0x40000040L, 0x41010040L, 0x00010000L, 0x41000000L, -0x41010000L, 0x01000040L, 0x40010040L, 0x01010000L, -0x00010040L, 0x00000000L, 0x01000000L, 0x40010040L, -0x01010040L, 0x00000040L, 
0x40000000L, 0x00010000L, -0x40000040L, 0x40010000L, 0x01010000L, 0x41000040L, -0x00000000L, 0x01010040L, 0x00010040L, 0x41010000L, -0x40010000L, 0x01000000L, 0x41010040L, 0x40000000L, -0x40010040L, 0x41000000L, 0x01000000L, 0x41010040L, -0x00010000L, 0x01000040L, 0x41000040L, 0x00010040L, -0x01000040L, 0x00000000L, 0x41010000L, 0x40000040L, -0x41000000L, 0x40010040L, 0x00000040L, 0x01010000L, +0x04000001L, 0x04040100L, 0x00000100L, 0x04000101L, +0x00040001L, 0x04000000L, 0x04000101L, 0x00040100L, +0x04000100L, 0x00040000L, 0x04040000L, 0x00000001L, +0x04040101L, 0x00000101L, 0x00000001L, 0x04040001L, +0x00000000L, 0x00040001L, 0x04040100L, 0x00000100L, +0x00000101L, 0x04040101L, 0x00040000L, 0x04000001L, +0x04040001L, 0x04000100L, 0x00040101L, 0x04040000L, +0x00040100L, 0x00000000L, 0x04000000L, 0x00040101L, +0x04040100L, 0x00000100L, 0x00000001L, 0x00040000L, +0x00000101L, 0x00040001L, 0x04040000L, 0x04000101L, +0x00000000L, 0x04040100L, 0x00040100L, 0x04040001L, +0x00040001L, 0x04000000L, 0x04040101L, 0x00000001L, +0x00040101L, 0x04000001L, 0x04000000L, 0x04040101L, +0x00040000L, 0x04000100L, 0x04000101L, 0x00040100L, +0x04000100L, 0x00000000L, 0x04040001L, 0x00000101L, +0x04000001L, 0x00040101L, 0x00000100L, 0x04040000L, },{ /* nibble 3 */ -0x00100402L, 0x04000400L, 0x00000002L, 0x04100402L, -0x00000000L, 0x04100000L, 0x04000402L, 0x00100002L, -0x04100400L, 0x04000002L, 0x04000000L, 0x00000402L, -0x04000002L, 0x00100402L, 0x00100000L, 0x04000000L, -0x04100002L, 0x00100400L, 0x00000400L, 0x00000002L, -0x00100400L, 0x04000402L, 0x04100000L, 0x00000400L, -0x00000402L, 0x00000000L, 0x00100002L, 0x04100400L, -0x04000400L, 0x04100002L, 0x04100402L, 0x00100000L, -0x04100002L, 0x00000402L, 0x00100000L, 0x04000002L, -0x00100400L, 0x04000400L, 0x00000002L, 0x04100000L, -0x04000402L, 0x00000000L, 0x00000400L, 0x00100002L, -0x00000000L, 0x04100002L, 0x04100400L, 0x00000400L, -0x04000000L, 0x04100402L, 0x00100402L, 0x00100000L, -0x04100402L, 0x00000002L, 0x04000400L, 
0x00100402L, -0x00100002L, 0x00100400L, 0x04100000L, 0x04000402L, -0x00000402L, 0x04000000L, 0x04000002L, 0x04100400L, +0x00401008L, 0x10001000L, 0x00000008L, 0x10401008L, +0x00000000L, 0x10400000L, 0x10001008L, 0x00400008L, +0x10401000L, 0x10000008L, 0x10000000L, 0x00001008L, +0x10000008L, 0x00401008L, 0x00400000L, 0x10000000L, +0x10400008L, 0x00401000L, 0x00001000L, 0x00000008L, +0x00401000L, 0x10001008L, 0x10400000L, 0x00001000L, +0x00001008L, 0x00000000L, 0x00400008L, 0x10401000L, +0x10001000L, 0x10400008L, 0x10401008L, 0x00400000L, +0x10400008L, 0x00001008L, 0x00400000L, 0x10000008L, +0x00401000L, 0x10001000L, 0x00000008L, 0x10400000L, +0x10001008L, 0x00000000L, 0x00001000L, 0x00400008L, +0x00000000L, 0x10400008L, 0x10401000L, 0x00001000L, +0x10000000L, 0x10401008L, 0x00401008L, 0x00400000L, +0x10401008L, 0x00000008L, 0x10001000L, 0x00401008L, +0x00400008L, 0x00401000L, 0x10400000L, 0x10001008L, +0x00001008L, 0x10000000L, 0x10000008L, 0x10401000L, },{ /* nibble 4 */ -0x02000000L, 0x00004000L, 0x00000100L, 0x02004108L, -0x02004008L, 0x02000100L, 0x00004108L, 0x02004000L, -0x00004000L, 0x00000008L, 0x02000008L, 0x00004100L, -0x02000108L, 0x02004008L, 0x02004100L, 0x00000000L, -0x00004100L, 0x02000000L, 0x00004008L, 0x00000108L, -0x02000100L, 0x00004108L, 0x00000000L, 0x02000008L, -0x00000008L, 0x02000108L, 0x02004108L, 0x00004008L, -0x02004000L, 0x00000100L, 0x00000108L, 0x02004100L, -0x02004100L, 0x02000108L, 0x00004008L, 0x02004000L, -0x00004000L, 0x00000008L, 0x02000008L, 0x02000100L, -0x02000000L, 0x00004100L, 0x02004108L, 0x00000000L, -0x00004108L, 0x02000000L, 0x00000100L, 0x00004008L, -0x02000108L, 0x00000100L, 0x00000000L, 0x02004108L, -0x02004008L, 0x02004100L, 0x00000108L, 0x00004000L, -0x00004100L, 0x02004008L, 0x02000100L, 0x00000108L, -0x00000008L, 0x00004108L, 0x02004000L, 0x02000008L, +0x08000000L, 0x00010000L, 0x00000400L, 0x08010420L, +0x08010020L, 0x08000400L, 0x00010420L, 0x08010000L, +0x00010000L, 0x00000020L, 0x08000020L, 0x00010400L, 
+0x08000420L, 0x08010020L, 0x08010400L, 0x00000000L, +0x00010400L, 0x08000000L, 0x00010020L, 0x00000420L, +0x08000400L, 0x00010420L, 0x00000000L, 0x08000020L, +0x00000020L, 0x08000420L, 0x08010420L, 0x00010020L, +0x08010000L, 0x00000400L, 0x00000420L, 0x08010400L, +0x08010400L, 0x08000420L, 0x00010020L, 0x08010000L, +0x00010000L, 0x00000020L, 0x08000020L, 0x08000400L, +0x08000000L, 0x00010400L, 0x08010420L, 0x00000000L, +0x00010420L, 0x08000000L, 0x00000400L, 0x00010020L, +0x08000420L, 0x00000400L, 0x00000000L, 0x08010420L, +0x08010020L, 0x08010400L, 0x00000420L, 0x00010000L, +0x00010400L, 0x08010020L, 0x08000400L, 0x00000420L, +0x00000020L, 0x00010420L, 0x08010000L, 0x08000020L, },{ /* nibble 5 */ -0x20000010L, 0x00080010L, 0x00000000L, 0x20080800L, -0x00080010L, 0x00000800L, 0x20000810L, 0x00080000L, -0x00000810L, 0x20080810L, 0x00080800L, 0x20000000L, -0x20000800L, 0x20000010L, 0x20080000L, 0x00080810L, -0x00080000L, 0x20000810L, 0x20080010L, 0x00000000L, -0x00000800L, 0x00000010L, 0x20080800L, 0x20080010L, -0x20080810L, 0x20080000L, 0x20000000L, 0x00000810L, -0x00000010L, 0x00080800L, 0x00080810L, 0x20000800L, -0x00000810L, 0x20000000L, 0x20000800L, 0x00080810L, -0x20080800L, 0x00080010L, 0x00000000L, 0x20000800L, -0x20000000L, 0x00000800L, 0x20080010L, 0x00080000L, -0x00080010L, 0x20080810L, 0x00080800L, 0x00000010L, -0x20080810L, 0x00080800L, 0x00080000L, 0x20000810L, -0x20000010L, 0x20080000L, 0x00080810L, 0x00000000L, -0x00000800L, 0x20000010L, 0x20000810L, 0x20080800L, -0x20080000L, 0x00000810L, 0x00000010L, 0x20080010L, +0x80000040L, 0x00200040L, 0x00000000L, 0x80202000L, +0x00200040L, 0x00002000L, 0x80002040L, 0x00200000L, +0x00002040L, 0x80202040L, 0x00202000L, 0x80000000L, +0x80002000L, 0x80000040L, 0x80200000L, 0x00202040L, +0x00200000L, 0x80002040L, 0x80200040L, 0x00000000L, +0x00002000L, 0x00000040L, 0x80202000L, 0x80200040L, +0x80202040L, 0x80200000L, 0x80000000L, 0x00002040L, +0x00000040L, 0x00202000L, 0x00202040L, 0x80002000L, +0x00002040L, 
0x80000000L, 0x80002000L, 0x00202040L, +0x80202000L, 0x00200040L, 0x00000000L, 0x80002000L, +0x80000000L, 0x00002000L, 0x80200040L, 0x00200000L, +0x00200040L, 0x80202040L, 0x00202000L, 0x00000040L, +0x80202040L, 0x00202000L, 0x00200000L, 0x80002040L, +0x80000040L, 0x80200000L, 0x00202040L, 0x00000000L, +0x00002000L, 0x80000040L, 0x80002040L, 0x80202000L, +0x80200000L, 0x00002040L, 0x00000040L, 0x80200040L, },{ /* nibble 6 */ -0x00001000L, 0x00000080L, 0x00400080L, 0x00400001L, -0x00401081L, 0x00001001L, 0x00001080L, 0x00000000L, -0x00400000L, 0x00400081L, 0x00000081L, 0x00401000L, -0x00000001L, 0x00401080L, 0x00401000L, 0x00000081L, -0x00400081L, 0x00001000L, 0x00001001L, 0x00401081L, -0x00000000L, 0x00400080L, 0x00400001L, 0x00001080L, -0x00401001L, 0x00001081L, 0x00401080L, 0x00000001L, -0x00001081L, 0x00401001L, 0x00000080L, 0x00400000L, -0x00001081L, 0x00401000L, 0x00401001L, 0x00000081L, -0x00001000L, 0x00000080L, 0x00400000L, 0x00401001L, -0x00400081L, 0x00001081L, 0x00001080L, 0x00000000L, -0x00000080L, 0x00400001L, 0x00000001L, 0x00400080L, -0x00000000L, 0x00400081L, 0x00400080L, 0x00001080L, -0x00000081L, 0x00001000L, 0x00401081L, 0x00400000L, -0x00401080L, 0x00000001L, 0x00001001L, 0x00401081L, -0x00400001L, 0x00401080L, 0x00401000L, 0x00001001L, +0x00004000L, 0x00000200L, 0x01000200L, 0x01000004L, +0x01004204L, 0x00004004L, 0x00004200L, 0x00000000L, +0x01000000L, 0x01000204L, 0x00000204L, 0x01004000L, +0x00000004L, 0x01004200L, 0x01004000L, 0x00000204L, +0x01000204L, 0x00004000L, 0x00004004L, 0x01004204L, +0x00000000L, 0x01000200L, 0x01000004L, 0x00004200L, +0x01004004L, 0x00004204L, 0x01004200L, 0x00000004L, +0x00004204L, 0x01004004L, 0x00000200L, 0x01000000L, +0x00004204L, 0x01004000L, 0x01004004L, 0x00000204L, +0x00004000L, 0x00000200L, 0x01000000L, 0x01004004L, +0x01000204L, 0x00004204L, 0x00004200L, 0x00000000L, +0x00000200L, 0x01000004L, 0x00000004L, 0x01000200L, +0x00000000L, 0x01000204L, 0x01000200L, 0x00004200L, +0x00000204L, 0x00004000L, 
0x01004204L, 0x01000000L, +0x01004200L, 0x00000004L, 0x00004004L, 0x01004204L, +0x01000004L, 0x01004200L, 0x01004000L, 0x00004004L, },{ /* nibble 7 */ -0x08200020L, 0x08208000L, 0x00008020L, 0x00000000L, -0x08008000L, 0x00200020L, 0x08200000L, 0x08208020L, -0x00000020L, 0x08000000L, 0x00208000L, 0x00008020L, -0x00208020L, 0x08008020L, 0x08000020L, 0x08200000L, -0x00008000L, 0x00208020L, 0x00200020L, 0x08008000L, -0x08208020L, 0x08000020L, 0x00000000L, 0x00208000L, -0x08000000L, 0x00200000L, 0x08008020L, 0x08200020L, -0x00200000L, 0x00008000L, 0x08208000L, 0x00000020L, -0x00200000L, 0x00008000L, 0x08000020L, 0x08208020L, -0x00008020L, 0x08000000L, 0x00000000L, 0x00208000L, -0x08200020L, 0x08008020L, 0x08008000L, 0x00200020L, -0x08208000L, 0x00000020L, 0x00200020L, 0x08008000L, -0x08208020L, 0x00200000L, 0x08200000L, 0x08000020L, -0x00208000L, 0x00008020L, 0x08008020L, 0x08200000L, -0x00000020L, 0x08208000L, 0x00208020L, 0x00000000L, -0x08000000L, 0x08200020L, 0x00008000L, 0x00208020L, +0x20800080L, 0x20820000L, 0x00020080L, 0x00000000L, +0x20020000L, 0x00800080L, 0x20800000L, 0x20820080L, +0x00000080L, 0x20000000L, 0x00820000L, 0x00020080L, +0x00820080L, 0x20020080L, 0x20000080L, 0x20800000L, +0x00020000L, 0x00820080L, 0x00800080L, 0x20020000L, +0x20820080L, 0x20000080L, 0x00000000L, 0x00820000L, +0x20000000L, 0x00800000L, 0x20020080L, 0x20800080L, +0x00800000L, 0x00020000L, 0x20820000L, 0x00000080L, +0x00800000L, 0x00020000L, 0x20000080L, 0x20820080L, +0x00020080L, 0x20000000L, 0x00000000L, 0x00820000L, +0x20800080L, 0x20020080L, 0x20020000L, 0x00800080L, +0x20820000L, 0x00000080L, 0x00800080L, 0x20020000L, +0x20820080L, 0x00800000L, 0x20800000L, 0x20000080L, +0x00820000L, 0x00020080L, 0x20020080L, 0x20800000L, +0x00000080L, 0x20820000L, 0x00820080L, 0x00000000L, +0x20000000L, 0x20800080L, 0x00020000L, 0x00820080L, }}; diff -urN xnu-344.49/bsd/crypto/sha2/sha2.c xnu-517/bsd/crypto/sha2/sha2.c --- xnu-344.49/bsd/crypto/sha2/sha2.c Thu Sep 18 21:01:01 2003 +++ 
xnu-517/bsd/crypto/sha2/sha2.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/crypto/sha2/sha2.c,v 1.2.2.1 2001/07/03 11:01:36 ume Exp $ */ -/* $KAME: sha2.c,v 1.6 2001/03/12 11:31:04 itojun Exp $ */ +/* $FreeBSD: src/sys/crypto/sha2/sha2.c,v 1.2.2.2 2002/03/05 08:36:47 ume Exp $ */ +/* $KAME: sha2.c,v 1.8 2001/11/08 01:07:52 itojun Exp $ */ /* * sha2.c @@ -565,7 +565,7 @@ /* Begin padding with a 1 bit: */ context->buffer[usedspace++] = 0x80; - if (usedspace < SHA256_SHORT_BLOCK_LENGTH) { + if (usedspace <= SHA256_SHORT_BLOCK_LENGTH) { /* Set-up for the last transform: */ bzero(&context->buffer[usedspace], SHA256_SHORT_BLOCK_LENGTH - usedspace); } else { @@ -882,7 +882,7 @@ /* Begin padding with a 1 bit: */ context->buffer[usedspace++] = 0x80; - if (usedspace < SHA512_SHORT_BLOCK_LENGTH) { + if (usedspace <= SHA512_SHORT_BLOCK_LENGTH) { /* Set-up for the last transform: */ bzero(&context->buffer[usedspace], SHA512_SHORT_BLOCK_LENGTH - usedspace); } else { diff -urN xnu-344.49/bsd/dev/disk.h xnu-517/bsd/dev/disk.h --- xnu-344.49/bsd/dev/disk.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/disk.h Sat Oct 25 00:25:25 2003 @@ -22,129 +22,5 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* @(#)disk.h 1.0 08/29/87 (c) 1987 NeXT */ -#ifndef _BSD_DEV_DISK_ -#define _BSD_DEV_DISK_ -#ifndef _SYS_DISK_H_ -#define _SYS_DISK_H_ - -#include -#include -#include -#include -#include -#include -#include - -/* - * USE INSTEAD (NOTE: DKIOCGETBLOCKCOUNT -> DKIOCGETBLOCKCOUNT32) - */ - -#ifdef __APPLE_API_OBSOLETE - -#define DR_CMDSIZE 32 -#define DR_ERRSIZE 32 - -struct disk_req { - int dr_bcount; /* byte count for data transfers */ - caddr_t dr_addr; /* memory addr for data transfers */ - struct timeval dr_exec_time; /* execution time of operation */ - - /* - * interpretation of cmdblk and errblk is driver specific. 
- */ - char dr_cmdblk[DR_CMDSIZE]; - char dr_errblk[DR_ERRSIZE]; -}; - -struct sdc_wire { - vm_offset_t start, end; - boolean_t new_pageable; -}; - - -#define BAD_BLK_OFF 4 /* offset of bad blk tbl from label */ -#define NBAD_BLK (12 * 1024 / sizeof (int)) - -struct bad_block { /* bad block table, sized to be 12KB */ - int bad_blk[NBAD_BLK]; -}; - -/* - * sector bitmap states (2 bits per sector) - */ -#define SB_UNTESTED 0 /* must be zero */ -#define SB_BAD 1 -#define SB_WRITTEN 2 -#define SB_ERASED 3 - -struct drive_info { /* info about drive hardware */ - char di_name[MAXDNMLEN]; /* drive type name */ - int di_label_blkno[NLABELS];/* label loc'ns in DEVICE SECTORS */ - int di_devblklen; /* device sector size */ - int di_maxbcount; /* max bytes per transfer request */ -}; - -#define DS_STATSIZE 32 - -struct disk_stats { - int s_ecccnt; /* avg ECC corrections per sector */ - int s_maxecc; /* max ECC corrections observed */ - - /* - * interpretation of s_stats is driver specific - */ - char s_stats[DS_STATSIZE]; -}; - -struct drive_location { - char location[ 128 ]; -}; - -#define DKIOCGLABEL _IOR('d', 0,struct disk_label) /* read label */ -#define DKIOCSLABEL _IOW('d', 1,struct disk_label) /* write label */ -#define DKIOCGBITMAP _IO('d', 2) /* read bitmap */ -#define DKIOCSBITMAP _IO('d', 3) /* write bitmap */ -#define DKIOCREQ _IOWR('d', 4, struct disk_req) /* cmd request */ -#define DKIOCINFO _IOR('d', 5, struct drive_info) /* get drive info */ -#define DKIOCZSTATS _IO('d',7) /* zero statistics */ -#define DKIOCGSTATS _IO('d', 8) /* get statistics */ -#define DKIOCRESET _IO('d', 9) /* reset disk */ -#define DKIOCGFLAGS _IOR('d', 11, int) /* get driver flags */ -#define DKIOCSFLAGS _IOW('d', 12, int) /* set driver flags */ -#define DKIOCSDCWIRE _IOW('d', 14, struct sdc_wire) /* sdc wire memory */ -#define DKIOCSDCLOCK _IO('d', 15) /* sdc lock */ -#define DKIOCSDCUNLOCK _IO('d', 16) /* sdc unlock */ -#define DKIOCGFREEVOL _IOR('d', 17, int) /* get free volume # */ 
-#define DKIOCGBBT _IO('d', 18) /* read bad blk tbl */ -#define DKIOCSBBT _IO('d', 19) /* write bad blk tbl */ -#define DKIOCMNOTIFY _IOW('d', 20, int) /* message on insert */ -#define DKIOCEJECT _IO('d', 21) /* eject disk */ -#define DKIOCPANELPRT _IOW('d', 22, int) /* register Panel */ - /* Request port */ -#define DKIOCSFORMAT _IOW('d', 23, int) /* set 'Formatted' flag */ -#define DKIOCGFORMAT _IOR('d', 23, int) /* get 'Formatted' flag */ -#define DKIOCBLKSIZE _IOR('d', 24, int) /* device sector size */ -#define DKIOCNUMBLKS _IOR('d', 25, int) /* number of sectors */ -#define DKIOCCHECKINSERT _IO('d',26) /* manually poll removable */ - /* media drive */ -#define DKIOCCANCELAUTOMOUNT _IOW('d',27, dev_t) /* cancel automount request */ -#define DKIOCGLOCATION _IOR('d',28, struct drive_location) /* arch dependent location descrip */ -#define DKIOCSETBLOCKSIZE _IOW('d', 24, int) /* set media's preferred sector size */ -#define DKIOCGETBLOCKSIZE DKIOCBLKSIZE /* get media's preferred sector size */ -#define DKIOCGETBLOCKCOUNT32 DKIOCNUMBLKS /* get media's sector count */ -#define DKIOCGETBLOCKCOUNT64 _IOR('d', 25, u_int64_t) /* get media's sector count */ -#define DKIOCGETLOCATION DKIOCGLOCATION /* get media's location description */ -#define DKIOCISFORMATTED DKIOCGFORMAT /* is media formatted? */ -#define DKIOCISWRITABLE _IOR('d', 29, int) /* is media writable? 
*/ - -#define DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, u_int64_t) /* get device's maximum block count for read requests */ -#define DKIOCGETMAXBLOCKCOUNTWRITE _IOR('d', 65, u_int64_t) /* get device's maximum block count for write requests */ -#define DKIOCGETMAXSEGMENTCOUNTREAD _IOR('d', 66, u_int64_t) /* get device's maximum physical segment count for read buffers */ -#define DKIOCGETMAXSEGMENTCOUNTWRITE _IOR('d', 67, u_int64_t) /* get device's maximum physical segment count for write buffers */ - -#endif /* __APPLE_API_OBSOLETE */ - -#endif /* _SYS_DISK_H_ */ -#endif /* _BSD_DEV_DISK_ */ +#warning is obsolete, please use instead diff -urN xnu-344.49/bsd/dev/disk_label.h xnu-517/bsd/dev/disk_label.h --- xnu-344.49/bsd/dev/disk_label.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/disk_label.h Sat Oct 25 00:25:25 2003 @@ -22,91 +22,5 @@ * * @APPLE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1991 by NeXT Computer, Inc. - * - * File: bsd/dev/disk_label.h - NeXT disk label definition. - * - */ - -#ifndef _BSD_DEV_DISK_LABEL_ -#define _BSD_DEV_DISK_LABEL_ - -#include -#include - -#ifdef __APPLE_API_OBSOLETE - -#define NLABELS 4 /* # of labels on a disk */ -#define MAXLBLLEN 24 /* dl_label[] size */ -#define NBAD 1670 /* sized to make label ~= 8KB */ - -/* - * if dl_version >= DL_V3 then the bad block table is relocated - * to a structure separate from the disk label. 
- */ -typedef union { - unsigned short DL_v3_checksum; - int DL_bad[NBAD]; /* block number that is bad */ -} dl_un_t; - -typedef struct disk_label { - int dl_version; // label version number - int dl_label_blkno; // block # where this label is - int dl_size; // size of media area (sectors) - char dl_label[MAXLBLLEN]; // media label - unsigned dl_flags; // flags (see DL_xxx, below) - unsigned dl_tag; // volume tag - struct disktab dl_dt; // common info in disktab - dl_un_t dl_un; - unsigned short dl_checksum; // ones complement checksum - - /* add things here so dl_checksum stays in a fixed place */ -} disk_label_t; - -/* - * Known label versions. - */ -#define DL_V1 0x4e655854 /* version #1: "NeXT" */ -#define DL_V2 0x646c5632 /* version #2: "dlV2" */ -#define DL_V3 0x646c5633 /* version #3: "dlV3" */ -#define DL_VERSION DL_V3 /* default version */ - - -/* - * dl_flags values - */ -#define DL_UNINIT 0x80000000 /* label is uninitialized */ - -/* - * Aliases for disktab fields - */ -#define dl_name dl_dt.d_name -#define dl_type dl_dt.d_type -#define dl_part dl_dt.d_partitions -#define dl_front dl_dt.d_front -#define dl_back dl_dt.d_back -#define dl_ngroups dl_dt.d_ngroups -#define dl_ag_size dl_dt.d_ag_size -#define dl_ag_alts dl_dt.d_ag_alts -#define dl_ag_off dl_dt.d_ag_off -#define dl_secsize dl_dt.d_secsize -#define dl_ncyl dl_dt.d_ncylinders -#define dl_nsect dl_dt.d_nsectors -#define dl_ntrack dl_dt.d_ntracks -#define dl_rpm dl_dt.d_rpm -#define dl_bootfile dl_dt.d_bootfile -#define dl_boot0_blkno dl_dt.d_boot0_blkno -#define dl_hostname dl_dt.d_hostname -#define dl_rootpartition dl_dt.d_rootpartition -#define dl_rwpartition dl_dt.d_rwpartition - -/* - * Other aliases - */ -#define dl_v3_checksum dl_un.DL_v3_checksum -#define dl_bad dl_un.DL_bad - -#endif /* __APPLE_API_OBSOLETE */ - -#endif /* _BSD_DEV_DISK_LABEL_ */ +#warning is obsolete diff -urN xnu-344.49/bsd/dev/i386/conf.c xnu-517/bsd/dev/i386/conf.c --- xnu-344.49/bsd/dev/i386/conf.c Thu Sep 18 21:01:01 
2003 +++ xnu-517/bsd/dev/i386/conf.c Sat Oct 25 00:25:25 2003 @@ -109,7 +109,7 @@ extern int cttyopen(), cttyread(), cttywrite(), cttyioctl(), cttyselect(); extern int mmread(),mmwrite(); -#define mmselect seltrue +#define mmselect (select_fcn_t *)seltrue #define mmmmap eno_mmap #include @@ -138,8 +138,6 @@ extern int fdesc_open(), fdesc_read(), fdesc_write(), fdesc_ioctl(), fdesc_select(); -extern int seltrue(); - struct cdevsw cdevsw[] = { /* @@ -241,7 +239,7 @@ NO_CDEVICE, /*41*/ { volopen, volclose, eno_rdwrt, eno_rdwrt, /*42*/ - volioctl, eno_stop, eno_reset, 0, seltrue, + volioctl, eno_stop, eno_reset, 0, (select_fcn_t *)seltrue, eno_mmap, eno_strat, eno_getc, eno_putc, 0 }, }; diff -urN xnu-344.49/bsd/dev/i386/km.c xnu-517/bsd/dev/i386/km.c --- xnu-344.49/bsd/dev/i386/km.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/i386/km.c Sat Oct 25 00:25:25 2003 @@ -62,7 +62,6 @@ int initialized = 0; static int kmoutput(struct tty *tp); -static void kmtimeout(struct tty *tp); static void kmstart(struct tty *tp); extern void KeyboardOpen(void); @@ -311,9 +310,10 @@ } static void -kmtimeout( struct tty *tp) +kmtimeout(void *arg) { boolean_t funnel_state; + struct tty *tp = (struct tty *) arg; funnel_state = thread_funnel_set(kernel_flock, TRUE); kmoutput(tp); diff -urN xnu-344.49/bsd/dev/i386/stubs.c xnu-517/bsd/dev/i386/stubs.c --- xnu-344.49/bsd/dev/i386/stubs.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/i386/stubs.c Sat Oct 25 00:25:25 2003 @@ -60,10 +60,12 @@ int slen,len,error=0; slen = strlen(from) + 1; + if (slen > maxlen) + error = ENAMETOOLONG; len = min(maxlen,slen); if (copyout(from, to, len)) - error = EIO; + error = EFAULT; *lencopied = len; return error; @@ -110,8 +112,6 @@ bcopy(src,dst,count); return 0; } - -cpu_number() {return(0);} set_bsduthreadargs(thread_t th, void * pcb, void *ignored_arg) { diff -urN xnu-344.49/bsd/dev/i386/sysctl.c xnu-517/bsd/dev/i386/sysctl.c --- xnu-344.49/bsd/dev/i386/sysctl.c Thu Jan 1 01:00:00 1970 +++ 
xnu-517/bsd/dev/i386/sysctl.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +static int +hw_cpu_sysctl SYSCTL_HANDLER_ARGS +{ + i386_cpu_info_t cpu_info; + void *ptr = (uint8_t *)&cpu_info + (uint32_t)arg1; + int value; + + cpuid_get_info(&cpu_info); + + if (arg2 == sizeof(uint8_t)) { + value = (uint32_t) *(uint8_t *)ptr; + ptr = &value; + arg2 = sizeof(uint32_t); + } + return SYSCTL_OUT(req, ptr, arg2 ? 
arg2 : strlen((char *)ptr)+1); + return 0; +} + +static int +hw_cpu_features SYSCTL_HANDLER_ARGS +{ + i386_cpu_info_t cpu_info; + char buf[256]; + vm_size_t size; + + cpuid_get_info(&cpu_info); + buf[0] = '\0'; + cpuid_get_feature_names(cpu_info.cpuid_features, buf, sizeof(buf)); + + return SYSCTL_OUT(req, buf, strlen(buf) + 1); +} + +SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW, 0, + "CPU info"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, vendor, CTLTYPE_STRING | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_vendor), 0, + hw_cpu_sysctl, "A", "CPU vendor"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, CTLTYPE_STRING | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_brand_string), 0, + hw_cpu_sysctl, "A", "CPU brand string"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, value, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_value), sizeof(uint32_t), + hw_cpu_sysctl, "I", "CPU value"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, family, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_family), sizeof(uint8_t), + hw_cpu_sysctl, "I", "CPU family"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, model, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_model), sizeof(uint8_t), + hw_cpu_sysctl, "I", "CPU model"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extmodel, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_extmodel), sizeof(uint8_t), + hw_cpu_sysctl, "I", "CPU extended model"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfamily, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_extfamily), sizeof(uint8_t), + hw_cpu_sysctl, "I", "CPU extended family"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, stepping, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_stepping), sizeof(uint8_t), + hw_cpu_sysctl, "I", "CPU stepping"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, feature_bits, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_features), 
sizeof(uint32_t), + hw_cpu_sysctl, "I", "CPU features"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, signature, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_signature), sizeof(uint32_t), + hw_cpu_sysctl, "I", "CPU signature"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand, CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_brand), sizeof(uint8_t), + hw_cpu_sysctl, "I", "CPU brand"); + +#if 0 +SYSCTL_PROC(_machdep_cpu, OID_AUTO, model_string, CTLTYPE_STRING | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, model_string), 0, + hw_cpu_sysctl, "A", "CPU model string"); +#endif + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, features, CTLTYPE_STRING | CTLFLAG_RD, + 0, 0, + hw_cpu_features, "A", "CPU feature names"); + + +struct sysctl_oid *machdep_sysctl_list[] = +{ + &sysctl__machdep_cpu, + &sysctl__machdep_cpu_vendor, + &sysctl__machdep_cpu_brand_string, + &sysctl__machdep_cpu_value, + &sysctl__machdep_cpu_family, + &sysctl__machdep_cpu_model, + &sysctl__machdep_cpu_extmodel, + &sysctl__machdep_cpu_extfamily, + &sysctl__machdep_cpu_feature_bits, + &sysctl__machdep_cpu_stepping, + &sysctl__machdep_cpu_signature, + &sysctl__machdep_cpu_brand, + &sysctl__machdep_cpu_features, + (struct sysctl_oid *) 0 +}; + diff -urN xnu-344.49/bsd/dev/i386/unix_signal.c xnu-517/bsd/dev/i386/unix_signal.c --- xnu-344.49/bsd/dev/i386/unix_signal.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/i386/unix_signal.c Sat Oct 25 00:25:25 2003 @@ -52,9 +52,11 @@ #define USER_CS 0x17 #define USER_DS 0x1f +#define USER_CTHREAD 0x27 #define UDATA_SEL USER_DS #define UCODE_SEL USER_CS +#define UCTHREAD_SEL USER_CTHREAD #define valid_user_code_selector(x) (TRUE) #define valid_user_data_selector(x) (TRUE) @@ -63,6 +65,10 @@ #define NULL_SEG 0 +/* Signal handler flavors supported */ +/* These defns should match the Libc implmn */ +#define UC_TRAD 1 + /* * Send an interrupt to process. 
* @@ -95,7 +101,8 @@ thread_t thread = current_thread(); thread_act_t th_act = current_act(); struct uthread * ut; - struct i386_saved_state * saved_state = get_user_regs(th_act); + struct i386_saved_state * saved_state = (struct i386_saved_state *) + get_user_regs(th_act); sig_t trampact; ut = get_bsdthread_info(th_act); @@ -116,7 +123,7 @@ /* Handler should call sigreturn to get out of it */ frame.retaddr = 0xffffffff; frame.catcher = catcher; - frame.sigstyle = 1; + frame.sigstyle = UC_TRAD; frame.sig = sig; if (sig == SIGILL || sig == SIGFPE) { @@ -179,7 +186,7 @@ saved_state->ds = UDATA_SEL; saved_state->es = UDATA_SEL; saved_state->fs = NULL_SEG; - saved_state->gs = NULL_SEG; + saved_state->gs = USER_CTHREAD; return; bad: @@ -217,7 +224,8 @@ thread_t thread = current_thread(); thread_act_t th_act = current_act(); int error; - struct i386_saved_state* saved_state = get_user_regs(th_act); + struct i386_saved_state* saved_state = (struct i386_saved_state*) + get_user_regs(th_act); struct uthread * ut; diff -urN xnu-344.49/bsd/dev/ldd.h xnu-517/bsd/dev/ldd.h --- xnu-344.49/bsd/dev/ldd.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ldd.h Sat Oct 25 00:25:25 2003 @@ -44,7 +44,7 @@ #define _BSD_DEV_LDD_PRIV_ #include -#include +#include typedef int (*PFI)(); diff -urN xnu-344.49/bsd/dev/memdev.c xnu-517/bsd/dev/memdev.c --- xnu-344.49/bsd/dev/memdev.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/dev/memdev.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,578 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah Hdr: vn.c 1.13 94/04/02 + * + * from: @(#)vn.c 8.6 (Berkeley) 4/1/94 + * $FreeBSD: src/sys/dev/vn/vn.c,v 1.105.2.4 2001/11/18 07:11:00 dillon Exp $ + */ + +/* + * RAM disk driver. + * + * Block interface to a ramdisk. 
+ * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +static open_close_fcn_t mdevopen; +static open_close_fcn_t mdevclose; +static psize_fcn_t mdevsize; +static strategy_fcn_t mdevstrategy; +static int mdevbioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); +static int mdevcioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); +static int mdevrw(dev_t dev, struct uio *uio, int ioflag); +static char *nonspace(char *pos, char *end); +static char *getspace(char *pos, char *end); +static char *cvtnum(char *pos, char *end, unsigned int *num); + +/* + * cdevsw + * D_DISK we want to look like a disk + * D_CANFREE We support B_FREEBUF + */ + +static struct bdevsw mdevbdevsw = { + /* open */ mdevopen, + /* close */ mdevclose, + /* strategy */ mdevstrategy, + /* ioctl */ mdevbioctl, + /* dump */ eno_dump, + /* psize */ mdevsize, + /* flags */ D_DISK, +}; + +static struct cdevsw mdevcdevsw = { + /* open */ mdevopen, + /* close */ mdevclose, + /* read */ mdevrw, + /* write */ mdevrw, + /* ioctl */ mdevcioctl, + /* stop */ eno_stop, + /* reset */ eno_reset, + /* ttys */ 0, + /* select */ eno_select, + /* mmap */ eno_mmap, + /* strategy */ eno_strat, + /* getc */ eno_getc, + /* putc */ eno_putc, + /* flags */ D_DISK, +}; + +struct mdev { + vm_offset_t mdBase; /* file size in bytes */ + uint32_t mdSize; /* file size in bytes */ + int mdFlags; /* flags */ + int mdSecsize; /* sector size */ + int mdBDev; /* Block device number */ + int mdCDev; /* Character device number */ + void * mdbdevb; + void * mdcdevb; +} mdev[16]; + +/* mdFlags */ +#define mdInited 0x01 /* This device defined */ +#define mdRO 0x02 /* This device is read-only */ +#define mdPhys 0x04 /* This device is in physical memory */ + +int mdevBMajor = -1; +int mdevCMajor = -1; + +static int mdevioctl(dev_t dev, u_long cmd, 
caddr_t data, int flag, struct proc *p, int is_char); +dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys); +dev_t mdevlookup(int devid); + +static int mdevclose(dev_t dev, int flags, int devtype, struct proc *p) { + return (0); +} + +static int mdevopen(dev_t dev, int flags, int devtype, struct proc *p) { + + int devid; + + devid = minor(dev); /* Get minor device number */ + + if (devid > 16) return (ENXIO); /* Not valid */ + + if ((flags & FWRITE) && (mdev[devid].mdFlags & mdRO)) return (EACCES); /* Currently mounted RO */ + + return(0); +} + +static int mdevrw(dev_t dev, struct uio *uio, int ioflag) { + int status; + int unit; + addr64_t mdata; + int devid; + enum uio_seg saveflag; + + devid = minor(dev); /* Get minor device number */ + + if (devid > 16) return (ENXIO); /* Not valid */ + if (!(mdev[devid].mdFlags & mdInited)) return (ENXIO); /* Have we actually been defined yet? */ + + mdata = ((addr64_t)mdev[devid].mdBase << 12) + uio->uio_offset; /* Point to the area in "file" */ + + saveflag = uio->uio_segflg; /* Remember what the request is */ + if (mdev[devid].mdFlags & mdPhys) uio->uio_segflg = UIO_PHYS_USERSPACE; /* Make sure we are moving from physical ram if physical device */ + status = uiomove64(mdata, uio->uio_resid, uio); /* Move the data */ + uio->uio_segflg = saveflag; /* Restore the flag */ + + return (status); +} + +static void mdevstrategy(struct buf *bp) { + int unmap; + unsigned int sz, left, lop, csize; + kern_return_t ret; + vm_offset_t vaddr, blkoff; + struct buf *tbuf; + int devid; + addr64_t paddr, fvaddr; + ppnum_t pp; + + devid = minor(bp->b_dev); /* Get minor device number */ + + if ((mdev[devid].mdFlags & mdInited) == 0) { /* Have we actually been defined yet? 
*/ + bp->b_error = ENXIO; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + + bp->b_resid = bp->b_bcount; /* Set byte count */ + + blkoff = bp->b_blkno * mdev[devid].mdSecsize; /* Get offset into file */ + +/* + * Note that reading past end is an error, but reading at end is an EOF. For these + * we just return with b_resid == b_bcount. + */ + + if (blkoff >= (mdev[devid].mdSize << 12)) { /* Are they trying to read/write at/after end? */ + if(blkoff != (mdev[devid].mdSize << 12)) { /* Are we trying to read after EOF? */ + bp->b_error = EINVAL; /* Yeah, this is an error */ + bp->b_flags |= B_ERROR | B_INVAL; + } + biodone(bp); /* Return */ + return; + } + + if ((blkoff + bp->b_bcount) > (mdev[devid].mdSize << 12)) { /* Will this read go past end? */ + bp->b_bcount = ((mdev[devid].mdSize << 12) - blkoff); /* Yes, trim to max */ + } + + vaddr = 0; /* Assume not mapped yet */ + unmap = 0; + + if (bp->b_flags & B_VECTORLIST) { /* Do we have a list of UPLs? */ + tbuf = (struct buf *)bp->b_real_bp; /* Get this for C's inadequacies */ + if((bp->b_flags & B_NEED_IODONE) && /* If we have a UPL, is it already mapped? */ + tbuf && + tbuf->b_data) { + vaddr = (vm_offset_t)tbuf->b_data; /* We already have this mapped in, get base address */ + } + else { /* Not mapped yet */ + ret = ubc_upl_map(bp->b_pagelist, &vaddr); /* Map it in */ + if(ret != KERN_SUCCESS) panic("ramstrategy: ubc_upl_map failed, rc = %08X\n", ret); + unmap = 1; /* Remember to unmap later */ + } + vaddr = vaddr += bp->b_uploffset; /* Calculate actual vaddr */ + } + else vaddr = (vm_offset_t)bp->b_data; /* No UPL, we already have address */ + + fvaddr = (mdev[devid].mdBase << 12) + blkoff; /* Point to offset into ram disk */ + + if(bp->b_flags & B_READ) { /* Is this a read? */ + if(!(mdev[devid].mdFlags & mdPhys)) { /* Physical mapped disk? 
*/ + bcopy((void *)((uintptr_t)fvaddr), + (void *)vaddr, (size_t)bp->b_bcount); /* This is virtual, just get the data */ + } + else { + left = bp->b_bcount; /* Init the amount left to copy */ + while(left) { /* Go until it is all copied */ + + lop = min((4096 - (vaddr & 4095)), (4096 - (fvaddr & 4095))); /* Get smallest amount left on sink and source */ + csize = min(lop, left); /* Don't move more than we need to */ + + pp = pmap_find_phys(kernel_pmap, (addr64_t)((unsigned int)vaddr)); /* Get the sink physical address */ + if(!pp) { /* Not found, what gives? */ + panic("mdevstrategy: sink address %016llX not mapped\n", (addr64_t)((unsigned int)vaddr)); + } + paddr = (addr64_t)(((addr64_t)pp << 12) | (addr64_t)(vaddr & 4095)); /* Get actual address */ + bcopy_phys(fvaddr, paddr, csize); /* Copy this on in */ + mapping_set_mod(paddr >> 12); /* Make sure we know that it is modified */ + + left = left - csize; /* Calculate what is left */ + vaddr = vaddr + csize; /* Move to next sink address */ + fvaddr = fvaddr + csize; /* Bump to next physical address */ + } + } + } + else { /* This is a write */ + if(!(mdev[devid].mdFlags & mdPhys)) { /* Physical mapped disk? */ + bcopy((void *)vaddr, (void *)((uintptr_t)fvaddr), + (size_t)bp->b_bcount); /* This is virtual, just put the data */ + } + else { + left = bp->b_bcount; /* Init the amount left to copy */ + while(left) { /* Go until it is all copied */ + + lop = min((4096 - (vaddr & 4095)), (4096 - (fvaddr & 4095))); /* Get smallest amount left on sink and source */ + csize = min(lop, left); /* Don't move more than we need to */ + + pp = pmap_find_phys(kernel_pmap, (addr64_t)((unsigned int)vaddr)); /* Get the source physical address */ + if(!pp) { /* Not found, what gives? 
*/ + panic("mdevstrategy: source address %016llX not mapped\n", (addr64_t)((unsigned int)vaddr)); + } + paddr = (addr64_t)(((addr64_t)pp << 12) | (addr64_t)(vaddr & 4095)); /* Get actual address */ + + bcopy_phys(paddr, fvaddr, csize); /* Move this on out */ + + left = left - csize; /* Calculate what is left */ + vaddr = vaddr + csize; /* Move to next sink address */ + fvaddr = fvaddr + csize; /* Bump to next physical address */ + } + } + } + + if (unmap) { /* Do we need to unmap this? */ + ubc_upl_unmap(bp->b_pagelist); /* Yes, unmap it */ + } + + bp->b_resid = 0; /* Nothing more to do */ + biodone(bp); /* Say we've finished */ +} + +static int mdevbioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { + return (mdevioctl(dev, cmd, data, flag, p, 0)); +} + +static int mdevcioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { + return (mdevioctl(dev, cmd, data, flag, p, 1)); +} + +static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, int is_char) { + + int error; + u_long *f; + u_int64_t *o; + int devid; + + devid = minor(dev); /* Get minor device number */ + + if (devid > 16) return (ENXIO); /* Not valid */ + + error = suser(p->p_ucred, &p->p_acflag); /* Are we superman? */ + if (error) return (error); /* Nope... */ + + f = (u_long*)data; + o = (u_int64_t *)data; + + switch (cmd) { + + case DKIOCGETMAXBLOCKCOUNTREAD: + *o = 32; + break; + + case DKIOCGETMAXBLOCKCOUNTWRITE: + *o = 32; + break; + + case DKIOCGETMAXSEGMENTCOUNTREAD: + *o = 32; + break; + + case DKIOCGETMAXSEGMENTCOUNTWRITE: + *o = 32; + break; + + case DKIOCGETBLOCKSIZE: + *f = mdev[devid].mdSecsize; + break; + + case DKIOCSETBLOCKSIZE: + if (is_char) return (ENODEV); /* We can only do this for a block */ + + if (*f < DEV_BSIZE) return (EINVAL); /* Too short? 
*/ + + mdev[devid].mdSecsize = *f; /* set the new block size */ + break; + + case DKIOCISWRITABLE: + *f = 1; + break; + + case DKIOCGETBLOCKCOUNT32: + if(!(mdev[devid].mdFlags & mdInited)) return (ENXIO); + *f = ((mdev[devid].mdSize << 12) + mdev[devid].mdSecsize - 1) / mdev[devid].mdSecsize; + break; + + case DKIOCGETBLOCKCOUNT: + if(!(mdev[devid].mdFlags & mdInited)) return (ENXIO); + *o = ((mdev[devid].mdSize << 12) + mdev[devid].mdSecsize - 1) / mdev[devid].mdSecsize; + break; + + default: + error = ENOTTY; + break; + } + return(error); +} + + +static int mdevsize(dev_t dev) { + + int devid; + + devid = minor(dev); /* Get minor device number */ + if (devid > 16) return (ENXIO); /* Not valid */ + + if ((mdev[devid].mdFlags & mdInited) == 0) return(-1); /* Not inited yet */ + + return(mdev[devid].mdSecsize); +} + +#include + +void mdevinit(int cnt) { + + int devid, phys; + ppnum_t base; + unsigned int size; + char *ba, *lp; + dev_t dev; + + + ba = PE_boot_args(); /* Get the boot arguments */ + lp = ba + 256; /* Point to the end */ + + while(1) { /* Step through, looking for our keywords */ + phys = 0; /* Assume virtual memory device */ + ba = nonspace(ba, lp); /* Find non-space */ + if(ba >= lp) return; /* We are done if no more... */ + if(((ba[0] != 'v') && (ba[0] != 'p')) + || (ba[1] != 'm') || (ba[2] != 'd') || (ba[4] != '=') + || (ba[3] < '0') || (ba[3] > 'f') + || ((ba[3] > '9') && (ba[3] < 'a'))) { /* Is this of form "vmdx=" or "pmdx=" where x is hex digit? 
*/ + + ba = getspace(ba, lp); /* Find next white space or end */ + continue; /* Start looking for the next one */ + } + + if(ba[0] == 'p') phys = 1; /* Set physical memory disk */ + + devid = ba[3] & 0xF; /* Assume digit */ + if(ba[3] > '9') devid += 9; /* Adjust for hex digits */ + + ba = &ba[5]; /* Step past keyword */ + ba = cvtnum(ba, lp, &base); /* Convert base of memory disk */ + if(ba >= lp) return; /* Malformed one at the end, leave */ + if(ba[0] != '.') continue; /* If not length separator, try next... */ + if(base & 0xFFF) continue; /* Only allow page aligned stuff */ + + ba++; /* Step past '.' */ + ba = cvtnum(ba, lp, &size); /* Try to convert it */ + if(!size || (size & 0xFFF)) continue; /* Allow only non-zero page size multiples */ + if(ba < lp) { /* If we are not at end, check end character */ + if((ba[0] != ' ') && (ba[0] != 0)) continue; /* End must be null or space */ + } + + dev = mdevadd(devid, base >> 12, size >> 12, phys); /* Go add the device */ + } + + return; + +} + +char *nonspace(char *pos, char *end) { /* Find next non-space in string */ + + if(pos >= end) return end; /* Don't go past end */ + if(pos[0] == 0) return end; /* If at null, make end */ + + while(1) { /* Keep going */ + if(pos[0] != ' ') return pos; /* Leave if we found one */ + pos++; /* Step */ + if(pos >= end) return end; /* Quit if we run off end */ + } +} + +char *getspace(char *pos, char *end) { /* Find next space (or end/null) in string */ + + while(1) { /* Keep going */ + if(pos >= end) return end; /* Don't go past end */ + if(pos[0] == 0) return end; /* Leave if we hit null */ + if(pos[0] == ' ') return pos; /* Leave if we found one */ + pos++; /* Step */ + } +} + +char *cvtnum(char *pos, char *end, unsigned int *num) { /* Convert to a number */ + + int rad, dig; + + *num = 0; /* Set answer to 0 to start */ + rad = 10; + + if(pos >= end) return end; /* Don't go past end */ + if(pos[0] == 0) return end; /* If at null, make end */ + + if(pos[0] == '0' && ((pos[1] == 'x') || (pos[1] 
== 'X'))) { /* A hex constant? */ + rad = 16; + pos += 2; /* Point to the number */ + } + + while(1) { /* Convert it */ + + if(pos >= end) return end; /* Don't go past end */ + if(pos[0] == 0) return end; /* If at null, make end */ + if(pos[0] < '0') return pos; /* Leave if non-digit */ + dig = pos[0] & 0xF; /* Extract digit */ + if(pos[0] > '9') { /* Is it bigger than 9? */ + if(rad == 10) return pos; /* Leave if base 10, letters are not decimal digits */ + if(!(((pos[0] >= 'A') && (pos[0] <= 'F')) + || ((pos[0] >= 'a') && (pos[0] <= 'f')))) return pos; /* Leave if bogus char */ + dig = dig + 9; /* Adjust for character */ + } + *num = (*num * rad) + dig; /* Accumulate the number */ + pos++; /* Step on */ + } +} + +dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { + + int i; + + if(devid < 0) { + + devid = -1; + for(i = 0; i < 16; i++) { /* Search all known memory devices */ + if(!(mdev[i].mdFlags & mdInited)) { /* Is this a free one? */ + if(devid < 0)devid = i; /* Remember first free one */ + continue; /* Skip check */ + } + if(!(((base + size -1 ) < mdev[i].mdBase) || ((mdev[i].mdBase + mdev[i].mdSize - 1) < base))) { /* Is there any overlap? */ + panic("mdevadd: attempt to add overlapping memory device at %08X-%08X\n", mdev[i].mdBase, mdev[i].mdBase + mdev[i].mdSize - 1); + } + } + if(devid < 0) { /* Do we have free slots? */ + panic("mdevadd: attempt to add more than 16 memory devices\n"); + } + } + else { + if(devid >= 16) { /* Giving us something bogus? */ + panic("mdevadd: attempt to explicitly add a bogus memory device: %08X\n", devid); + } + if(mdev[devid].mdFlags &mdInited) { /* Already there? */ + panic("mdevadd: attempt to explicitly add a previously defined memory device: %08X\n", devid); + } + } + + if(mdevBMajor < 0) { /* Have we gotten a major number yet? 
*/ + mdevBMajor = bdevsw_add(-1, &mdevbdevsw); /* Add to the table and figure out a major number */ + if (mdevBMajor < 0) { + printf("mdevadd: error - bdevsw_add() returned %d\n", mdevBMajor); + return -1; + } + } + + if(mdevCMajor < 0) { /* Have we gotten a major number yet? */ + mdevCMajor = cdevsw_add_with_bdev(-1, &mdevcdevsw, mdevBMajor); /* Add to the table and figure out a major number */ + if (mdevCMajor < 0) { + printf("mdevadd: error - cdevsw_add_with_bdev() returned %d\n", mdevCMajor); + return -1; + } + } + + mdev[devid].mdBDev = makedev(mdevBMajor, devid); /* Get the device number */ + mdev[devid].mdbdevb = devfs_make_node(mdev[devid].mdBDev, DEVFS_BLOCK, /* Make the node */ + UID_ROOT, GID_OPERATOR, + 0600, "md%d", devid); + if (mdev[devid].mdbdevb == NULL) { /* Did we make one? */ + printf("mdevadd: devfs_make_node for block failed!\n"); + return -1; /* Nope... */ + } + + mdev[devid].mdCDev = makedev(mdevCMajor, devid); /* Get the device number */ + mdev[devid].mdcdevb = devfs_make_node(mdev[devid].mdCDev, DEVFS_CHAR, /* Make the node */ + UID_ROOT, GID_OPERATOR, + 0600, "rmd%d", devid); + if (mdev[devid].mdcdevb == NULL) { /* Did we make one? */ + printf("mdevadd: devfs_make_node for character failed!\n"); + return -1; /* Nope... 
*/ + } + + mdev[devid].mdBase = base; /* Set the base address of ram disk */ + mdev[devid].mdSize = size; /* Set the length of the ram disk */ + mdev[devid].mdSecsize = DEV_BSIZE; /* Set starting block size */ + if(phys) mdev[devid].mdFlags |= mdPhys; /* Show that we are in physical memory */ + mdev[devid].mdFlags |= mdInited; /* Show we are all set up */ + printf("Added memory device md%x/rmd%x (%08X/%08X) at %08X for %08X\n", + devid, devid, mdev[devid].mdBDev, mdev[devid].mdCDev, base << 12, size << 12); + return mdev[devid].mdBDev; +} + + +dev_t mdevlookup(int devid) { + + if((devid < 0) || (devid > 15)) return -1; /* Filter any bogus requests */ + if(!(mdev[devid].mdFlags & mdInited)) return -1; /* This one hasn't been defined */ + return mdev[devid].mdBDev; /* Return the device number */ +} diff -urN xnu-344.49/bsd/dev/memdev.h xnu-517/bsd/dev/memdev.h --- xnu-344.49/bsd/dev/memdev.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/dev/memdev.h Sat Oct 25 00:25:25 2003 @@ -0,0 +1,17 @@ + +#ifndef _SYS_MEMDEV_H_ +#define _SYS_MEMDEV_H_ + +#include + +#ifdef KERNEL_PRIVATE + +#ifdef __APPLE_API_PRIVATE + +void mdevinit(vm_offset_t base, unsigned int size); + +#endif /* __APPLE_API_PRIVATE */ + +#endif /* KERNEL_PRIVATE */ + +#endif /* _SYS_MEMDEV_H_*/ diff -urN xnu-344.49/bsd/dev/ppc/chud/chud_bsd_callback.c xnu-517/bsd/dev/ppc/chud/chud_bsd_callback.c --- xnu-344.49/bsd/dev/ppc/chud/chud_bsd_callback.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/dev/ppc/chud/chud_bsd_callback.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +#include +#include /* u_int */ +#include /* struct proc */ +#include /* struct sysent */ + +struct exit_args { + int rval; +}; +extern void exit(struct proc *p, struct exit_args *uap, int *retval); +extern struct sysent sysent[]; + +#pragma mark **** kern debug **** +typedef void (*chudxnu_kdebug_callback_func_t)(uint32_t debugid, uint32_t arg0, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint32_t arg4); +static chudxnu_kdebug_callback_func_t kdebug_callback_fn = NULL; + +extern void kdbg_control_chud(int val, void *fn); +extern unsigned int kdebug_enable; + +static void chudxnu_private_kdebug_callback(unsigned int debugid, unsigned int arg0, unsigned int arg1, unsigned int arg2, unsigned int arg3, unsigned int arg4) +{ + if(kdebug_callback_fn) { + (kdebug_callback_fn)(debugid, arg0, arg1, arg2, arg3, arg4); + } +} + +__private_extern__ +kern_return_t chudxnu_kdebug_callback_enter(chudxnu_kdebug_callback_func_t func) +{ + kdebug_callback_fn = func; + + kdbg_control_chud(TRUE, (void *)chudxnu_private_kdebug_callback); + kdebug_enable |= 0x10; + + return KERN_SUCCESS; +} + +__private_extern__ +kern_return_t chudxnu_kdebug_callback_cancel(void) +{ + kdebug_callback_fn = NULL; + kdbg_control_chud(FALSE, NULL); + kdebug_enable &= ~(0x10); + + return KERN_SUCCESS; +} + +#pragma mark **** task will exit **** 
+ +typedef kern_return_t (*chudxnu_exit_callback_func_t)(int pid); + +__private_extern__ +kern_return_t chudxnu_exit_callback_enter(chudxnu_exit_callback_func_t func) +{ + + return KERN_FAILURE; + +} + +__private_extern__ +kern_return_t chudxnu_exit_callback_cancel(void) +{ + + return KERN_FAILURE; + +} diff -urN xnu-344.49/bsd/dev/ppc/chud/chud_process.c xnu-517/bsd/dev/ppc/chud/chud_process.c --- xnu-344.49/bsd/dev/ppc/chud/chud_process.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/dev/ppc/chud/chud_process.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include + +__private_extern__ +int chudxnu_pid_for_task(task_t task) +{ + struct proc *p; + + if(task!=TASK_NULL) { + p = (struct proc *)(get_bsdtask_info(task)); + if(p) { + return (p->p_pid); + } + } + return -1; +} + +__private_extern__ +task_t chudxnu_task_for_pid(int pid) +{ + struct proc *p = pfind(pid); + if(p) { + return p->task; + } + return TASK_NULL; +} + +__private_extern__ +int chudxnu_current_pid(void) +{ + return current_proc()->p_pid; +} diff -urN xnu-344.49/bsd/dev/ppc/conf.c xnu-517/bsd/dev/ppc/conf.c --- xnu-344.49/bsd/dev/ppc/conf.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ppc/conf.c Sat Oct 25 00:25:25 2003 @@ -103,6 +103,9 @@ #define mmselect seltrue #if 1 +#ifdef NPTY +#undef NPTY +#endif /* NPTY */ #define NPTY 32 #else /* 1 */ #include @@ -147,7 +150,7 @@ { consopen, consclose, consread, conswrite, /* 0*/ consioctl, nulldev, nulldev, 0, consselect, - eno_mmap, eno_strat, cons_getc, cons_putc, D_TTY + eno_mmap, eno_strat, (getc_fcn_t *)cons_getc, (putc_fcn_t *)cons_putc, D_TTY }, NO_CDEVICE, /* 1*/ { @@ -157,7 +160,7 @@ }, { nulldev, nulldev, mmread, mmwrite, /* 3*/ - eno_ioctl, nulldev, nulldev, 0, mmselect, + eno_ioctl, nulldev, nulldev, 0, (select_fcn_t *)mmselect, eno_mmap, eno_strat, eno_getc, eno_putc, 0 }, { diff -urN xnu-344.49/bsd/dev/ppc/kern_machdep.c xnu-517/bsd/dev/ppc/kern_machdep.c --- xnu-344.49/bsd/dev/ppc/kern_machdep.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ppc/kern_machdep.c Sat Oct 25 00:25:25 2003 @@ -55,18 +55,14 @@ if (cpu_subtype == ms->cpu_subtype) return (TRUE); - if (cpu_subtype == CPU_SUBTYPE_POWERPC_601) - return (FALSE); - switch (cpu_subtype) { + case CPU_SUBTYPE_POWERPC_970: + /* Do not allow a 970 binary to run on non-970 systems */ + if (ms->cpu_subtype != CPU_SUBTYPE_POWERPC_970) + break; case CPU_SUBTYPE_POWERPC_7450: case CPU_SUBTYPE_POWERPC_7400: case CPU_SUBTYPE_POWERPC_750: - case CPU_SUBTYPE_POWERPC_604e: - case CPU_SUBTYPE_POWERPC_604: 
- case CPU_SUBTYPE_POWERPC_603ev: - case CPU_SUBTYPE_POWERPC_603e: - case CPU_SUBTYPE_POWERPC_603: case CPU_SUBTYPE_POWERPC_ALL: return (TRUE); } @@ -93,43 +89,35 @@ * cctools project. As of 2/16/98 this is what has been agreed upon for * the PowerPC subtypes. If an exact match is not found the subtype will * be picked from the following order: - * 7400, 750, 604e, 604, 603ev, 603e, 603, ALL + * 970(but only on 970), 7450, 7400, 750, ALL * Note the 601 is NOT in the list above. It is only picked via an exact * match. For details see Radar 2213821. * * To implement this function to follow what was agreed upon above, we use - * the fact there are currently 10 different subtypes. Exact matches return - * the value 10, the value 0 is returned for 601 that is not an exact match, - * and the values 9 thru 1 are returned for the subtypes listed in the order - * above. + * the fact there are currently 4 different subtypes. Exact matches return + * the value 6, and the values 5 thru 1 are returned for the + * subtypes listed in the order above. */ if (ms->cpu_subtype == cpu_subtype) - return 10; - if (cpu_subtype == CPU_SUBTYPE_POWERPC_601) - return 0; + return 6; switch (cpu_subtype) { - case CPU_SUBTYPE_POWERPC_7450: - return 9; - case CPU_SUBTYPE_POWERPC_7400: - return 8; - case CPU_SUBTYPE_POWERPC_750: - return 7; - case CPU_SUBTYPE_POWERPC_604e: - return 6; - case CPU_SUBTYPE_POWERPC_604: + case CPU_SUBTYPE_POWERPC_970: + /* Do not allow a 970 binary to run on non-970 systems */ + if (ms->cpu_subtype != CPU_SUBTYPE_POWERPC_970) + break; return 5; - case CPU_SUBTYPE_POWERPC_603ev: + case CPU_SUBTYPE_POWERPC_7450: return 4; - case CPU_SUBTYPE_POWERPC_603e: + case CPU_SUBTYPE_POWERPC_7400: return 3; - case CPU_SUBTYPE_POWERPC_603: + case CPU_SUBTYPE_POWERPC_750: return 2; case CPU_SUBTYPE_POWERPC_ALL: return 1; } /* - * If we get here it is because it is a cpusubtype we don't support (602 and - * 620) or new cpusubtype that was added since this code was written. 
Both + * If we get here it is because it is a cpusubtype we don't support + * or a new cpusubtype that was added since this code was written. Both * will be considered unacceptable. */ return 0; @@ -144,7 +132,7 @@ off_t base; off_t end; - base = trunc_page(start); + base = trunc_page_64(start); end = start + len; while (base < end) { diff -urN xnu-344.49/bsd/dev/ppc/km.c xnu-517/bsd/dev/ppc/km.c --- xnu-344.49/bsd/dev/ppc/km.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ppc/km.c Sat Oct 25 00:25:25 2003 @@ -345,7 +345,7 @@ } } if (tp->t_outq.c_cc > 0) { - timeout(kmtimeout, tp, hz); + timeout((timeout_fcn_t)kmtimeout, tp, hz); } tp->t_state &= ~TS_BUSY; ttwwakeup(tp); diff -urN xnu-344.49/bsd/dev/ppc/mem.c xnu-517/bsd/dev/ppc/mem.c --- xnu-344.49/bsd/dev/ppc/mem.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ppc/mem.c Sat Oct 25 00:25:25 2003 @@ -89,7 +89,6 @@ static caddr_t devzerobuf; -extern vm_offset_t mem_actual; extern pmap_t kernel_pmap; mmread(dev, uio) @@ -115,6 +114,7 @@ { register int o; register u_int c, v; + addr64_t vll; register struct iovec *iov; int error = 0; vm_offset_t where; @@ -135,45 +135,48 @@ /* minor device 0 is physical memory */ case 0: - v = trunc_page(uio->uio_offset); - if (uio->uio_offset >= ((dgWork.dgFlags & enaDiagDM) ? mem_actual : mem_size)) + vll = trunc_page_64(uio->uio_offset); + if(((vll >> 31) == 1) || vll >= ((dgWork.dgFlags & enaDiagDM) ? mem_actual : max_mem)) goto fault; - size= PAGE_SIZE; - if(dgWork.dgFlags & enaDiagDM) { /* Can we really get all memory? 
*/ - if (kmem_alloc_pageable(kernel_map, &where, size) != KERN_SUCCESS) { + if (kmem_alloc_pageable(kernel_map, &where, PAGE_SIZE) != KERN_SUCCESS) { goto fault; } else { - (void)mapping_make(kernel_pmap, 0, where, v, - VM_PROT_READ, 2, 0); /* Map it in for the moment */ + addr64_t collad; + + collad = mapping_make(kernel_pmap, (addr64_t)where, (ppnum_t)(vll >> 12), 0, 1, VM_PROT_READ); /* Map it in for the moment */ + if(collad) { /* See if it failed (shouldn't happen) */ + kmem_free(kernel_map, where, PAGE_SIZE); /* Toss the page */ + goto fault; /* Kill the transfer */ + } } } else { - if (kmem_alloc(kernel_map, &where, size) + if (kmem_alloc(kernel_map, &where, 4096) != KERN_SUCCESS) { goto fault; } } - o = uio->uio_offset - v; + o = uio->uio_offset - vll; c = min(PAGE_SIZE - o, (u_int)iov->iov_len); - error = uiomove((caddr_t) (where + o), c, uio); + error = uiomove((caddr_t)(where + o), c, uio); - if(dgWork.dgFlags & enaDiagDM) (void)mapping_remove(kernel_pmap, where); /* Unmap it */ + if(dgWork.dgFlags & enaDiagDM) (void)mapping_remove(kernel_pmap, (addr64_t)where); /* Unmap it */ kmem_free(kernel_map, where, PAGE_SIZE); continue; /* minor device 1 is kernel memory */ case 1: /* Do some sanity checking */ - if (((caddr_t)uio->uio_offset >= VM_MAX_KERNEL_ADDRESS) || - ((caddr_t)uio->uio_offset <= VM_MIN_KERNEL_ADDRESS)) + if (((addr64_t)uio->uio_offset > vm_last_addr) || + ((addr64_t)uio->uio_offset < VM_MIN_KERNEL_ADDRESS)) goto fault; c = iov->iov_len; - if (!kernacc((caddr_t)uio->uio_offset, c)) + if (!kernacc(uio->uio_offset, c)) goto fault; - error = uiomove((caddr_t)uio->uio_offset, (int)c, uio); + error = uiomove64(uio->uio_offset, (int)c, uio); continue; /* minor device 2 is EOF/RATHOLE */ diff -urN xnu-344.49/bsd/dev/ppc/stubs.c xnu-517/bsd/dev/ppc/stubs.c --- xnu-344.49/bsd/dev/ppc/stubs.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ppc/stubs.c Sat Oct 25 00:25:25 2003 @@ -43,38 +43,6 @@ /* - * copy a null terminated string from the kernel address 
space into - * the user address space. - * - if the user is denied write access, return EFAULT. - * - if the end of string isn't found before - * maxlen bytes are copied, return ENAMETOOLONG, - * indicating an incomplete copy. - * - otherwise, return 0, indicating success. - * the number of bytes copied is always returned in lencopied. - */ -int -copyoutstr(from, to, maxlen, lencopied) - void * from, * to; - size_t maxlen, *lencopied; -{ - int slen,len,error=0; - - /* XXX Must optimize this */ - - slen = strlen(from) + 1; - if (slen > maxlen) - error = ENAMETOOLONG; - - len = min(maxlen,slen); - if (copyout(from, to, len)) - error = EFAULT; - *lencopied = len; - - return error; -} - - -/* * copy a null terminated string from one point to another in * the kernel address space. * - no access checks are performed. diff -urN xnu-344.49/bsd/dev/ppc/sysctl.c xnu-517/bsd/dev/ppc/sysctl.c --- xnu-344.49/bsd/dev/ppc/sysctl.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/dev/ppc/sysctl.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +struct sysctl_oid *machdep_sysctl_list[] = +{ + (struct sysctl_oid *) 0 +}; + diff -urN xnu-344.49/bsd/dev/ppc/systemcalls.c xnu-517/bsd/dev/ppc/systemcalls.c --- xnu-344.49/bsd/dev/ppc/systemcalls.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ppc/systemcalls.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ #include #include #include +#include extern void unix_syscall( @@ -49,8 +51,8 @@ find_user_regs( thread_act_t act); -extern enter_funnel_section(funnel_t *funnel_lock); -extern exit_funnel_section(funnel_t *funnel_lock); +extern void enter_funnel_section(funnel_t *funnel_lock); +extern void exit_funnel_section(void); /* * Function: unix_syscall @@ -73,6 +75,21 @@ boolean_t flavor; int funnel_type; + flavor = (((unsigned int)regs->save_r0) == NULL)? 1: 0; + + if (flavor) + code = regs->save_r3; + else + code = regs->save_r0; + + if (kdebug_enable && (code != 180)) { + if (flavor) + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + regs->save_r4, regs->save_r5, regs->save_r6, regs->save_r7, 0); + else + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + regs->save_r3, regs->save_r4, regs->save_r5, regs->save_r6, 0); + } thread_act = current_act(); uthread = get_bsdthread_info(thread_act); @@ -81,15 +98,8 @@ else proc = current_proc(); - flavor = (regs->save_r0 == NULL)? 1: 0; - uthread->uu_ar0 = (int *)regs; - if (flavor) - code = regs->save_r3; - else - code = regs->save_r0; - callp = (code >= nsysent) ? 
&sysent[63] : &sysent[code]; #ifdef DEBUG @@ -118,24 +128,12 @@ } } - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; - - if (kdebug_enable && (code != 180)) { - if (flavor) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->save_r4, regs->save_r5, regs->save_r6, regs->save_r7, 0); - else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->save_r3, regs->save_r4, regs->save_r5, regs->save_r6, 0); - } - funnel_type = (int)callp->sy_funnel; - if(funnel_type == KERNEL_FUNNEL) + if (funnel_type == KERNEL_FUNNEL) enter_funnel_section(kernel_flock); else if (funnel_type == NETWORK_FUNNEL) enter_funnel_section(network_flock); - uthread->uu_rval[0] = 0; /* @@ -156,7 +154,9 @@ if (KTRPOINT(proc, KTR_SYSCALL)) ktrsyscall(proc, code, callp->sy_narg, uthread->uu_arg, funnel_type); + AUDIT_CMD(audit_syscall_enter(code, proc, uthread)); error = (*(callp->sy_call))(proc, (void *)uthread->uu_arg, &(uthread->uu_rval[0])); + AUDIT_CMD(audit_syscall_exit(error, proc, uthread)); regs = find_user_regs(thread_act); @@ -164,7 +164,7 @@ regs->save_srr0 -= 8; } else if (error != EJUSTRETURN) { if (error) { - regs->save_r3 = error; + regs->save_r3 = (long long)error; /* set the "pc" to execute cerror routine */ regs->save_srr0 -= 4; } else { /* (not error) */ @@ -177,10 +177,7 @@ if (KTRPOINT(proc, KTR_SYSRET)) ktrsysret(proc, code, error, uthread->uu_rval[0], funnel_type); - if(funnel_type == KERNEL_FUNNEL) - exit_funnel_section(kernel_flock); - else if (funnel_type == NETWORK_FUNNEL) - exit_funnel_section(network_flock); + exit_funnel_section(); if (kdebug_enable && (code != 180)) { KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, @@ -214,7 +211,7 @@ regs->save_srr0 -= 8; } else if (error != EJUSTRETURN) { if (error) { - regs->save_r3 = error; + regs->save_r3 = (long long)error; /* set the "pc" to execute cerror routine */ regs->save_srr0 -= 4; } else { /* (not error) */ @@ -236,10 +233,7 
@@ if (KTRPOINT(proc, KTR_SYSRET)) ktrsysret(proc, code, error, uthread->uu_rval[0], funnel_type); - if(funnel_type == KERNEL_FUNNEL) - exit_funnel_section(kernel_flock); - else if (funnel_type == NETWORK_FUNNEL) - exit_funnel_section(network_flock); + exit_funnel_section(); if (kdebug_enable && (code != 180)) { KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, @@ -263,33 +257,31 @@ struct timeval *tp; struct timezone *tzp; }; -/* NOTE THIS implementation is for ppc architectures only */ +/* NOTE THIS implementation is for ppc architectures only. + * It is infrequently called, since the commpage intercepts + * most calls in user mode. + */ int ppc_gettimeofday(p, uap, retval) struct proc *p; register struct gettimeofday_args *uap; register_t *retval; { - struct timeval atv; int error = 0; - struct timezone ltz; - //struct savearea *child_state; - extern simple_lock_data_t tz_slock; - - if (uap->tp) { - microtime(&atv); - retval[0] = atv.tv_sec; - retval[1] = atv.tv_usec; - } + + if (uap->tp) + clock_gettimeofday(&retval[0], &retval[1]); if (uap->tzp) { + struct timezone ltz; + extern simple_lock_data_t tz_slock; + usimple_lock(&tz_slock); ltz = tz; usimple_unlock(&tz_slock); - error = copyout((caddr_t)<z, (caddr_t)uap->tzp, - sizeof (tz)); + error = copyout((caddr_t)<z, (caddr_t)uap->tzp, sizeof (tz)); } - return(error); + return (error); } diff -urN xnu-344.49/bsd/dev/ppc/unix_signal.c xnu-517/bsd/dev/ppc/unix_signal.c --- xnu-344.49/bsd/dev/ppc/unix_signal.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ppc/unix_signal.c Sat Oct 25 00:25:25 2003 @@ -41,7 +41,6 @@ #include #include #include -#define __ELF__ 0 #include #define C_REDZONE_LEN 224 @@ -51,6 +50,42 @@ #define TRUNC_DOWN(a,b,c) (((((unsigned)a)-(b))/(c)) * (c)) /* + * The stack layout possibilities (info style); This needs to mach with signal trampoline code + * + * Traditional: 1 + * Traditional64: 20 + * Traditional64with vec: 25 + * 32bit context 30 + * 32bit context with vector 
35 + * 64bit context 40 + * 64bit context with vector 45 + * Dual context 50 + * Dual context with vector 55 + * + */ + +#define UC_TRAD 1 +#define UC_TRAD_VEC 6 +#define UC_TRAD64 20 +#define UC_TRAD64_VEC 25 +#define UC_FLAVOR 30 +#define UC_FLAVOR_VEC 35 +#define UC_FLAVOR64 40 +#define UC_FLAVOR64_VEC 45 +#define UC_DUAL 50 +#define UC_DUAL_VEC 55 + + /* The following are valid mcontext sizes */ +#define UC_FLAVOR_SIZE ((PPC_THREAD_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)) + +#define UC_FLAVOR_VEC_SIZE ((PPC_THREAD_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_FLOAT_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int)) + +#define UC_FLAVOR64_SIZE ((PPC_THREAD_STATE64_COUNT + PPC_EXCEPTION_STATE64_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)) + +#define UC_FLAVOR64_VEC_SIZE ((PPC_THREAD_STATE64_COUNT + PPC_EXCEPTION_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int)) + + +/* * Arrange for this process to run a signal handler */ @@ -61,7 +96,9 @@ int sig, mask; u_long code; { + kern_return_t kretn; struct mcontext mctx, *p_mctx; + struct mcontext64 mctx64, *p_mctx64; struct ucontext uctx, *p_uctx; siginfo_t sinfo, *p_sinfo; struct sigacts *ps = p->p_sigacts; @@ -72,42 +109,114 @@ thread_act_t th_act; struct uthread *ut; unsigned long paramp,linkp; - int infostyle = 1; + int infostyle = UC_TRAD; + int dualcontext =0; sig_t trampact; int vec_used = 0; int stack_size = 0; int stack_flags = 0; + void * tstate; + int flavor; + int ctx32 = 1; + int is_64signalregset(void); th_act = current_act(); ut = get_bsdthread_info(th_act); - state_count = PPC_EXCEPTION_STATE_COUNT; - if (act_machine_get_state(th_act, PPC_EXCEPTION_STATE, &mctx.es, &state_count) != KERN_SUCCESS) { - goto bad; - } + + if (p->p_sigacts->ps_siginfo & sigmask(sig)) { + infostyle = UC_FLAVOR; + } + if(is_64signalregset() && (infostyle == UC_FLAVOR)) { + dualcontext = 1; + infostyle = UC_DUAL; + } + if (p->p_sigacts->ps_64regset & 
sigmask(sig)) { + dualcontext = 0; + ctx32 = 0; + infostyle = UC_FLAVOR64; + } + if (is_64signalregset() && (infostyle == UC_TRAD)) { + ctx32=0; + infostyle = UC_TRAD64; + } + + /* I need this for SIGINFO anyway */ + flavor = PPC_THREAD_STATE; + tstate = (void *)&mctx.ss; state_count = PPC_THREAD_STATE_COUNT; - if (act_machine_get_state(th_act, PPC_THREAD_STATE, &mctx.ss, &state_count) != KERN_SUCCESS) { + if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) goto bad; - } - state_count = PPC_FLOAT_STATE_COUNT; - if (act_machine_get_state(th_act, PPC_FLOAT_STATE, &mctx.fs, &state_count) != KERN_SUCCESS) { - goto bad; - } - vec_save(th_act); - if (find_user_vec(th_act)) { - vec_used = 1; - state_count = PPC_VECTOR_STATE_COUNT; - if (act_machine_get_state(th_act, PPC_VECTOR_STATE, &mctx.vs, &state_count) != KERN_SUCCESS) { - goto bad; - } - + if ((ctx32 == 0) || dualcontext) { + flavor = PPC_THREAD_STATE64; + tstate = (void *)&mctx64.ss; + state_count = PPC_THREAD_STATE64_COUNT; + if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) + goto bad; } + if ((ctx32 == 1) || dualcontext) { + flavor = PPC_EXCEPTION_STATE; + tstate = (void *)&mctx.es; + state_count = PPC_EXCEPTION_STATE_COUNT; + if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) + goto bad; + } + + if ((ctx32 == 0) || dualcontext) { + flavor = PPC_EXCEPTION_STATE64; + tstate = (void *)&mctx64.es; + state_count = PPC_EXCEPTION_STATE64_COUNT; + + if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) + goto bad; + + } + + + if ((ctx32 == 1) || dualcontext) { + flavor = PPC_FLOAT_STATE; + tstate = (void *)&mctx.fs; + state_count = PPC_FLOAT_STATE_COUNT; + if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) + goto bad; + } + + if ((ctx32 == 0) || dualcontext) { + flavor = PPC_FLOAT_STATE; + tstate = (void *)&mctx64.fs; + 
state_count = PPC_FLOAT_STATE_COUNT; + if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) + goto bad; + + } + + + if (find_user_vec_curr()) { + vec_used = 1; + + if ((ctx32 == 1) || dualcontext) { + flavor = PPC_VECTOR_STATE; + tstate = (void *)&mctx.vs; + state_count = PPC_VECTOR_STATE_COUNT; + if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) + goto bad; + infostyle += 5; + } + + if ((ctx32 == 0) || dualcontext) { + flavor = PPC_VECTOR_STATE; + tstate = (void *)&mctx64.vs; + state_count = PPC_VECTOR_STATE_COUNT; + if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) + goto bad; + infostyle += 5; + } + } + trampact = ps->ps_trampact[sig]; oonstack = ps->ps_sigstk.ss_flags & SA_ONSTACK; - if (p->p_sigacts->ps_siginfo & sigmask(sig)) - infostyle = 2; /* figure out where our new stack lives */ if ((ps->ps_flags & SAS_ALTSTACK) && !oonstack && @@ -117,13 +226,30 @@ stack_size = ps->ps_sigstk.ss_size; ps->ps_sigstk.ss_flags |= SA_ONSTACK; } - else - sp = mctx.ss.r1; + else { + if (ctx32 == 0) + sp = (unsigned int)mctx64.ss.r1; + else + sp = mctx.ss.r1; + } + + /* put siginfo on top */ + /* preserve RED ZONE area */ sp = TRUNC_DOWN(sp, C_REDZONE_LEN, C_STK_ALIGN); - /* context goes first on stack */ + /* next are the saved registers */ + if ((ctx32 == 0) || dualcontext) { + sp -= sizeof(*p_mctx64); + p_mctx64 = (struct mcontext64 *)sp; + } + if ((ctx32 == 1) || dualcontext) { + sp -= sizeof(*p_mctx); + p_mctx = (struct mcontext *)sp; + } + + /* context goes first on stack */ sp -= sizeof(*p_uctx); p_uctx = (struct ucontext *) sp; @@ -131,13 +257,9 @@ sp -= sizeof(*p_sinfo); p_sinfo = (siginfo_t *) sp; - /* next are the saved registers */ - sp -= sizeof(*p_mctx); - p_mctx = (struct mcontext *)sp; - /* C calling conventions, create param save and linkage - * areas - */ + * areas + */ sp = TRUNC_DOWN(sp, C_PARAMSAVE_LEN, C_STK_ALIGN); paramp = sp; @@ 
-152,14 +274,25 @@ uctx.uc_stack.ss_flags |= SS_ONSTACK; uctx.uc_link = 0; - uctx.uc_mcsize = (size_t)((PPC_EXCEPTION_STATE_COUNT + PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)); + if (ctx32 == 0) + uctx.uc_mcsize = (size_t)((PPC_EXCEPTION_STATE64_COUNT + PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)); + else + uctx.uc_mcsize = (size_t)((PPC_EXCEPTION_STATE_COUNT + PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)); + if (vec_used) uctx.uc_mcsize += (size_t)(PPC_VECTOR_STATE_COUNT * sizeof(int)); - uctx.uc_mcontext = p_mctx; + + if (ctx32 == 0) + uctx.uc_mcontext = (void *)p_mctx64; + else + uctx.uc_mcontext = (void *)p_mctx; /* setup siginfo */ bzero((caddr_t)&sinfo, sizeof(siginfo_t)); sinfo.si_signo = sig; + sinfo.si_addr = (void *)mctx.ss.srr0; + sinfo.pad[0] = (unsigned int)mctx.ss.r1; + switch (sig) { case SIGCHLD: sinfo.si_pid = p->si_pid; @@ -233,13 +366,23 @@ break; } + /* copy info out to user space */ if (copyout((caddr_t)&uctx, (caddr_t)p_uctx, sizeof(struct ucontext))) goto bad; if (copyout((caddr_t)&sinfo, (caddr_t)p_sinfo, sizeof(siginfo_t))) goto bad; - if (copyout((caddr_t)&mctx, (caddr_t)p_mctx, uctx.uc_mcsize)) + if ((ctx32 == 0) || dualcontext) { + tstate = &mctx64; + if (copyout((caddr_t)tstate, (caddr_t)p_mctx64, uctx.uc_mcsize)) goto bad; + } + if ((ctx32 == 1) || dualcontext) { + tstate = &mctx; + if (copyout((caddr_t)tstate, (caddr_t)p_mctx, uctx.uc_mcsize)) + goto bad; + } + /* Place our arguments in arg registers: rtm dependent */ @@ -253,10 +396,9 @@ mctx.ss.srr1 = get_msr_exportmask(); /* MSR_EXPORT_MASK_SET */ mctx.ss.r1 = sp; state_count = PPC_THREAD_STATE_COUNT; - if (act_machine_set_state(th_act, PPC_THREAD_STATE, &mctx.ss, &state_count) != KERN_SUCCESS) { - goto bad; + if ((kretn = thread_setstatus(th_act, PPC_THREAD_STATE, &mctx.ss, &state_count)) != KERN_SUCCESS) { + panic("sendsig: thread_setstatus failed, ret = %08X\n", kretn); } - return; bad: @@ -280,8 +422,122 @@ * psl to gain 
improper priviledges or to cause * a machine fault. */ + +#define FOR64_TRANSITION 1 + + +#ifdef FOR64_TRANSITION + +struct osigreturn_args { + struct ucontext *uctx; +}; + +/* ARGSUSED */ +int +osigreturn(p, uap, retval) + struct proc *p; + struct osigreturn_args *uap; + int *retval; +{ + struct ucontext uctx; + struct ucontext *p_uctx; + struct mcontext64 mctx64; + struct mcontext64 *p_64mctx; + struct mcontext *p_mctx; + int error; + thread_act_t th_act; + struct sigacts *ps = p->p_sigacts; + sigset_t mask; + register sig_t action; + unsigned long state_count; + unsigned int state_flavor; + struct uthread * ut; + int vec_used = 0; + void *tsptr, *fptr, *vptr, *mactx; + void ppc_checkthreadstate(void *, int); + + th_act = current_act(); + /* lets use the larger one */ + mactx = (void *)&mctx64; + + ut = (struct uthread *)get_bsdthread_info(th_act); + if (error = copyin(uap->uctx, &uctx, sizeof(struct ucontext))) { + return(error); + } + if (error = copyin(uctx.uc_mcontext, mactx, uctx.uc_mcsize)) { + return(error); + } + + if (uctx.uc_onstack & 01) + p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; + else + p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; + + ut->uu_sigmask = uctx.uc_sigmask & ~sigcantmask; + if (ut->uu_siglist & ~ut->uu_sigmask) + signal_setast(current_act()); + + vec_used = 0; + switch (uctx.uc_mcsize) { + case UC_FLAVOR64_VEC_SIZE : + vec_used = 1; + case UC_FLAVOR64_SIZE : { + p_64mctx = (struct mcontext64 *)mactx; + tsptr = (void *)&p_64mctx->ss; + fptr = (void *)&p_64mctx->fs; + vptr = (void *)&p_64mctx->vs; + state_flavor = PPC_THREAD_STATE64; + state_count = PPC_THREAD_STATE64_COUNT; + } + break; + case UC_FLAVOR_VEC_SIZE : + vec_used = 1; + case UC_FLAVOR_SIZE: + default: { + p_mctx = (struct mcontext *)mactx; + tsptr = (void *)&p_mctx->ss; + fptr = (void *)&p_mctx->fs; + vptr = (void *)&p_mctx->vs; + state_flavor = PPC_THREAD_STATE; + state_count = PPC_THREAD_STATE_COUNT; + } + break; + } /* switch () */ + + /* validate the thread state, 
set/reset appropriate mode bits in srr1 */ + (void)ppc_checkthreadstate(tsptr, state_flavor); + + if (thread_setstatus(th_act, state_flavor, tsptr, &state_count) != KERN_SUCCESS) { + return(EINVAL); + } + + state_count = PPC_FLOAT_STATE_COUNT; + if (thread_setstatus(th_act, PPC_FLOAT_STATE, fptr, &state_count) != KERN_SUCCESS) { + return(EINVAL); + } + + mask = sigmask(SIGFPE); + if (((ut->uu_sigmask & mask) == 0) && (p->p_sigcatch & mask) && ((p->p_sigignore & mask) == 0)) { + action = ps->ps_sigact[SIGFPE]; + if((action != SIG_DFL) && (action != SIG_IGN)) { + thread_enable_fpe(th_act, 1); + } + } + + if (vec_used) { + state_count = PPC_VECTOR_STATE_COUNT; + if (thread_setstatus(th_act, PPC_VECTOR_STATE, vptr, &state_count) != KERN_SUCCESS) { + return(EINVAL); + } + } + return (EJUSTRETURN); +} + +#endif /* FOR64_TRANSITION */ + struct sigreturn_args { struct ucontext *uctx; + int infostyle; }; /* ARGSUSED */ @@ -291,19 +547,23 @@ struct sigreturn_args *uap; int *retval; { - struct ucontext uctx, *p_uctx; - struct mcontext mctx, *p_mctx; + struct ucontext uctx; + struct ucontext *p_uctx; + char mactx[sizeof(struct mcontext64)]; + struct mcontext *p_mctx; + struct mcontext64 *p_64mctx; int error; thread_act_t th_act; - struct ppc_float_state fs; - struct ppc_exception_state es; struct sigacts *ps = p->p_sigacts; sigset_t mask; register sig_t action; unsigned long state_count; - unsigned int nbits, rbits; + unsigned int state_flavor; struct uthread * ut; int vec_used = 0; + void *tsptr, *fptr, *vptr; + int infostyle = uap->infostyle; + void ppc_checkthreadstate(void *, int); th_act = current_act(); @@ -311,7 +571,9 @@ if (error = copyin(uap->uctx, &uctx, sizeof(struct ucontext))) { return(error); } - if (error = copyin(uctx.uc_mcontext, &mctx, sizeof(struct mcontext))) { + + + if (error = copyin(uctx.uc_mcontext, mactx, uctx.uc_mcsize)) { return(error); } @@ -319,32 +581,51 @@ p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= 
~SA_ONSTACK; - ut->uu_sigmask = uctx.uc_sigmask & ~sigcantmask; - + ut->uu_sigmask = uctx.uc_sigmask & ~sigcantmask; if (ut->uu_siglist & ~ut->uu_sigmask) signal_setast(current_act()); - nbits = get_msr_nbits(); - rbits = get_msr_rbits(); - /* adjust the critical fields */ - /* make sure naughty bits are off */ - mctx.ss.srr1 &= ~(nbits); - /* make sure necessary bits are on */ - mctx.ss.srr1 |= (rbits); + vec_used = 0; + switch (infostyle) { + case UC_FLAVOR64_VEC: + case UC_TRAD64_VEC: + vec_used = 1; + case UC_TRAD64: + case UC_FLAVOR64: { + p_64mctx = (struct mcontext64 *)mactx; + tsptr = (void *)&p_64mctx->ss; + fptr = (void *)&p_64mctx->fs; + vptr = (void *)&p_64mctx->vs; + state_flavor = PPC_THREAD_STATE64; + state_count = PPC_THREAD_STATE64_COUNT; + } + break; + case UC_FLAVOR_VEC : + case UC_TRAD_VEC : + vec_used = 1; + case UC_FLAVOR : + case UC_TRAD : + default: { + p_mctx = (struct mcontext *)mactx; + tsptr = (void *)&p_mctx->ss; + fptr = (void *)&p_mctx->fs; + vptr = (void *)&p_mctx->vs; + state_flavor = PPC_THREAD_STATE; + state_count = PPC_THREAD_STATE_COUNT; + } + break; + } /* switch () */ - state_count = (size_t)((PPC_EXCEPTION_STATE_COUNT + PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)); + /* validate the thread state, set/reset appropriate mode bits in srr1 */ + (void)ppc_checkthreadstate(tsptr, state_flavor); - if (uctx.uc_mcsize > state_count) - vec_used = 1; - - state_count = PPC_THREAD_STATE_COUNT; - if (act_machine_set_state(th_act, PPC_THREAD_STATE, &mctx.ss, &state_count) != KERN_SUCCESS) { + if (thread_setstatus(th_act, state_flavor, tsptr, &state_count) != KERN_SUCCESS) { return(EINVAL); } state_count = PPC_FLOAT_STATE_COUNT; - if (act_machine_set_state(th_act, PPC_FLOAT_STATE, &mctx.fs, &state_count) != KERN_SUCCESS) { + if (thread_setstatus(th_act, PPC_FLOAT_STATE, fptr, &state_count) != KERN_SUCCESS) { return(EINVAL); } @@ -358,11 +639,10 @@ if (vec_used) { state_count = PPC_VECTOR_STATE_COUNT; - if 
(act_machine_set_state(th_act, PPC_VECTOR_STATE, &mctx.vs, &state_count) != KERN_SUCCESS) { + if (thread_setstatus(th_act, PPC_VECTOR_STATE, vptr, &state_count) != KERN_SUCCESS) { return(EINVAL); } } - return (EJUSTRETURN); } diff -urN xnu-344.49/bsd/dev/ppc/unix_startup.c xnu-517/bsd/dev/ppc/unix_startup.c --- xnu-344.49/bsd/dev/ppc/unix_startup.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/ppc/unix_startup.c Sat Oct 25 00:25:25 2003 @@ -68,7 +68,7 @@ kern_return_t ret; if (nbuf == 0) - nbuf = atop(mem_size / 100); /* 1% */ + nbuf = atop_64(sane_size / 100); /* Get 1% of ram, but no more than we can map */ if (nbuf > 8192) nbuf = 8192; if (nbuf < 256) @@ -82,7 +82,7 @@ niobuf = 128; size = (nbuf + niobuf) * sizeof (struct buf); - size = round_page(size); + size = round_page_32(size); ret = kmem_suballoc(kernel_map, &firstaddr, @@ -106,13 +106,13 @@ buf = (struct buf * )firstaddr; bzero(buf,size); - if ((mem_size > (64 * 1024 * 1024)) || ncl) { + if ((sane_size > (64 * 1024 * 1024)) || ncl) { int scale; extern u_long tcp_sendspace; extern u_long tcp_recvspace; if ((nmbclusters = ncl) == 0) { - if ((nmbclusters = ((mem_size / 16) / MCLBYTES)) > 16384) + if ((nmbclusters = ((sane_size / 16) / MCLBYTES)) > 16384) nmbclusters = 16384; } if ((scale = nmbclusters / NMBCLUSTERS) > 1) { @@ -137,7 +137,7 @@ bsd_startupearly(); ret = kmem_suballoc(kernel_map, - &mbutl, + (vm_offset_t *) &mbutl, (vm_size_t) (nmbclusters * MCLBYTES), FALSE, TRUE, diff -urN xnu-344.49/bsd/dev/random/YarrowCoreLib/src/prng.c xnu-517/bsd/dev/random/YarrowCoreLib/src/prng.c --- xnu-344.49/bsd/dev/random/YarrowCoreLib/src/prng.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/random/YarrowCoreLib/src/prng.c Sat Oct 25 00:25:25 2003 @@ -343,8 +343,8 @@ #if defined(macintosh) || defined(__APPLE__) #if (defined(TARGET_API_MAC_OSX) || defined(KERNEL_BUILD)) struct timeval tv; - int32_t endTime; - #else TARGET_API_MAC_CARBON + int64_t endTime, curTime; + #else /* TARGET_API_MAC_CARBON */ UnsignedWide 
uwide; /* struct needed for Microseconds() */ LONGLONG start; LONGLONG now; @@ -360,15 +360,11 @@ #if (defined(TARGET_API_MAC_OSX) || defined(KERNEL_BUILD)) /* note we can't loop for more than a million microseconds */ #ifdef KERNEL_BUILD - microtime (&tv); + microuptime (&tv); #else gettimeofday(&tv, NULL); #endif - endTime = tv.tv_usec + ticks; - if(endTime > 1000000) { - /* handle rollover now */ - endTime -= 1000000; - } + endTime = (int64_t)tv.tv_sec*1000000LL + (int64_t)tv.tv_usec + ticks; #else /* TARGET_API_MAC_OSX */ Microseconds(&uwide); start = UnsignedWideToUInt64(uwide); @@ -393,9 +389,10 @@ #ifdef TARGET_API_MAC_OSX gettimeofday(&tv, NULL); #else - microtime (&tv); + microuptime (&tv); + curTime = (int64_t)tv.tv_sec*1000000LL + (int64_t)tv.tv_usec; #endif - } while(tv.tv_usec < endTime); + } while(curTime < endTime); #else Microseconds(&uwide); now = UnsignedWideToUInt64(uwide); diff -urN xnu-344.49/bsd/dev/random/randomdev.c xnu-517/bsd/dev/random/randomdev.c --- xnu-344.49/bsd/dev/random/randomdev.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/random/randomdev.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999, 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -40,6 +40,8 @@ #define RANDOM_MAJOR -1 /* let the kernel pick the device number */ +d_ioctl_t random_ioctl; + /* * A struct describing which functions will get invoked for certain * actions. 
@@ -50,7 +52,7 @@ random_close, /* close */ random_read, /* read */ random_write, /* write */ - eno_ioctl, /* ioctl */ + random_ioctl, /* ioctl */ nulldev, /* stop */ nulldev, /* reset */ NULL, /* tty's */ @@ -142,14 +144,33 @@ } devfs_make_node(makedev (ret, 0), DEVFS_CHAR, - UID_ROOT, GID_WHEEL, 0644, "random", 0); + UID_ROOT, GID_WHEEL, 0666, "random", 0); /* * also make urandom * (which is exactly the same thing in our context) */ devfs_make_node(makedev (ret, 1), DEVFS_CHAR, - UID_ROOT, GID_WHEEL, 0644, "urandom", 0); + UID_ROOT, GID_WHEEL, 0666, "urandom", 0); +} + +int +random_ioctl(dev, cmd, data, flag, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct proc *p; +{ + switch (cmd) { + case FIONBIO: + case FIOASYNC: + break; + default: + return ENODEV; + } + + return (0); } /* @@ -172,8 +193,10 @@ if (flags & FWRITE) { if (securelevel >= 2) return (EPERM); +#ifndef __APPLE__ if ((securelevel >= 1) && suser(p->p_ucred, &p->p_acflag)) return (EPERM); +#endif /* !__APPLE__ */ } return (0); diff -urN xnu-344.49/bsd/dev/vn/shadow.c xnu-517/bsd/dev/vn/shadow.c --- xnu-344.49/bsd/dev/vn/shadow.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/vn/shadow.c Sat Oct 25 00:25:25 2003 @@ -61,11 +61,11 @@ #include #define my_malloc(a) malloc(a) #define my_free(a) free(a) -#else TEST_SHADOW +#else /* !TEST_SHADOW */ #include #define my_malloc(a) _MALLOC(a, M_TEMP, M_WAITOK) #define my_free(a) FREE(a, M_TEMP) -#endif TEST_SHADOW +#endif /* TEST_SHADOW */ #include "shadow.h" diff -urN xnu-344.49/bsd/dev/vn/vn.c xnu-517/bsd/dev/vn/vn.c --- xnu-344.49/bsd/dev/vn/vn.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/dev/vn/vn.c Sat Oct 25 00:25:25 2003 @@ -73,11 +73,10 @@ #include #include #include -#include #include #include #include -#include +#include #include #include @@ -91,6 +90,17 @@ #include +extern void +vfs_io_maxsegsize(struct vnode *vp, + int flags, /* B_READ or B_WRITE */ + int *maxsegsize); + +extern void +vfs_io_attributes(struct vnode *vp, + int flags, /* 
B_READ or B_WRITE */ + int *iosize, + int *vectors); + #include "shadow.h" static ioctl_fcn_t vnioctl_chr; @@ -388,7 +398,7 @@ VOP_TRUNCATE(vn->sc_shadow_vp, size, IO_SYNC, vn->sc_cred, p); VOP_UNLOCK(vn->sc_shadow_vp, 0, p); -#endif 0 +#endif } error = file_io(vn->sc_shadow_vp, vn->sc_cred, UIO_WRITE, base + start, @@ -494,8 +504,10 @@ * simply read or write less. */ if (bp->b_blkno >= vn->sc_size) { - bp->b_error = EINVAL; - bp->b_flags |= B_ERROR | B_INVAL; + if (bp->b_blkno > vn->sc_size) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR | B_INVAL; + } biodone(bp); return; } @@ -531,8 +543,10 @@ struct vn_ioctl *vio; int error; u_long *f; + int num = 0; u_int64_t * o; int unit; + int size = 0; unit = vnunit(dev); if (vnunit(dev) >= NVNDEVICE) { @@ -548,10 +562,15 @@ o = (u_int64_t *)data; switch (cmd) { case VNIOCDETACH: + case DKIOCGETBLOCKSIZE: + case DKIOCSETBLOCKSIZE: case DKIOCGETMAXBLOCKCOUNTREAD: case DKIOCGETMAXBLOCKCOUNTWRITE: case DKIOCGETMAXSEGMENTCOUNTREAD: case DKIOCGETMAXSEGMENTCOUNTWRITE: + case DKIOCGETMAXSEGMENTBYTECOUNTREAD: + case DKIOCGETMAXSEGMENTBYTECOUNTWRITE: + case DKIOCGETBLOCKCOUNT: case DKIOCGETBLOCKCOUNT32: if ((vn->sc_flags & VNF_INITED) == 0) { return (ENXIO); @@ -562,16 +581,36 @@ } switch (cmd) { case DKIOCGETMAXBLOCKCOUNTREAD: - *o = vn->sc_vp->v_mount->mnt_maxreadcnt / vn->sc_secsize; + vfs_io_attributes(vn->sc_vp, B_READ, &size, &num); + *o = size / vn->sc_secsize; break; case DKIOCGETMAXBLOCKCOUNTWRITE: - *o = vn->sc_vp->v_mount->mnt_maxwritecnt / vn->sc_secsize; + vfs_io_attributes(vn->sc_vp, B_WRITE, &size, &num); + *o = size / vn->sc_secsize; + break; + case DKIOCGETMAXBYTECOUNTREAD: + vfs_io_attributes(vn->sc_vp, B_READ, &size, &num); + *o = size; + break; + case DKIOCGETMAXBYTECOUNTWRITE: + vfs_io_attributes(vn->sc_vp, B_WRITE, &size, &num); + *o = size; break; case DKIOCGETMAXSEGMENTCOUNTREAD: - *o = vn->sc_vp->v_mount->mnt_segreadcnt; + vfs_io_attributes(vn->sc_vp, B_READ, &size, &num); + *o = num; break; case 
DKIOCGETMAXSEGMENTCOUNTWRITE: - *o = vn->sc_vp->v_mount->mnt_segwritecnt; + vfs_io_attributes(vn->sc_vp, B_WRITE, &size, &num); + *o = num; + break; + case DKIOCGETMAXSEGMENTBYTECOUNTREAD: + vfs_io_maxsegsize(vn->sc_vp, B_READ, &size); + *o = size; + break; + case DKIOCGETMAXSEGMENTBYTECOUNTWRITE: + vfs_io_maxsegsize(vn->sc_vp, B_WRITE, &size); + *o = size; break; case DKIOCGETBLOCKSIZE: *f = vn->sc_secsize; @@ -598,7 +637,7 @@ case DKIOCGETBLOCKCOUNT32: *f = vn->sc_size; break; - case DKIOCGETBLOCKCOUNT64: + case DKIOCGETBLOCKCOUNT: *o = vn->sc_size; break; case VNIOCSHADOW: @@ -757,7 +796,7 @@ vn->sc_size = (quad_t)vio->vn_size * PAGE_SIZE / vn->sc_secsize; else vn->sc_size = vattr.va_size / vn->sc_secsize; -#endif 0 +#endif vn->sc_secsize = DEV_BSIZE; vn->sc_fsize = vattr.va_size; vn->sc_size = vattr.va_size / vn->sc_secsize; @@ -980,4 +1019,4 @@ printf("vninit: devfs_make_node failed!\n"); } } -#endif NVNDEVICE +#endif /* NVNDEVICE */ diff -urN xnu-344.49/bsd/hfs/hfs.h xnu-517/bsd/hfs/hfs.h --- xnu-344.49/bsd/hfs/hfs.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,6 +26,8 @@ #ifndef __HFS__ #define __HFS__ +#define HFS_SPARSE_DEV 1 + #include #ifdef KERNEL @@ -38,6 +40,7 @@ #include #include #include +#include #include @@ -46,6 +49,7 @@ #include #include #include +#include struct uio; // This is more effective than #include in case KERNEL is undefined... @@ -60,6 +64,11 @@ #define HFS_MAX_DEFERED_ALLOC (1024*1024) +// 32 gigs is a "big" file (i.e. 
one that when deleted +// would touch enough data that we should break it into +// multiple separate transactions +#define HFS_BIGFILE_SIZE (32LL * 1024LL * 1024LL * 1024LL) + enum { kMDBSize = 512 }; /* Size of I/O transfer to read entire MDB */ @@ -104,7 +113,24 @@ * superuser may continue to allocate blocks. */ #define HFS_MINFREE 1 -#define HFS_MAXRESERVE (u_int64_t)(250*1024*1024) +#define HFS_MAXRESERVE ((u_int64_t)(250*1024*1024)) + +/* + * The system distinguishes between the desirable low-disk + * notifiaction levels for root volumes and non-root volumes. + * The various thresholds are computed as a fraction of the + * volume size, all capped at a certain fixed level + */ + +#define HFS_ROOTLOWDISKTRIGGERFRACTION 5 +#define HFS_ROOTLOWDISKTRIGGERLEVEL ((u_int64_t)(250*1024*1024)) +#define HFS_ROOTLOWDISKSHUTOFFFRACTION 6 +#define HFS_ROOTLOWDISKSHUTOFFLEVEL ((u_int64_t)(375*1024*1024)) + +#define HFS_LOWDISKTRIGGERFRACTION 1 +#define HFS_LOWDISKTRIGGERLEVEL ((u_int64_t)(50*1024*1024)) +#define HFS_LOWDISKSHUTOFFFRACTION 2 +#define HFS_LOWDISKSHUTOFFLEVEL ((u_int64_t)(75*1024*1024)) /* Internal Data structures*/ @@ -183,10 +209,7 @@ /* This structure describes the HFS specific mount structure data. 
*/ typedef struct hfsmount { - u_int8_t hfs_fs_ronly; /* Whether this was mounted as read-initially */ - u_int8_t hfs_unknownpermissions; /* Whether this was mounted with MNT_UNKNOWNPERMISSIONS */ - u_int8_t hfs_media_writeable; - u_int8_t hfs_orphans_cleaned; + u_int32_t hfs_flags; /* see below */ /* Physical Description */ u_long hfs_phys_block_count; /* Num of PHYSICAL blocks of volume */ @@ -206,9 +229,6 @@ mode_t hfs_file_mask; /* mask to and with file protection bits */ u_long hfs_encoding; /* Defualt encoding for non hfs+ volumes */ - /* simple lock for shared meta renaming */ - simple_lock_data_t hfs_renamelock; - /* HFS Specific */ struct vfsVCB hfs_vcb; struct cat_desc hfs_privdir_desc; @@ -217,19 +237,66 @@ hfs_to_unicode_func_t hfs_get_unicode; unicode_to_hfs_func_t hfs_get_hfsname; + /* Quota variables: */ struct quotafile hfs_qfiles[MAXQUOTAS]; /* quota files */ - // XXXdbg + /* Journaling variables: */ void *jnl; // the journal for this volume (if one exists) struct vnode *jvp; // device where the journal lives (may be equal to devvp) u_int32_t jnl_start; // start block of the journal file (so we don't delete it) + u_int32_t jnl_size; u_int32_t hfs_jnlfileid; u_int32_t hfs_jnlinfoblkid; - volatile int readers; + volatile int readers; volatile int blocker; + + /* Notification variables: */ + unsigned long hfs_notification_conditions; + u_int32_t hfs_freespace_notify_warninglimit; + u_int32_t hfs_freespace_notify_desiredlevel; + + /* Metadata allocation zone variables: */ + u_int32_t hfs_metazone_start; + u_int32_t hfs_metazone_end; + u_int32_t hfs_hotfile_start; + u_int32_t hfs_hotfile_end; + int hfs_hotfile_freeblks; + int hfs_hotfile_maxblks; + int hfs_overflow_maxblks; + int hfs_catalog_maxblks; + + /* Hot File Clustering variables: */ + enum hfc_stage hfc_stage; /* what are we up to... 
*/ + time_t hfc_timebase; /* recording period start time */ + time_t hfc_timeout; /* recording period stop time */ + void * hfc_recdata; /* recording data (opaque) */ + int hfc_maxfiles; /* maximum files to track */ + struct vnode * hfc_filevp; + +#ifdef HFS_SPARSE_DEV + /* Sparse device variables: */ + struct vnode * hfs_backingfs_rootvp; + int hfs_sparsebandblks; +#endif } hfsmount_t; -#define hfs_private_metadata_dir hfs_privdir_desc.cd_cnid + +/* HFS mount point flags */ +#define HFS_READ_ONLY 0x001 +#define HFS_UNKNOWN_PERMS 0x002 +#define HFS_WRITEABLE_MEDIA 0x004 +#define HFS_CLEANED_ORPHANS 0x008 +#define HFS_X 0x010 +#define HFS_CASE_SENSITIVE 0x020 +#define HFS_STANDARD 0x040 +#define HFS_METADATA_ZONE 0x080 +#define HFS_FRAGMENTED_FREESPACE 0x100 +#define HFS_NEED_JNL_RESET 0x200 + +#ifdef HFS_SPARSE_DEV +#define HFS_HAS_SPARSE_DEVICE 0x400 +#endif + #define hfs_global_shared_lock_acquire(hfsmp) \ do { \ @@ -276,16 +343,6 @@ #define MAKE_INODE_NAME(name,linkno) \ (void) sprintf((name), "%s%d", HFS_INODE_PREFIX, (linkno)) -/* - * Write check macro - */ -#define WRITE_CK(VNODE, FUNC_NAME) { \ - if ((VNODE)->v_mount->mnt_flag & MNT_RDONLY) { \ - DBG_ERR(("%s: ATTEMPT TO WRITE A READONLY VOLUME\n", \ - FUNC_NAME)); \ - return(EROFS); \ - } \ -} /* structure to hold a "." or ".." directory entry (12 bytes) */ typedef struct hfsdotentry { @@ -304,55 +361,6 @@ ((sizeof(struct dirent) - (NAME_MAX+1)) + (((namlen)+1 + 3) &~ 3)) -enum { - kCatalogFolderNode = 1, - kCatalogFileNode = 2 -}; - -/* - * CatalogNodeData has same layout as the on-disk HFS Plus file/dir records. - * Classic hfs file/dir records are converted to match this layout. - * - * The cnd_extra padding allows big hfs plus thread records (520 bytes max) - * to be read onto this stucture during a cnid lookup. 
- * - */ -struct CatalogNodeData { - int16_t cnd_type; - u_int16_t cnd_flags; - u_int32_t cnd_valence; /* dirs only */ - u_int32_t cnd_nodeID; - u_int32_t cnd_createDate; - u_int32_t cnd_contentModDate; - u_int32_t cnd_attributeModDate; - u_int32_t cnd_accessDate; - u_int32_t cnd_backupDate; - u_int32_t cnd_ownerID; - u_int32_t cnd_groupID; - u_int8_t cnd_adminFlags; /* super-user changeable flags */ - u_int8_t cnd_ownerFlags; /* owner changeable flags */ - u_int16_t cnd_mode; /* file type + permission bits */ - union { - u_int32_t cndu_iNodeNum; /* indirect links only */ - u_int32_t cndu_linkCount; /* indirect nodes only */ - u_int32_t cndu_rawDevice; /* special files (FBLK and FCHR) only */ - } cnd_un; - u_int8_t cnd_finderInfo[32]; - u_int32_t cnd_textEncoding; - u_int32_t cnd_reserved; - HFSPlusForkData cnd_datafork; - HFSPlusForkData cnd_rsrcfork; - u_int32_t cnd_iNodeNumCopy; - u_int32_t cnd_linkCNID; /* for hard links only */ - u_int8_t cnd_extra[264]; /* make struct at least 520 bytes long */ -}; -typedef struct CatalogNodeData CatalogNodeData; - -#define cnd_iNodeNum cnd_un.cndu_iNodeNum -#define cnd_linkCount cnd_un.cndu_linkCount -#define cnd_rawDevice cnd_un.cndu_rawDevice - - enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; @@ -388,6 +396,9 @@ #define FCBTOVCB(FCB) (&(((struct hfsmount *)((FCB)->ff_cp->c_vp->v_mount->mnt_data))->hfs_vcb.vcb_vcb)) +#define HFS_KNOTE(vp, hint) KNOTE(&VTOC(vp)->c_knotes, (hint)) + + #define E_NONE 0 #define kHFSBlockSize 512 @@ -411,11 +422,10 @@ u_int32_t to_bsd_time(u_int32_t hfs_time); u_int32_t to_hfs_time(u_int32_t bsd_time); -int hfs_flushfiles(struct mount *mp, int flags, struct proc *p); int hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush); #define HFS_ALTFLUSH 1 -short hfsUnmount(struct hfsmount *hfsmp, struct proc *p); +extern int hfsUnmount(struct hfsmount *hfsmp, struct proc *p); extern int hfs_getcnode(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *descp, @@ 
-493,6 +503,10 @@ extern void replace_desc(struct cnode *cp, struct cat_desc *cdp); extern int hfs_namecmp(const char *, size_t, const char *, size_t); + +extern int hfs_virtualmetafile(struct cnode *); + +void hfs_generate_volume_notifications(struct hfsmount *hfsmp); #endif /* __APPLE_API_PRIVATE */ diff -urN xnu-344.49/bsd/hfs/hfs_attrlist.c xnu-517/bsd/hfs/hfs_attrlist.c --- xnu-344.49/bsd/hfs/hfs_attrlist.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_attrlist.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -45,9 +45,6 @@ -extern uid_t console_user; - - /* Routines that are shared by hfs_setattr: */ extern int hfs_write_access(struct vnode *vp, struct ucred *cred, struct proc *p, Boolean considerFlags); @@ -71,22 +68,22 @@ static void packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, - struct vnode *vp); + struct vnode *vp, struct proc *p); static void packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, - struct vnode *vp); + struct vnode *vp, struct proc *p); static void packcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * cdp, - struct cat_attr * cap); + struct cat_attr * cap, struct proc *p); static void packfileattr(struct attrblock *abp, struct hfsmount *hfsmp, struct cat_attr *cattrp, struct cat_fork *datafork, - struct cat_fork *rsrcfork); + struct cat_fork *rsrcfork, struct proc *p); static void packdirattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * descp, - struct cat_attr * cattrp); + struct cat_attr * cattrp, struct proc *p); static void unpackattrblk(struct attrblock *abp, struct vnode *vp); @@ -192,39 +189,34 @@ (alist->commonattr & ATTR_CMN_OBJPERMANENTID) && (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord)) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + cat_cookie_t 
cookie = {0}; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) return (EROFS); if ((error = hfs_write_access(vp, ap->a_cred, ap->a_p, false)) != 0) return (error); - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - hfs_global_shared_lock_release(hfsmp); - return error; - } - } + /* + * Reserve some space in the Catalog file. + */ + error = cat_preflight(hfsmp, CAT_CREATE, &cookie, ap->a_p); + if (error) + return (error); /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, ap->a_p); + error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, + LK_EXCLUSIVE, ap->a_p); if (error) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - return (error); + cat_postflight(hfsmp, &cookie, ap->a_p); + return (error); } error = cat_insertfilethread(hfsmp, &cp->c_desc); - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p); + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, + ap->a_p); - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + cat_postflight(hfsmp, &cookie, ap->a_p); if (error) return (error); @@ -291,7 +283,7 @@ attrblk.ab_blocksize = attrblocksize; hfs_packattrblk(&attrblk, hfsmp, vp, &cp->c_desc, &cp->c_attr, - datafp, rsrcfp); + datafp, rsrcfp, ap->a_p); /* Don't copy out more data than was generated */ attrbufsize = MIN(attrbufsize, (u_int)varptr - (u_int)attrbufptr); @@ -346,7 +338,7 @@ u_long saved_flags; int error = 0; - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + if (hfsmp->hfs_flags & HFS_READ_ONLY) return (EROFS); if ((alist->bitmapcount != ATTR_BIT_MAP_COUNT) || ((alist->commonattr & ~ATTR_CMN_SETMASK) != 0) || @@ -378,7 +370,7 @@ if (hfsmp->jnl && cp->c_datafork) { struct HFSPlusExtentDescriptor *extd; - extd = &cp->c_datafork->ff_data.cf_extents[0]; + extd = 
&cp->c_datafork->ff_extents[0]; if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { return EPERM; } @@ -503,6 +495,10 @@ struct cat_desc to_desc = {0}; struct cat_desc todir_desc = {0}; struct cat_desc new_desc = {0}; + cat_cookie_t cookie = {0}; + int catreserve = 0; + int catlocked = 0; + int started_tr = 0; todir_desc.cd_parentcnid = kRootParID; todir_desc.cd_cnid = kRootParID; @@ -517,38 +513,38 @@ // XXXdbg hfs_global_shared_lock_acquire(hfsmp); if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - error = EINVAL; - /* Restore the old name in the VCB */ - copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL); - vcb->vcbFlags |= 0xFF00; - goto ErrorExit; - } + if ((error = journal_start_transaction(hfsmp->jnl) != 0)) { + goto rename_out; + } + started_tr = 1; } + /* + * Reserve some space in the Catalog file. + */ + error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p); + if (error) { + goto rename_out; + } + catreserve = 1; /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (error) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - - /* Restore the old name in the VCB */ - copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL); - vcb->vcbFlags |= 0xFF00; - goto ErrorExit; + goto rename_out; } + catlocked = 1; error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc); - - /* Unlock the Catalog */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); +rename_out: + if (catlocked) { + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + } + if (catreserve) { + cat_postflight(hfsmp, &cookie, p); + } + if (started_tr) { + journal_end_transaction(hfsmp->jnl); } hfs_global_shared_lock_release(hfsmp); @@ -565,7 +561,7 @@ 
cp->c_desc.cd_nameptr = 0; cp->c_desc.cd_namelen = 0; cp->c_desc.cd_flags &= ~CD_HASBUF; - FREE(name, M_TEMP); + remove_name(name); } /* Update cnode's catalog descriptor */ replace_desc(cp, &new_desc); @@ -788,14 +784,12 @@ cdescp = &cp->c_desc; cattrp = &cp->c_attr; if (cp->c_datafork) { - c_datafork.cf_size = cp->c_datafork->ff_data.cf_size; - c_datafork.cf_clump = cp->c_datafork->ff_data.cf_clump; - c_datafork.cf_blocks = cp->c_datafork->ff_data.cf_blocks; + c_datafork.cf_size = cp->c_datafork->ff_size; + c_datafork.cf_blocks = cp->c_datafork->ff_blocks; } if (cp->c_rsrcfork) { - c_rsrcfork.cf_size = cp->c_rsrcfork->ff_data.cf_size; - c_rsrcfork.cf_clump = cp->c_rsrcfork->ff_data.cf_clump; - c_rsrcfork.cf_blocks = cp->c_rsrcfork->ff_data.cf_blocks; + c_rsrcfork.cf_size = cp->c_rsrcfork->ff_size; + c_rsrcfork.cf_blocks = cp->c_rsrcfork->ff_blocks; } } } @@ -808,7 +802,7 @@ /* Pack catalog entries into attribute buffer. */ hfs_packattrblk(&attrblk, hfsmp, vp, cdescp, cattrp, - &c_datafork, &c_rsrcfork); + &c_datafork, &c_rsrcfork, p); currattrbufsize = ((char *)varptr - (char *)attrbufptr); /* All done with cnode. 
*/ @@ -910,25 +904,26 @@ struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *datafork, - struct cat_fork *rsrcfork) + struct cat_fork *rsrcfork, + struct proc *p) { struct attrlist *attrlistp = abp->ab_attrlist; if (attrlistp->volattr) { if (attrlistp->commonattr) - packvolcommonattr(abp, hfsmp, vp); + packvolcommonattr(abp, hfsmp, vp, p); if (attrlistp->volattr & ~ATTR_VOL_INFO) - packvolattr(abp, hfsmp, vp); + packvolattr(abp, hfsmp, vp, p); } else { if (attrlistp->commonattr) - packcommonattr(abp, hfsmp, vp, descp, attrp); + packcommonattr(abp, hfsmp, vp, descp, attrp, p); if (attrlistp->dirattr && S_ISDIR(attrp->ca_mode)) - packdirattr(abp, hfsmp, vp, descp,attrp); + packdirattr(abp, hfsmp, vp, descp,attrp, p); if (attrlistp->fileattr && !S_ISDIR(attrp->ca_mode)) - packfileattr(abp, hfsmp, attrp, datafork, rsrcfork); + packfileattr(abp, hfsmp, attrp, datafork, rsrcfork, p); } } @@ -966,7 +961,8 @@ struct attrblock *abp, struct vnode *vp, char *name, - int namelen) + int namelen, + struct proc *p) { void *varbufptr; struct attrreference * attr_refptr; @@ -1022,7 +1018,7 @@ * Pack common volume attributes. 
*/ static void -packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp) +packvolcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct proc *p) { attrgroup_t attr; void *attrbufptr = *abp->ab_attrbufpp; @@ -1035,7 +1031,7 @@ attr = abp->ab_attrlist->commonattr; if (ATTR_CMN_NAME & attr) { - packnameattr(abp, vp, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen); + packnameattr(abp, vp, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen, p); attrbufptr = *abp->ab_attrbufpp; varbufptr = *abp->ab_varbufpp; } @@ -1107,7 +1103,7 @@ } if (ATTR_CMN_OWNERID & attr) { if (cp->c_uid == UNKNOWNUID) - *((uid_t *)attrbufptr)++ = console_user; + *((uid_t *)attrbufptr)++ = p->p_ucred->cr_uid; else *((uid_t *)attrbufptr)++ = cp->c_uid; } @@ -1154,7 +1150,7 @@ static void -packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp) +packvolattr(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct proc *p) { attrgroup_t attr; void *attrbufptr = *abp->ab_attrbufpp; @@ -1179,7 +1175,6 @@ if (ATTR_VOL_SPACEFREE & attr) { *((off_t *)attrbufptr)++ = (off_t)hfs_freeblks(hfsmp, 0) * (off_t)vcb->blockSize; - } if (ATTR_VOL_SPACEAVAIL & attr) { *((off_t *)attrbufptr)++ = (off_t)hfs_freeblks(hfsmp, 1) * @@ -1263,31 +1258,70 @@ vcapattrptr = (vol_capabilities_attr_t *)attrbufptr; if (vcb->vcbSigWord == kHFSPlusSigWord) { + u_int32_t journal_active; + u_int32_t case_sensitive; + + if (hfsmp->jnl) + journal_active = VOL_CAP_FMT_JOURNAL_ACTIVE; + else + journal_active = 0; + + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) + case_sensitive = VOL_CAP_FMT_CASE_SENSITIVE; + else + case_sensitive = 0; + vcapattrptr->capabilities[VOL_CAPABILITIES_FORMAT] = VOL_CAP_FMT_PERSISTENTOBJECTIDS | VOL_CAP_FMT_SYMBOLICLINKS | - VOL_CAP_FMT_HARDLINKS; + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + journal_active | + case_sensitive | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS ; } else { /* Plain HFS */ 
vcapattrptr->capabilities[VOL_CAPABILITIES_FORMAT] = - VOL_CAP_FMT_PERSISTENTOBJECTIDS; + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS ; } vcapattrptr->capabilities[VOL_CAPABILITIES_INTERFACES] = VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_ATTRLIST | VOL_CAP_INT_NFSEXPORT | - VOL_CAP_INT_READDIRATTR ; + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK ; vcapattrptr->capabilities[VOL_CAPABILITIES_RESERVED1] = 0; vcapattrptr->capabilities[VOL_CAPABILITIES_RESERVED2] = 0; vcapattrptr->valid[VOL_CAPABILITIES_FORMAT] = VOL_CAP_FMT_PERSISTENTOBJECTIDS | VOL_CAP_FMT_SYMBOLICLINKS | - VOL_CAP_FMT_HARDLINKS; + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS ; vcapattrptr->valid[VOL_CAPABILITIES_INTERFACES] = VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_ATTRLIST | VOL_CAP_INT_NFSEXPORT | - VOL_CAP_INT_READDIRATTR ; + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK ; vcapattrptr->valid[VOL_CAPABILITIES_RESERVED1] = 0; vcapattrptr->valid[VOL_CAPABILITIES_RESERVED2] = 0; @@ -1322,7 +1356,8 @@ struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * cdp, - struct cat_attr * cap) + struct cat_attr * cap, + struct proc *p) { attrgroup_t attr = abp->ab_attrlist->commonattr; struct mount *mp = HFSTOVFS(hfsmp); @@ -1331,7 +1366,7 @@ u_long attrlength = 0; if (ATTR_CMN_NAME & attr) { - packnameattr(abp, vp, cdp->cd_nameptr, cdp->cd_namelen); + packnameattr(abp, vp, cdp->cd_nameptr, cdp->cd_namelen, p); attrbufptr = *abp->ab_attrbufpp; varbufptr = *abp->ab_varbufpp; } @@ -1409,7 +1444,7 @@ } if (ATTR_CMN_OWNERID & attr) { 
*((uid_t *)attrbufptr)++ = - (cap->ca_uid == UNKNOWNUID) ? console_user : cap->ca_uid; + (cap->ca_uid == UNKNOWNUID) ? p->p_ucred->cr_uid : cap->ca_uid; } if (ATTR_CMN_GRPID & attr) { *((gid_t *)attrbufptr)++ = cap->ca_gid; @@ -1459,7 +1494,8 @@ struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * descp, - struct cat_attr * cattrp) + struct cat_attr * cattrp, + struct proc *p) { attrgroup_t attr = abp->ab_attrlist->dirattr; void *attrbufptr = *abp->ab_attrbufpp; @@ -1470,7 +1506,7 @@ u_long entries = cattrp->ca_entries; if (descp->cd_parentcnid == kRootParID) { - if (hfsmp->hfs_private_metadata_dir != 0) + if (hfsmp->hfs_privdir_desc.cd_cnid != 0) --entries; /* hide private dir */ if (hfsmp->jnl) entries -= 2; /* hide the journal files */ @@ -1493,7 +1529,8 @@ struct hfsmount *hfsmp, struct cat_attr *cattrp, struct cat_fork *datafork, - struct cat_fork *rsrcfork) + struct cat_fork *rsrcfork, + struct proc *p) { attrgroup_t attr = abp->ab_attrlist->fileattr; void *attrbufptr = *abp->ab_attrbufpp; @@ -1517,7 +1554,7 @@ *((u_long *)attrbufptr)++ = hfsmp->hfs_logBlockSize; } if (ATTR_FILE_CLUMPSIZE & attr) { - *((u_long *)attrbufptr)++ = datafork->cf_clump; /* XXX ambiguity */ + *((u_long *)attrbufptr)++ = HFSTOVCB(hfsmp)->vcbClpSiz; } if (ATTR_FILE_DEVTYPE & attr) { if (S_ISBLK(cattrp->ca_mode) || S_ISCHR(cattrp->ca_mode)) @@ -1870,7 +1907,7 @@ int i; if (obj_uid == UNKNOWNUID) - obj_uid = console_user; + obj_uid = p->p_ucred->cr_uid; /* User id 0 (root) always gets access. 
*/ if (cred->cr_uid == 0) { diff -urN xnu-344.49/bsd/hfs/hfs_attrlist.h xnu-517/bsd/hfs/hfs_attrlist.h --- xnu-344.49/bsd/hfs/hfs_attrlist.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_attrlist.h Sat Oct 25 00:25:25 2003 @@ -64,7 +64,7 @@ extern void hfs_packattrblk(struct attrblock *abp, struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc *descp, struct cat_attr *attrp, - struct cat_fork *datafork, struct cat_fork *rsrcfork); + struct cat_fork *datafork, struct cat_fork *rsrcfork, struct proc *p); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff -urN xnu-344.49/bsd/hfs/hfs_btreeio.c xnu-517/bsd/hfs/hfs_btreeio.c --- xnu-344.49/bsd/hfs/hfs_btreeio.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_btreeio.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -142,6 +142,27 @@ blockPtr->isModified = 1; } +static int +btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp) +{ +#if BYTE_ORDER == LITTLE_ENDIAN + struct vnode *vp = bp->b_vp; + BlockDescriptor block; + + /* Prepare the block pointer */ + block.blockHeader = bp; + block.buffer = bp->b_data; + /* not found in cache ==> came from disk */ + block.blockReadFromDisk = (bp->b_flags & B_CACHE) == 0; + block.blockSize = bp->b_bcount; + + // XXXdbg have to swap the data before it goes in the journal + SWAP_BT_NODE (&block, ISHFSPLUS (VTOVCB(vp)), VTOC(vp)->c_fileid, 1); +#endif + + return journal_modify_block_end(hfsmp->jnl, bp); +} + __private_extern__ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options) @@ -171,7 +192,8 @@ if (blockPtr->isModified == 0) { panic("hfs: releaseblock: modified is 0 but forcewrite set! 
bp 0x%x\n", bp); } - retval = journal_modify_block_end(hfsmp->jnl, bp); + + retval = btree_journal_modify_block_end(hfsmp, bp); blockPtr->isModified = 0; } else { retval = VOP_BWRITE(bp); @@ -206,7 +228,7 @@ if (blockPtr->isModified == 0) { panic("hfs: releaseblock: modified is 0 but markdirty set! bp 0x%x\n", bp); } - retval = journal_modify_block_end(hfsmp->jnl, bp); + retval = btree_journal_modify_block_end(hfsmp, bp); blockPtr->isModified = 0; } else if (bdwrite_internal(bp, 1) != 0) { hfs_btsync(vp, 0); @@ -226,7 +248,7 @@ // // journal_modify_block_abort(hfsmp->jnl, bp); //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp); - journal_modify_block_end(hfsmp->jnl, bp); + btree_journal_modify_block_end(hfsmp, bp); blockPtr->isModified = 0; } else { brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ @@ -311,7 +333,9 @@ // is at least the node size then we break out of the loop and let // the error propagate back up. do { - retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, kEFContigMask, &actualBytesAdded); + retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, + kEFContigMask | kEFMetadataMask, + &actualBytesAdded); if (retval == dskFulErr && actualBytesAdded == 0) { if (bytesToAdd == btInfo.nodeSize || bytesToAdd < (minEOF - origSize)) { @@ -336,6 +360,7 @@ * there's plenty of room to grow. 
*/ if ((retval == 0) && + ((VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) == 0) && (vcb->nextAllocation > startAllocation) && ((vcb->nextAllocation + fileblocks) < vcb->totalBlocks)) { vcb->nextAllocation += fileblocks; @@ -418,6 +443,11 @@ ) { MarkVCBDirty( vcb ); ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH); + } else { + struct timeval tv = time; + + VTOC(vp)->c_flag |= C_CHANGE | C_UPDATE; + (void) VOP_UPDATE(vp, &tv, &tv, MNT_WAIT); } ret = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded); diff -urN xnu-344.49/bsd/hfs/hfs_catalog.c xnu-517/bsd/hfs/hfs_catalog.c --- xnu-344.49/bsd/hfs/hfs_catalog.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_catalog.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -89,6 +89,8 @@ int resolvelink(struct hfsmount *hfsmp, u_long linkref, struct HFSPlusCatalogFile *recp); +static int resolvelinkid(struct hfsmount *hfsmp, u_long linkref, ino_t *ino); + static int getkey(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key); static int buildkey(struct hfsmount *hfsmp, struct cat_desc *descp, @@ -118,8 +120,46 @@ static int buildthread(void *keyp, void *recp, int std_hfs, int directory); +__private_extern__ +int +cat_preflight(struct hfsmount *hfsmp, catops_t ops, cat_cookie_t *cookie, struct proc *p) +{ + FCB *fcb; + int result; + + fcb = GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); + + /* Lock catalog b-tree */ + result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); + if (result) + return (result); + + result = BTReserveSpace(fcb, ops, (void*)cookie); + + /* Unlock catalog b-tree */ + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + + return MacToVFSError(result); +} + +__private_extern__ +void +cat_postflight(struct hfsmount *hfsmp, cat_cookie_t *cookie,
struct proc *p) +{ + FCB *fcb; + int error; + + fcb = GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); + + error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); + (void) BTReleaseReserve(fcb, (void*)cookie); + if (error == 0) { + hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + } +} + - +__private_extern__ void cat_convertattr( struct hfsmount *hfsmp, @@ -145,11 +185,39 @@ promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile, 0, datafp); promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile, 1, rsrcfp); } else { - bcopy(&recp->hfsPlusFile.dataFork, datafp, sizeof(*datafp)); - bcopy(&recp->hfsPlusFile.resourceFork, rsrcfp, sizeof(*rsrcfp)); + /* Convert the data fork. */ + datafp->cf_size = recp->hfsPlusFile.dataFork.logicalSize; + datafp->cf_blocks = recp->hfsPlusFile.dataFork.totalBlocks; + if ((hfsmp->hfc_stage == HFC_RECORDING) && + (attrp->ca_atime >= hfsmp->hfc_timebase)) { + datafp->cf_bytesread = + recp->hfsPlusFile.dataFork.clumpSize * + HFSTOVCB(hfsmp)->blockSize; + } else { + datafp->cf_bytesread = 0; + } + datafp->cf_vblocks = 0; + bcopy(&recp->hfsPlusFile.dataFork.extents[0], + &datafp->cf_extents[0], sizeof(HFSPlusExtentRecord)); + + /* Convert the resource fork. 
*/ + rsrcfp->cf_size = recp->hfsPlusFile.resourceFork.logicalSize; + rsrcfp->cf_blocks = recp->hfsPlusFile.resourceFork.totalBlocks; + if ((hfsmp->hfc_stage == HFC_RECORDING) && + (attrp->ca_atime >= hfsmp->hfc_timebase)) { + rsrcfp->cf_bytesread = + recp->hfsPlusFile.resourceFork.clumpSize * + HFSTOVCB(hfsmp)->blockSize; + } else { + rsrcfp->cf_bytesread = 0; + } + rsrcfp->cf_vblocks = 0; + bcopy(&recp->hfsPlusFile.resourceFork.extents[0], + &rsrcfp->cf_extents[0], sizeof(HFSPlusExtentRecord)); } } +__private_extern__ int cat_convertkey( struct hfsmount *hfsmp, @@ -181,6 +249,7 @@ /* * cat_releasedesc */ +__private_extern__ void cat_releasedesc(struct cat_desc *descp) { @@ -195,7 +264,7 @@ descp->cd_nameptr = NULL; descp->cd_namelen = 0; descp->cd_flags &= ~CD_HASBUF; - FREE(name, M_TEMP); + remove_name(name); } descp->cd_nameptr = NULL; descp->cd_namelen = 0; @@ -209,6 +278,7 @@ /* * cat_lookup - lookup a catalog node using a cnode decriptor */ +__private_extern__ int cat_lookup(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, @@ -243,6 +313,7 @@ return (result); } +__private_extern__ int cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp) { @@ -264,11 +335,6 @@ if (result) goto exit; - // XXXdbg - preflight all btree operations to make sure there's enough space - result = BTCheckFreeSpace(fcb); - if (result) - goto exit; - BDINIT(file_data, &file_rec); result = BTSearchRecord(fcb, &iterator[0], &file_data, &datasize, &iterator[0]); if (result) @@ -306,6 +372,7 @@ /* * cat_idlookup - lookup a catalog node using a cnode id */ +__private_extern__ int cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) @@ -473,14 +540,41 @@ } } if (forkp != NULL) { - if (isadir(recp)) + if (isadir(recp)) { bzero(forkp, sizeof(*forkp)); - else if (std_hfs) + } else if (std_hfs) { promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile,
wantrsrc, forkp); - else if (wantrsrc) - bcopy(&recp->hfsPlusFile.resourceFork, forkp, sizeof(*forkp)); - else - bcopy(&recp->hfsPlusFile.dataFork, forkp, sizeof(*forkp)); + } else if (wantrsrc) { + /* Convert the resource fork. */ + forkp->cf_size = recp->hfsPlusFile.resourceFork.logicalSize; + forkp->cf_blocks = recp->hfsPlusFile.resourceFork.totalBlocks; + if ((hfsmp->hfc_stage == HFC_RECORDING) && + (to_bsd_time(recp->hfsPlusFile.accessDate) >= hfsmp->hfc_timebase)) { + forkp->cf_bytesread = + recp->hfsPlusFile.resourceFork.clumpSize * + HFSTOVCB(hfsmp)->blockSize; + } else { + forkp->cf_bytesread = 0; + } + forkp->cf_vblocks = 0; + bcopy(&recp->hfsPlusFile.resourceFork.extents[0], + &forkp->cf_extents[0], sizeof(HFSPlusExtentRecord)); + } else { + /* Convert the data fork. */ + forkp->cf_size = recp->hfsPlusFile.dataFork.logicalSize; + forkp->cf_blocks = recp->hfsPlusFile.dataFork.totalBlocks; + if ((hfsmp->hfc_stage == HFC_RECORDING) && + (to_bsd_time(recp->hfsPlusFile.accessDate) >= hfsmp->hfc_timebase)) { + forkp->cf_bytesread = + recp->hfsPlusFile.dataFork.clumpSize * + HFSTOVCB(hfsmp)->blockSize; + } else { + forkp->cf_bytesread = 0; + } + forkp->cf_vblocks = 0; + bcopy(&recp->hfsPlusFile.dataFork.extents[0], + &forkp->cf_extents[0], sizeof(HFSPlusExtentRecord)); + } } if (descp != NULL) { HFSPlusCatalogKey * pluskey = NULL; @@ -508,6 +602,7 @@ /* * cat_create - create a node in the catalog */ +__private_extern__ int cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, struct cat_desc *out_descp) @@ -547,11 +642,6 @@ hfs_setencodingbits(hfsmp, encoding); } - // XXXdbg - preflight all btree operations to make sure there's enough space - result = BTCheckFreeSpace(fcb); - if (result) - goto exit; - /* * Insert the thread record first */ @@ -660,6 +750,7 @@ * 4. BTDeleteRecord(from_thread); * 5. 
BTInsertRecord(to_thread); */ +__private_extern__ int cat_rename ( struct hfsmount * hfsmp, @@ -700,11 +791,6 @@ if ((result = buildkey(hfsmp, to_cdp, (HFSPlusCatalogKey *)&to_iterator->key, 0))) goto exit; - // XXXdbg - preflight all btree operations to make sure there's enough space - result = BTCheckFreeSpace(fcb); - if (result) - goto exit; - to_key = (HFSPlusCatalogKey *)&to_iterator->key; MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); BDINIT(btdata, recp); @@ -753,7 +839,7 @@ if (result) goto exit; - /* Update the text encoding (on disk and in descriptor */ + /* Update the text encoding (on disk and in descriptor) */ if (!std_hfs) { encoding = hfs_pickencoding(to_key->nodeName.unicode, to_key->nodeName.length); @@ -871,6 +957,14 @@ if (std_hfs) { MALLOC(pluskey, HFSPlusCatalogKey *, sizeof(HFSPlusCatalogKey), M_TEMP, M_WAITOK); promotekey(hfsmp, (HFSCatalogKey *)&to_iterator->key, pluskey, &encoding); + + /* Save the real encoding hint in the Finder Info (field 4). */ + if (directory && from_cdp->cd_cnid == kHFSRootFolderID) { + u_long realhint; + + realhint = hfs_pickencoding(pluskey->nodeName.unicode, pluskey->nodeName.length); + vcb->vcbFndrInfo[4] = SET_HFS_TEXT_ENCODING(realhint); + } } else pluskey = (HFSPlusCatalogKey *)&to_iterator->key; @@ -901,6 +995,7 @@ * 2. BTDeleteRecord(thread); * 3. BTUpdateRecord(parent); */ +__private_extern__ int cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp) { @@ -945,11 +1040,6 @@ if (result) goto exit; - // XXXdbg - preflight all btree operations to make sure there's enough space - result = BTCheckFreeSpace(fcb); - if (result) - goto exit; - /* Delete record */ result = BTDeleteRecord(fcb, iterator); if (result) @@ -973,6 +1063,7 @@ * cnode_update - update the catalog node described by descp * using the data from attrp and forkp. 
*/ +__private_extern__ int cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *dataforkp, struct cat_fork *rsrcforkp) @@ -1217,6 +1308,9 @@ file->resourceFork.totalBlocks = forkp->cf_blocks; bcopy(&forkp->cf_extents[0], &file->resourceFork.extents, sizeof(HFSPlusExtentRecord)); + /* Push blocks read to disk */ + file->resourceFork.clumpSize = + howmany(forkp->cf_bytesread, blksize); } if (state->s_datafork) { forkp = state->s_datafork; @@ -1224,6 +1318,9 @@ file->dataFork.totalBlocks = forkp->cf_blocks; bcopy(&forkp->cf_extents[0], &file->dataFork.extents, sizeof(HFSPlusExtentRecord)); + /* Push blocks read to disk */ + file->dataFork.clumpSize = + howmany(forkp->cf_bytesread, blksize); } if ((file->resourceFork.extents[0].startBlock != 0) && @@ -1295,7 +1392,7 @@ /* Hide the private meta data directory and journal files */ if (parentcnid == kRootDirID) { if ((rec->recordType == kHFSPlusFolderRecord) && - (rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir)) { + (rec->hfsPlusFolder.folderID == hfsmp->hfs_privdir_desc.cd_cnid)) { return (1); /* continue */ } if (hfsmp->jnl && @@ -1355,6 +1452,7 @@ /* * Note: index is zero relative */ +__private_extern__ int cat_getentriesattr(struct hfsmount *hfsmp, struct cat_desc *prevdesc, int index, struct cat_entrylist *ce_list) @@ -1463,6 +1561,10 @@ return MacToVFSError(result); } +struct linkinfo { + u_long link_ref; + void * dirent_addr; +}; struct read_state { u_int32_t cbs_parentID; @@ -1472,20 +1574,40 @@ off_t cbs_lastoffset; struct uio * cbs_uio; ExtendedVCB * cbs_vcb; - int16_t cbs_hfsPlus; + int8_t cbs_hfsPlus; + int8_t cbs_case_sensitive; int16_t cbs_result; + int32_t cbs_numresults; + u_long *cbs_cookies; + int32_t cbs_ncookies; + int32_t cbs_nlinks; + int32_t cbs_maxlinks; + struct linkinfo *cbs_linkinfo; +}; + +/* Map file mode type to directory entry types */ +u_char modetodirtype[16] = { + DT_REG, DT_FIFO, DT_CHR, DT_UNKNOWN, + DT_DIR, DT_UNKNOWN,
DT_BLK, DT_UNKNOWN, + DT_REG, DT_UNKNOWN, DT_LNK, DT_UNKNOWN, + DT_SOCK, DT_UNKNOWN, DT_WHT, DT_UNKNOWN }; +#define MODE_TO_DT(mode) (modetodirtype[((mode) & S_IFMT) >> 12]) static int catrec_read(const CatalogKey *ckp, const CatalogRecord *crp, u_int16_t recordLen, struct read_state *state) { + struct hfsmount *hfsmp; CatalogName *cnp; size_t utf8chars; u_int32_t curID; OSErr result; struct dirent catent; + time_t itime; + u_long ilinkref = 0; + void * uiobase; if (state->cbs_hfsPlus) curID = ckp->hfsPlus.parentID; @@ -1529,7 +1651,18 @@ catent.d_fileno = crp->hfsPlusFolder.folderID; break; case kHFSPlusFileRecord: - catent.d_type = DT_REG; + itime = to_bsd_time(crp->hfsPlusFile.createDate); + hfsmp = VCBTOHFS(state->cbs_vcb); + /* + * When a hardlink link is encountered save its link ref. + */ + if ((SWAP_BE32(crp->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && + (SWAP_BE32(crp->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator) && + ((itime == state->cbs_vcb->vcbCrDate) || + (itime == hfsmp->hfs_metadata_createdate))) { + ilinkref = crp->hfsPlusFile.bsdInfo.special.iNodeNum; + } + catent.d_type = MODE_TO_DT(crp->hfsPlusFile.bsdInfo.fileMode); catent.d_fileno = crp->hfsPlusFile.fileID; break; default: @@ -1575,37 +1708,76 @@ /* hide our private meta data directory */ if (curID == kRootDirID && catent.d_fileno == state->cbs_hiddenDirID && - catent.d_type == DT_DIR) - goto lastitem; + catent.d_type == DT_DIR) { + if (state->cbs_case_sensitive) { + // This is how we skip over these entries. The next + // time we fill in a real item the uio_offset will + // point to the correct place in the "virtual" directory + // so that PositionIterator() will do the right thing + // when scanning to get to a particular position in the + // directory. 
+ state->cbs_uio->uio_offset += catent.d_reclen; + state->cbs_lastoffset = state->cbs_uio->uio_offset; + return (1); /* skip and continue */ + } else + goto lastitem; + } + /* Hide the journal files */ if ((curID == kRootDirID) && (catent.d_type == DT_REG) && ((catent.d_fileno == state->cbs_hiddenJournalID) || (catent.d_fileno == state->cbs_hiddenInfoBlkID))) { + // see comment up above for why this is here + state->cbs_uio->uio_offset += catent.d_reclen; + state->cbs_lastoffset = state->cbs_uio->uio_offset; + return (1); /* skip and continue */ } state->cbs_lastoffset = state->cbs_uio->uio_offset; + uiobase = state->cbs_uio->uio_iov->iov_base; /* if this entry won't fit then we're done */ - if (catent.d_reclen > state->cbs_uio->uio_resid) + if (catent.d_reclen > state->cbs_uio->uio_resid || + (ilinkref != 0 && state->cbs_nlinks == state->cbs_maxlinks) || + (state->cbs_ncookies != 0 && state->cbs_numresults >= state->cbs_ncookies)) return (0); /* stop */ state->cbs_result = uiomove((caddr_t) &catent, catent.d_reclen, state->cbs_uio); + /* + * Record any hard links for post processing. 
+ */ + if ((ilinkref != 0) && + (state->cbs_result == 0) && + (state->cbs_nlinks < state->cbs_maxlinks)) { + state->cbs_linkinfo[state->cbs_nlinks].dirent_addr = uiobase; + state->cbs_linkinfo[state->cbs_nlinks].link_ref = ilinkref; + state->cbs_nlinks++; + } + + if (state->cbs_cookies) { + state->cbs_cookies[state->cbs_numresults++] = state->cbs_uio->uio_offset; + } else { + state->cbs_numresults++; + } + /* continue iteration if there's room */ return (state->cbs_result == 0 && state->cbs_uio->uio_resid >= AVERAGE_HFSDIRENTRY_SIZE); } +#define SMALL_DIRENTRY_SIZE (sizeof(struct dirent) - (MAXNAMLEN + 1) + 8) /* * */ +__private_extern__ int -cat_getdirentries(struct hfsmount *hfsmp, struct cat_desc *descp, - struct uio *uio, int *eofflag) +cat_getdirentries(struct hfsmount *hfsmp, struct cat_desc *descp, int entrycnt, + struct uio *uio, int *eofflag, u_long *cookies, int ncookies) { ExtendedVCB *vcb = HFSTOVCB(hfsmp); BTreeIterator * iterator; @@ -1614,13 +1786,24 @@ u_int16_t op; struct read_state state; u_int32_t dirID = descp->cd_cnid; + void * buffer; + int bufsize; + int maxdirentries; int result; diroffset = uio->uio_offset; *eofflag = 0; + maxdirentries = MIN(entrycnt, uio->uio_resid / SMALL_DIRENTRY_SIZE); - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - bzero(iterator, sizeof(*iterator)); + /* Get a buffer for collecting link info and for a btree iterator */ + bufsize = (maxdirentries * sizeof(struct linkinfo)) + sizeof(*iterator); + MALLOC(buffer, void *, bufsize, M_TEMP, M_WAITOK); + bzero(buffer, bufsize); + + state.cbs_nlinks = 0; + state.cbs_maxlinks = maxdirentries; + state.cbs_linkinfo = (struct linkinfo *) buffer; + iterator = (BTreeIterator *) ((char *)buffer + (maxdirentries * sizeof(struct linkinfo))); /* get an iterator and position it */ cip = GetCatalogIterator(vcb, dirID, diroffset); @@ -1634,7 +1817,7 @@ } else if ((result = MacToVFSError(result))) goto cleanup; - state.cbs_hiddenDirID = 
hfsmp->hfs_private_metadata_dir; + state.cbs_hiddenDirID = hfsmp->hfs_privdir_desc.cd_cnid; if (hfsmp->jnl) { state.cbs_hiddenJournalID = hfsmp->hfs_jnlfileid; state.cbs_hiddenInfoBlkID = hfsmp->hfs_jnlinfoblkid; @@ -1645,16 +1828,58 @@ state.cbs_uio = uio; state.cbs_result = 0; state.cbs_parentID = dirID; + if (diroffset <= 2*sizeof(struct hfsdotentry)) { + state.cbs_numresults = diroffset/sizeof(struct hfsdotentry); + } else { + state.cbs_numresults = 0; + } + state.cbs_cookies = cookies; + state.cbs_ncookies = ncookies; if (vcb->vcbSigWord == kHFSPlusSigWord) state.cbs_hfsPlus = 1; else state.cbs_hfsPlus = 0; + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) + state.cbs_case_sensitive = 1; + else + state.cbs_case_sensitive = 0; + /* process as many entries as possible... */ result = BTIterateRecords(GetFileControlBlock(vcb->catalogRefNum), op, iterator, (IterateCallBackProcPtr)catrec_read, &state); + /* + * Post process any hard links to get the real file id. + */ + if (state.cbs_nlinks > 0) { + struct iovec aiov; + struct uio auio; + u_int32_t fileid; + int i; + u_int32_t tempid; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = uio->uio_segflg; + auio.uio_rw = UIO_READ; /* read kernel memory into user memory */ + auio.uio_procp = uio->uio_procp; + + for (i = 0; i < state.cbs_nlinks; ++i) { + fileid = 0; + + if (resolvelinkid(hfsmp, state.cbs_linkinfo[i].link_ref, &fileid) != 0) + continue; + + /* Update the file id in the user's buffer */ + aiov.iov_base = (char *) state.cbs_linkinfo[i].dirent_addr; + aiov.iov_len = sizeof(fileid); + auio.uio_offset = 0; + auio.uio_resid = aiov.iov_len; + (void) uiomove((caddr_t)&fileid, sizeof(fileid), &auio); + } + } if (state.cbs_result) result = state.cbs_result; else @@ -1679,13 +1904,71 @@ } (void) ReleaseCatalogIterator(cip); - FREE(iterator, M_TEMP); + FREE(buffer, M_TEMP); return (result); } /* + * cat_binarykeycompare - compare two HFS Plus catalog keys. 
+ + * The name portion of the key is compared using a 16-bit binary comparison. + * This is called from the b-tree code. + */ +__private_extern__ +int +cat_binarykeycompare(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) +{ + u_int32_t searchParentID, trialParentID; + int result; + + searchParentID = searchKey->parentID; + trialParentID = trialKey->parentID; + result = 0; + + if (searchParentID > trialParentID) { + ++result; + } else if (searchParentID < trialParentID) { + --result; + } else { + u_int16_t * str1 = &searchKey->nodeName.unicode[0]; + u_int16_t * str2 = &trialKey->nodeName.unicode[0]; + int length1 = searchKey->nodeName.length; + int length2 = trialKey->nodeName.length; + u_int16_t c1, c2; + int length; + + if (length1 < length2) { + length = length1; + --result; + } else if (length1 > length2) { + length = length2; + ++result; + } else { + length = length1; + } + + while (length--) { + c1 = *(str1++); + c2 = *(str2++); + + if (c1 > c2) { + result = 1; + break; + } + if (c1 < c2) { + result = -1; + break; + } + } + } + + return result; +} + + +/* * buildkey - build a Catalog b-tree key from a cnode descriptor */ static int @@ -1766,7 +2049,7 @@ bzero(iterator, sizeof(*iterator)); /* Build a descriptor for private dir. */ - idesc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + idesc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid; idesc.cd_nameptr = inodename; idesc.cd_namelen = strlen(inodename); idesc.cd_flags = 0; @@ -1791,6 +2074,25 @@ } /* + * Resolve hard link reference to obtain the inode number.
+ */ +static int +resolvelinkid(struct hfsmount *hfsmp, u_long linkref, ino_t *ino) +{ + struct HFSPlusCatalogFile record; + int error; + + error = resolvelink(hfsmp, linkref, &record); + if (error == 0) { + if (record.fileID == 0) + error = ENOENT; + else + *ino = record.fileID; + } + return (error); +} + +/* * getkey - get a key from id by doing a thread lookup */ static int @@ -1947,10 +2249,15 @@ char * nameptr; long bufsize; size_t utf8len; + char tmpbuff[128]; /* guess a size... */ bufsize = (3 * key->nodeName.length) + 1; - MALLOC(nameptr, char *, bufsize, M_TEMP, M_WAITOK); + if (bufsize >= sizeof(tmpbuff)-1) { + MALLOC(nameptr, char *, bufsize, M_TEMP, M_WAITOK); + } else { + nameptr = &tmpbuff[0]; + } result = utf8_encodestr(key->nodeName.unicode, key->nodeName.length * sizeof(UniChar), @@ -1970,14 +2277,17 @@ bufsize, ':', 0); } descp->cd_parentcnid = key->parentID; - descp->cd_nameptr = nameptr; + descp->cd_nameptr = add_name(nameptr, utf8len, 0, 0); descp->cd_namelen = utf8len; descp->cd_cnid = cnid; descp->cd_hint = hint; descp->cd_flags = CD_DECOMPOSED | CD_HASBUF; if (isdir) - descp->cd_flags |= CD_ISDIR; + descp->cd_flags |= CD_ISDIR; descp->cd_encoding = encoding; + if (nameptr != &tmpbuff[0]) { + FREE(nameptr, M_TEMP); + } return result; } @@ -2115,6 +2425,8 @@ if (resource) { forkp->cf_size = filep->rsrcLogicalSize; forkp->cf_blocks = filep->rsrcPhysicalSize / blocksize; + forkp->cf_bytesread = 0; + forkp->cf_vblocks = 0; xp[0].startBlock = (u_int32_t)filep->rsrcExtents[0].startBlock; xp[0].blockCount = (u_int32_t)filep->rsrcExtents[0].blockCount; xp[1].startBlock = (u_int32_t)filep->rsrcExtents[1].startBlock; @@ -2124,6 +2436,8 @@ } else { forkp->cf_size = filep->dataLogicalSize; forkp->cf_blocks = filep->dataPhysicalSize / blocksize; + forkp->cf_bytesread = 0; + forkp->cf_vblocks = 0; xp[0].startBlock = (u_int32_t)filep->dataExtents[0].startBlock; xp[0].blockCount = (u_int32_t)filep->dataExtents[0].blockCount; xp[1].startBlock = 
(u_int32_t)filep->dataExtents[1].startBlock; diff -urN xnu-344.49/bsd/hfs/hfs_catalog.h xnu-517/bsd/hfs/hfs_catalog.h --- xnu-344.49/bsd/hfs/hfs_catalog.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_catalog.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -65,6 +65,7 @@ /* cd_flags */ #define CD_HASBUF 0x01 /* allocated filename buffer */ #define CD_DECOMPOSED 0x02 /* name is fully decomposed */ +#define CD_ISMETA 0x40 /* describes a metadata file */ #define CD_ISDIR 0x80 /* describes a directory */ /* @@ -95,15 +96,24 @@ #define ca_entries ca_union.cau_entries /* - * Catalog Node Fork (runtime + on disk) + * Catalog Node Fork (runtime) + * + * NOTE: this is not the same as a struct HFSPlusForkData */ struct cat_fork { - u_int64_t cf_size; /* fork's logical size in bytes */ - u_int32_t cf_clump; /* fork's clump size in bytes */ - u_int32_t cf_blocks; /* total blocks used by this fork */ - struct HFSPlusExtentDescriptor cf_extents[8]; /* initial set of extents */ + u_int64_t cf_size; /* fork's logical size in bytes */ + union { + u_int32_t cfu_clump; /* fork's clump size in bytes (sys files only) */ + u_int64_t cfu_bytesread; /* bytes read from this fork */ + } cf_union; + u_int32_t cf_vblocks; /* virtual (unallocated) blocks */ + u_int32_t cf_blocks; /* total blocks used by this fork */ + struct HFSPlusExtentDescriptor cf_extents[8]; /* initial set of extents */ }; +#define cf_clump cf_union.cfu_clump +#define cf_bytesread cf_union.cfu_bytesread + /* * Catalog Node Entry @@ -132,6 +142,28 @@ }; /* + * Catalog Operations Hint + * + * lower 16 bits: count of B-tree insert operations + * upper 16 bits: count of B-tree delete operations + * + */ +#define CAT_DELETE 0x00020000 +#define CAT_CREATE 0x00000002 +#define CAT_RENAME 0x00020002 +#define CAT_EXCHANGE 0x00020002 + +typedef u_int32_t
catops_t; + +/* + * The size of cat_cookie_t much match the size of + * the nreserve struct (in BTreeNodeReserve.c). + */ +typedef struct cat_cookie_t { + char opaque[24]; +} cat_cookie_t; + +/* * Catalog Interface * * These functions perform a catalog transactions. The @@ -186,12 +218,30 @@ extern int cat_getdirentries( struct hfsmount *hfsmp, struct cat_desc *descp, + int entrycnt, struct uio *uio, - int *eofflag); + int *eofflag, + u_long *cookies, + int ncookies); extern int cat_insertfilethread ( struct hfsmount *hfsmp, struct cat_desc *descp); + +extern int cat_preflight( + struct hfsmount *hfsmp, + catops_t ops, + cat_cookie_t *cookie, + struct proc *p); + +extern void cat_postflight( + struct hfsmount *hfsmp, + cat_cookie_t *cookie, + struct proc *p); + +extern int cat_binarykeycompare( + HFSPlusCatalogKey *searchKey, + HFSPlusCatalogKey *trialKey); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff -urN xnu-344.49/bsd/hfs/hfs_chash.c xnu-517/bsd/hfs/hfs_chash.c --- xnu-344.49/bsd/hfs/hfs_chash.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_chash.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -140,7 +140,7 @@ (void)tsleep((caddr_t)cp, PINOD, "hfs_chashget-2", 0); goto loop; } - if (cp->c_flag & C_NOEXISTS) + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) continue; /* @@ -177,7 +177,7 @@ */ if (wantrsrc && *rvpp == NULL && cp->c_rsrc_vp) { error = vget(cp->c_rsrc_vp, 0, p); - vput(*vpp); /* ref no longer needed */ + vrele(*vpp); /* ref no longer needed */ *vpp = NULL; if (error) goto loop; @@ -185,7 +185,7 @@ } else if (!wantrsrc && *vpp == NULL && cp->c_vp) { error = vget(cp->c_vp, 0, p); - vput(*rvpp); /* ref no longer needed */ + vrele(*rvpp); /* ref no longer needed */ *rvpp = NULL; if (error) goto loop; @@ -205,11 +205,11 @@ void hfs_chashinsert(struct cnode *cp) { - if (cp->c_fileid == 0) - panic("hfs_chashinsert: trying to insert file id 0"); - simple_lock(&hfs_chash_slock); - LIST_INSERT_HEAD(CNODEHASH(cp->c_dev, cp->c_fileid), cp, c_hash); - simple_unlock(&hfs_chash_slock); + if (cp->c_fileid != 0) { + simple_lock(&hfs_chash_slock); + LIST_INSERT_HEAD(CNODEHASH(cp->c_dev, cp->c_fileid), cp, c_hash); + simple_unlock(&hfs_chash_slock); + } } diff -urN xnu-344.49/bsd/hfs/hfs_cnode.c xnu-517/bsd/hfs/hfs_cnode.c --- xnu-344.49/bsd/hfs/hfs_cnode.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_cnode.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -66,6 +66,8 @@ int forkcount = 0; int truncated = 0; int started_tr = 0, grabbed_lock = 0; + cat_cookie_t cookie; + int cat_reserve = 0; if (prtactive && vp->v_usecount != 0) vprint("hfs_inactive: pushing active", vp); @@ -76,7 +78,7 @@ if (cp->c_mode == 0) goto out; - if (vp->v_mount->mnt_flag & MNT_RDONLY) + if (hfsmp->hfs_flags & HFS_READ_ONLY) goto out; if (cp->c_datafork) @@ -85,15 +87,14 @@ ++forkcount; /* If needed, get rid of any fork's data for a deleted file */ - if ((cp->c_flag & C_DELETED) && - vp->v_type == VREG && - (VTOF(vp)->ff_blocks != 0)) { - error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p); - truncated = 1; - // have to do this to prevent the lost ubc_info panic - SET(cp->c_flag, C_TRANSIT); + if ((vp->v_type == VREG) && (cp->c_flag & C_DELETED)) { + if (VTOF(vp)->ff_blocks != 0) { + error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p); + if (error) + goto out; + truncated = 1; + } recycle = 1; - if (error) goto out; } /* @@ -102,13 +103,13 @@ */ if ((cp->c_flag & C_DELETED) && (forkcount <= 1)) { /* - * Mark cnode in transit so that one can get this + * Mark cnode in transit so that no one can get this * cnode from cnode hash. */ SET(cp->c_flag, C_TRANSIT); cp->c_flag &= ~C_DELETED; cp->c_rdev = 0; - + // XXXdbg hfs_global_shared_lock_acquire(hfsmp); grabbed_lock = 1; @@ -120,6 +121,15 @@ started_tr = 1; } + /* + * Reserve some space in the Catalog file. 
+ */ + if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + goto out; + } + cat_reserve = 1; + + /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (error) goto out; @@ -158,18 +168,20 @@ hfs_volupdate(hfsmp, VOL_RMFILE, 0); } - /* Push any defered access times to disk */ - if (cp->c_flag & C_ATIMEMOD) { - cp->c_flag &= ~C_ATIMEMOD; - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord) - cp->c_flag |= C_MODIFIED; - } - if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) { tv = time; + // if the only thing being updated is the access time + // then set the modified bit too so that update will + // flush it to disk. otherwise it'll get dropped. + if ((cp->c_flag & C_CHANGEMASK) == C_ACCESS) { + cp->c_flag |= C_MODIFIED; + } VOP_UPDATE(vp, &tv, &tv, 0); } out: + if (cat_reserve) + cat_postflight(hfsmp, &cookie, p); + // XXXdbg - have to do this because a goto could have come here if (started_tr) { journal_end_transaction(hfsmp->jnl); @@ -211,7 +223,12 @@ if (prtactive && vp->v_usecount != 0) vprint("hfs_reclaim(): pushing active", vp); - devvp = cp->c_devvp; /* For later releasing */ + /* + * Keep track of an inactive hot file. 
+ */ + (void) hfs_addhotfile(vp); + + devvp = cp->c_devvp; /* For later releasing */ /* * Find file fork for this vnode (if any) @@ -224,6 +241,9 @@ } else if ((fp = cp->c_rsrcfork) && (cp->c_rsrc_vp == vp)) { cp->c_rsrcfork = NULL; cp->c_rsrc_vp = NULL; + if (VPARENT(vp) == cp->c_vp) { + cp->c_flag &= ~C_VPREFHELD; + } altfp = cp->c_datafork; } else { cp->c_vp = NULL; @@ -288,7 +308,7 @@ cp->c_desc.cd_nameptr = 0; cp->c_desc.cd_flags &= ~CD_HASBUF; cp->c_desc.cd_namelen = 0; - FREE(nameptr, M_TEMP); + remove_name(nameptr); } CLR(cp->c_flag, (C_ALLOC | C_TRANSIT)); if (ISSET(cp->c_flag, C_WALLOC) || ISSET(cp->c_flag, C_WTRANSIT)) @@ -333,8 +353,8 @@ cp = hfs_chashget(dev, cnid, wantrsrc, &vp, &rvp); if (cp != NULL) { /* hide open files that have been deleted */ - if ((hfsmp->hfs_private_metadata_dir != 0) - && (cp->c_parentcnid == hfsmp->hfs_private_metadata_dir) + if ((hfsmp->hfs_privdir_desc.cd_cnid != 0) + && (cp->c_parentcnid == hfsmp->hfs_privdir_desc.cd_cnid) && (cp->c_nlink == 0)) { retval = ENOENT; goto exit; @@ -393,6 +413,7 @@ cnattr.ca_fileid = kRootParID; cnattr.ca_nlink = 2; + cnattr.ca_entries = 1; cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO); } else { /* Lock catalog b-tree */ @@ -408,8 +429,9 @@ goto exit; /* Hide open files that have been deleted */ - if ((hfsmp->hfs_private_metadata_dir != 0) && - (cndesc.cd_parentcnid == hfsmp->hfs_private_metadata_dir)) { + if ((hfsmp->hfs_privdir_desc.cd_cnid != 0) && + (cndesc.cd_parentcnid == hfsmp->hfs_privdir_desc.cd_cnid) && + (cnattr.ca_nlink == 0)) { cat_releasedesc(&cndesc); retval = ENOENT; goto exit; @@ -426,14 +448,16 @@ && cndesc.cd_namelen > 0) { replace_desc(VTOC(new_vp), &cndesc); } + cat_releasedesc(&cndesc); } + exit: /* Release reference taken on opposite vnode (if any). 
*/ if (vp) - vput(vp); + vrele(vp); else if (rvp) - vput(rvp); + vrele(rvp); if (retval) { *vpp = NULL; @@ -445,7 +469,8 @@ if (vp == NULL) panic("hfs_getcnode: missing vp!"); - UBCINFOCHECK("hfs_getcnode", vp); + if (UBCISVALID(vp)) + UBCINFOCHECK("hfs_getcnode", vp); *vpp = vp; return (0); } @@ -478,12 +503,13 @@ int retval; dev_t dev; struct proc *p = current_proc(); - +#if 0 /* Bail when unmount is in progress */ if (mp->mnt_kern_flag & MNTK_UNMOUNT) { *vpp = NULL; return (EPERM); } +#endif #if !FIFO if (IFTOVT(attrp->ca_mode) == VFIFO) { @@ -502,6 +528,11 @@ SET(cp2->c_flag, C_ALLOC); cp2->c_cnid = descp->cd_cnid; cp2->c_fileid = attrp->ca_fileid; + if (cp2->c_fileid == 0) { + FREE_ZONE(cp2, sizeof(struct cnode), M_HFSNODE); + *vpp = NULL; + return (ENOENT); + } cp2->c_dev = dev; lockinit(&cp2->c_lock, PINOD, "cnode", 0, 0); (void) lockmgr(&cp2->c_lock, LK_EXCLUSIVE, (struct slock *)0, p); @@ -560,9 +591,9 @@ /* Release reference taken on opposite vnode (if any). */ if (rvp) - vput(rvp); + vrele(rvp); if (vp) - vput(vp); + vrele(vp); vp = new_vp; vp->v_ubcinfo = UBC_NOINFO; @@ -604,9 +635,7 @@ bzero(fp, sizeof(struct filefork)); fp->ff_cp = cp; if (forkp) - bcopy(forkp, &fp->ff_data, sizeof(HFSPlusForkData)); - if (fp->ff_clumpsize == 0) - fp->ff_clumpsize = HFSTOVCB(hfsmp)->vcbClpSiz; + bcopy(forkp, &fp->ff_data, sizeof(struct cat_fork)); rl_init(&fp->ff_invalidranges); if (wantrsrc) { if (cp->c_rsrcfork != NULL) @@ -632,7 +661,7 @@ vp->v_type = IFTOVT(cp->c_mode); /* Tag system files */ - if ((descp->cd_cnid < kHFSFirstUserCatalogNodeID) && (vp->v_type == VREG)) + if ((descp->cd_flags & CD_ISMETA) && (vp->v_type == VREG)) vp->v_flag |= VSYSTEM; /* Tag root directory */ if (cp->c_cnid == kRootDirID) @@ -672,6 +701,11 @@ vp->v_op = hfs_fifoop_p; #endif } + + /* + * Stop tracking an active hot file. + */ + (void) hfs_removehotfile(vp); /* Vnode is now initialized - see if anyone was waiting for it. 
*/ CLR(cp->c_flag, C_ALLOC); diff -urN xnu-344.49/bsd/hfs/hfs_cnode.h xnu-517/bsd/hfs/hfs_cnode.h --- xnu-344.49/bsd/hfs/hfs_cnode.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_cnode.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -47,27 +47,30 @@ struct filefork { struct cnode *ff_cp; /* cnode associated with this fork */ struct rl_head ff_invalidranges; /* Areas of disk that should read back as zeroes */ + long ff_evtonly_refs; /* number of vnode references used solely for events (O_EVTONLY) */ union { struct hfslockf *ffu_lockf; /* Head of byte-level lock list. */ void *ffu_sysdata; /* private data for system files */ char *ffu_symlinkptr; /* symbolic link pathname */ } ff_un; struct cat_fork ff_data; - u_int32_t ff_unallocblocks; /* unallocated blocks (until cmap) */ }; +typedef struct filefork filefork_t; /* Aliases for common fields */ #define ff_size ff_data.cf_size #define ff_clumpsize ff_data.cf_clump +#define ff_bytesread ff_data.cf_bytesread #define ff_blocks ff_data.cf_blocks #define ff_extents ff_data.cf_extents +#define ff_unallocblocks ff_data.cf_vblocks + #define ff_symlinkptr ff_un.ffu_symlinkptr #define ff_lockf ff_un.ffu_lockf /* The btree code still needs these... 
*/ #define fcbEOF ff_size -#define fcbClmpSize ff_clumpsize #define fcbExtents ff_extents #define fcbBTCBPtr ff_un.ffu_sysdata @@ -97,13 +100,16 @@ struct vnode *c_devvp; /* vnode for block I/O */ dev_t c_dev; /* cnode's device */ struct dquot *c_dquot[MAXQUOTAS]; /* cnode's quota info */ + struct klist c_knotes; /* knotes attached to this vnode */ cnid_t c_childhint; /* catalog hint for children */ struct cat_desc c_desc; /* cnode's descriptor */ struct cat_attr c_attr; /* cnode's attributes */ SLIST_HEAD(hfs_indexhead, hfs_index) c_indexlist; /* directory index list */ + long c_evtonly_refs; /* number of vnode references used solely for events (O_EVTONLY) */ struct filefork *c_datafork; /* cnode's data fork */ struct filefork *c_rsrcfork; /* cnode's rsrc fork */ }; +typedef struct cnode cnode_t; /* Aliases for common cnode fields */ #define c_cnid c_desc.cd_cnid @@ -131,23 +137,28 @@ /* Runtime cnode flags (kept in c_flag) */ -#define C_ACCESS 0x0001 /* Access time update request */ -#define C_CHANGE 0x0002 /* Change time update request */ -#define C_UPDATE 0x0004 /* Modification time update request */ -#define C_MODIFIED 0x0008 /* CNode has been modified */ -#define C_ATIMEMOD 0x0010 /* Access time has been modified */ - -#define C_NOEXISTS 0x0020 /* CNode has been deleted, catalog entry is gone */ -#define C_DELETED 0x0040 /* CNode has been marked to be deleted */ -#define C_HARDLINK 0x0080 /* CNode is a hard link */ - -#define C_ALLOC 0x0100 /* CNode is being allocated */ -#define C_WALLOC 0x0200 /* Waiting for allocation to finish */ -#define C_TRANSIT 0x0400 /* CNode is getting recycled */ -#define C_WTRANSIT 0x0800 /* Waiting for cnode getting recycled */ +#define C_ACCESS 0x00001 /* Access time update request */ +#define C_CHANGE 0x00002 /* Change time update request */ +#define C_UPDATE 0x00004 /* Modification time update request */ +#define C_MODIFIED 0x00008 /* CNode has been modified */ + +#define C_RELOCATING 0x00010 /* CNode's fork is being relocated 
*/ +#define C_NOEXISTS 0x00020 /* CNode has been deleted, catalog entry is gone */ +#define C_DELETED 0x00040 /* CNode has been marked to be deleted */ +#define C_HARDLINK 0x00080 /* CNode is a hard link */ + +#define C_ALLOC 0x00100 /* CNode is being allocated */ +#define C_WALLOC 0x00200 /* Waiting for allocation to finish */ +#define C_TRANSIT 0x00400 /* CNode is getting recycled */ +#define C_WTRANSIT 0x00800 /* Waiting for cnode getting recycled */ +#define C_NOBLKMAP 0x01000 /* CNode blocks cannot be mapped */ +#define C_WBLKMAP 0x02000 /* Waiting for block map */ + +#define C_ZFWANTSYNC 0x04000 /* fsync requested and file has holes */ +#define C_VPREFHELD 0x08000 /* resource fork has done a vget() on c_vp (for its parent ptr) */ -#define C_RENAME 0x1000 /* CNode is being renamed */ -#define C_ZFWANTSYNC 0x2000 /* fsync requested and file has holes */ +#define C_FROMSYNC 0x10000 /* fsync was called from sync */ +#define C_FORCEUPDATE 0x20000 /* force the catalog entry update */ #define ZFTIMELIMIT (5 * 60) @@ -176,6 +187,8 @@ FTOC(fp)->c_rsrc_vp : \ FTOC(fp)->c_vp) +#define EVTONLYREFS(vp) ((vp->v_type == VREG) ? VTOF(vp)->ff_evtonly_refs : VTOC(vp)->c_evtonly_refs) + /* * Test for a resource fork */ @@ -189,23 +202,18 @@ */ #define C_TIMEMASK (C_ACCESS | C_CHANGE | C_UPDATE) -#define ATIME_ACCURACY 60 +#define C_CHANGEMASK (C_ACCESS | C_CHANGE | C_UPDATE | C_MODIFIED) + +#define ATIME_ACCURACY 1 +#define ATIME_ONDISK_ACCURACY 300 #define CTIMES(cp, t1, t2) { \ if ((cp)->c_flag & C_TIMEMASK) { \ /* \ - * If only the access time is changing then defer \ - * updating it on-disk util later (in hfs_inactive). \ - * If it was recently updated then skip the update. \ + * Only do the update if it is more than just \ + * the C_ACCESS field being updated. 
\ */ \ - if (((cp)->c_flag & (C_TIMEMASK | C_MODIFIED)) == C_ACCESS) { \ - if (((cp)->c_flag & C_ATIMEMOD) || \ - (t1)->tv_sec > ((cp)->c_atime + ATIME_ACCURACY)) { \ - (cp)->c_atime = (t1)->tv_sec; \ - (cp)->c_flag |= C_ATIMEMOD; \ - } \ - (cp)->c_flag &= ~C_ACCESS; \ - } else { \ + if (((cp)->c_flag & C_CHANGEMASK) != C_ACCESS) { \ if ((cp)->c_flag & C_ACCESS) { \ (cp)->c_atime = (t1)->tv_sec; \ } \ diff -urN xnu-344.49/bsd/hfs/hfs_encodinghint.c xnu-517/bsd/hfs/hfs_encodinghint.c --- xnu-344.49/bsd/hfs/hfs_encodinghint.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_encodinghint.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2001-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -34,16 +34,16 @@ #define CJK_CHINESE_SIMP 0x8 #define CJK_ALL 0xF -#define CJK_CHINESE (CJK_CHINESE_TRAD | CJK_CHINESE_SIMP) -#define CJK_KATAKANA (CJK_JAPAN | CJK_CHINESE_SIMP | CJK_KOREAN) +#define CJK_CHINESE (CJK_CHINESE_TRAD | CJK_CHINESE_SIMP) +#define CJK_KATAKANA (CJK_JAPAN) /* Remember the last unique CJK bit */ u_int8_t cjk_lastunique = 0; -/* CJK encoding bias */ +/* Encoding bias */ u_int32_t hfs_encodingbias = 0; - +int hfs_islatinbias = 0; /* Map CJK bits to Mac encoding */ u_int8_t cjk_encoding[] = { @@ -793,6 +793,14 @@ cjkstate = CJK_ALL; continue; } + if (hfs_islatinbias && ch >= 0x0300 && ch <= 0x0329) { + guess = hfs_encodingbias; + continue; + } + if (ch <= 0x03CE && ch >= 0x0384) { + guess = kTextEncodingMacGreek; + continue; + } if (ch <= 0x0491 && ch >= 0x0401) { guess = kTextEncodingMacCyrillic; continue; @@ -806,6 +814,35 @@ if (ch >= 0x0E00 && ch <= 0x0E5B) { return kTextEncodingMacThai; } + /* Catch a few Shift-JIS strays */ + if (guess == 0 || guess == kTextEncodingMacUnicode) { + if (ch == 0x2010 || ch == 0x2014 || ch == 0x2015 || ch == 0x2016) { + guess = kTextEncodingMacJapanese; + if ((cjkstate == 0) || (cjkstate & CJK_JAPAN)) + 
cjkstate = CJK_JAPAN; + else + cjkstate |= CJK_JAPAN; + continue; + } + if ((hfs_encodingbias == kTextEncodingMacJapanese) && + (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00AC)) { + guess = kTextEncodingMacJapanese; + continue; + } + /* TM char depends on the Mac encoding used. */ + if (ch == 0x2122) { + switch(hfs_encodingbias) { + case kTextEncodingMacJapanese: + case kTextEncodingMacChineseTrad: + case kTextEncodingMacKorean: + case kTextEncodingMacGreek: + case kTextEncodingMacThai: + case kTextEncodingMacChineseSimp: + guess = hfs_encodingbias; + break; + } + } + } if (guess == 0 && ch > 0x2122) { guess = kTextEncodingMacUnicode; } @@ -852,4 +889,33 @@ return guess; } + +__private_extern__ +u_int32_t +hfs_getencodingbias() +{ + return (hfs_encodingbias); +} + + +__private_extern__ +void +hfs_setencodingbias(u_int32_t bias) +{ + hfs_encodingbias = bias; + + switch (bias) { + case kTextEncodingMacRoman: + case kTextEncodingMacCentralEurRoman: + case kTextEncodingMacTurkish: + case kTextEncodingMacCroatian: + case kTextEncodingMacIcelandic: + case kTextEncodingMacRomanian: + hfs_islatinbias = 1; + break; + default: + hfs_islatinbias = 0; + break; + } +} diff -urN xnu-344.49/bsd/hfs/hfs_encodings.c xnu-517/bsd/hfs/hfs_encodings.c --- xnu-344.49/bsd/hfs/hfs_encodings.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_encodings.c Sat Oct 25 00:25:25 2003 @@ -55,7 +55,7 @@ #define MAX_HFS_UNICODE_CHARS (15*5) -int mac_roman_to_unicode(Str31 hfs_str, UniChar *uni_str, UInt32 maxCharLen, UInt32 *usedCharLen); +int mac_roman_to_unicode(const Str31 hfs_str, UniChar *uni_str, UInt32 maxCharLen, UInt32 *usedCharLen); static int unicode_to_mac_roman(UniChar *uni_str, UInt32 unicodeChars, Str31 hfs_str); @@ -202,7 +202,7 @@ encp = NULL; simple_unlock(&hfs_encoding_list_slock); - kmod_destroy(host_priv_self(), id); + kmod_destroy((host_priv_t) host_priv_self(), id); simple_lock(&hfs_encoding_list_slock); } break; @@ -614,7 +614,7 @@ * Unicode output is fully decomposed */ int 
-mac_roman_to_unicode(Str31 hfs_str, UniChar *uni_str, +mac_roman_to_unicode(const Str31 hfs_str, UniChar *uni_str, UInt32 maxCharLen, UInt32 *unicodeChars) { const UInt8 *p; diff -urN xnu-344.49/bsd/hfs/hfs_encodings.h xnu-517/bsd/hfs/hfs_encodings.h --- xnu-344.49/bsd/hfs/hfs_encodings.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_encodings.h Sat Oct 25 00:25:25 2003 @@ -32,10 +32,6 @@ #include #ifdef __APPLE_API_UNSTABLE -/* - * Sysctl value for HFS Unicode encoding matching. - */ -#define HFS_ENCODINGBIAS 1 /* encoding matching CJK bias */ #define CTL_HFS_NAMES { \ { 0, 0 }, \ @@ -55,7 +51,7 @@ * encoding conversion routines. */ -typedef int (* hfs_to_unicode_func_t)(Str31 hfs_str, UniChar *uni_str, +typedef int (* hfs_to_unicode_func_t)(const Str31 hfs_str, UniChar *uni_str, UInt32 maxCharLen, UInt32 *usedCharLen); typedef int (* unicode_to_hfs_func_t)(UniChar *uni_str, UInt32 unicodeChars, diff -urN xnu-344.49/bsd/hfs/hfs_endian.c xnu-517/bsd/hfs/hfs_endian.c --- xnu-344.49/bsd/hfs/hfs_endian.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_endian.c Sat Oct 25 00:25:25 2003 @@ -358,9 +358,31 @@ if (unswap) srcPtr[0] = SWAP_BE16 (srcPtr[0]); } + } else if (fileID > kHFSFirstUserCatalogNodeID) { + HotFileKey *srcKey; + UInt32 *srcRec; + + for (i = 0; i < srcDesc->numRecords; i++) { + srcKey = (HotFileKey *)((char *)src->buffer + srcOffs[i]); + + if (!unswap) + srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); + srcRec = (u_int32_t *)((char *)srcKey + srcKey->keyLength + 2); + if (unswap) + srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); + + /* Don't swap srcKey->forkType */ + /* Don't swap srcKey->pad */ + + srcKey->temperature = SWAP_BE32 (srcKey->temperature); + srcKey->fileID = SWAP_BE32 (srcKey->fileID); + + *((UInt32 *)srcRec) = SWAP_BE32 (*((UInt32 *)srcRec)); + } } else { panic ("%s unrecognized B-Tree type", "hfs_swap_BTNode:"); } + return (0); } diff -urN xnu-344.49/bsd/hfs/hfs_endian.h xnu-517/bsd/hfs/hfs_endian.h --- 
xnu-344.49/bsd/hfs/hfs_endian.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_endian.h Sat Oct 25 00:25:25 2003 @@ -51,7 +51,7 @@ /* HFS is always big endian, no swapping needed */ #define SWAP_HFS_PLUS_FORK_DATA(__a) - #define SWAP_BT_NODE(__a, __b, __c) + #define SWAP_BT_NODE(__a, __b, __c, __d) /************************/ /* LITTLE ENDIAN Macros */ diff -urN xnu-344.49/bsd/hfs/hfs_format.h xnu-517/bsd/hfs/hfs_format.h --- xnu-344.49/bsd/hfs/hfs_format.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_format.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -25,6 +25,9 @@ #ifndef __HFS_FORMAT__ #define __HFS_FORMAT__ +#ifndef __HFSVOLUMES__ + +#include #include /* @@ -48,9 +51,11 @@ enum { kHFSSigWord = 0x4244, /* 'BD' in ASCII */ kHFSPlusSigWord = 0x482B, /* 'H+' in ASCII */ - kHFSJSigWord = 0x484a, /* 'HJ' in ASCII */ - kHFSPlusVersion = 0x0004, /* will change as format changes */ - /* version 4 shipped with Mac OS 8.1 */ + kHFSXSigWord = 0x4858, /* 'HX' in ASCII */ + + kHFSPlusVersion = 0x0004, /* 'H+' volumes are version 4 only */ + kHFSXVersion = 0x0005, /* 'HX' volumes start with version 5 */ + kHFSPlusMountVersion = 0x31302E30, /* '10.0' for Mac OS X */ kHFSJMountVersion = 0x4846534a /* 'HFSJ' for journaled HFS+ on OS X */ }; @@ -89,6 +94,7 @@ }; +#ifndef __FILES__ /* Unicode strings are used for HFS Plus file and folder names */ struct HFSUniStr255 { u_int16_t length; /* number of unicode characters */ @@ -96,6 +102,7 @@ }; typedef struct HFSUniStr255 HFSUniStr255; typedef const HFSUniStr255 *ConstHFSUniStr255Param; +#endif /* __FILES__ */ enum { kHFSMaxVolumeNameChars = 27, @@ -228,6 +235,7 @@ kHFSAllocationFileID = 6, /* File ID of the allocation file (HFS Plus only) */ kHFSStartupFileID = 7, /* File ID of the startup file (HFS Plus only) */ kHFSAttributesFileID = 8, /* File ID of the 
attribute file (HFS Plus only) */ + kHFSRepairCatalogFileID = 14, /* Used when rebuilding Catalog B-tree */ kHFSBogusExtentFileID = 15, /* Used for exchanging extents in extents file */ kHFSFirstUserCatalogNodeID = 16 }; @@ -458,7 +466,7 @@ kHFSBootVolumeInconsistentBit = 11, /* boot volume is inconsistent (System 7.6 and later) */ kHFSCatalogNodeIDsReusedBit = 12, kHFSVolumeJournaledBit = 13, /* this volume has a journal on it */ - /* Bit 14 is reserved for future use */ + kHFSVolumeInconsistentBit = 14, /* serious inconsistencies detected at runtime */ kHFSVolumeSoftwareLockBit = 15, /* volume is locked by software */ kHFSVolumeHardwareLockMask = 1 << kHFSVolumeHardwareLockBit, @@ -468,6 +476,7 @@ kHFSBootVolumeInconsistentMask = 1 << kHFSBootVolumeInconsistentBit, kHFSCatalogNodeIDsReusedMask = 1 << kHFSCatalogNodeIDsReusedBit, kHFSVolumeJournaledMask = 1 << kHFSVolumeJournaledBit, + kHFSVolumeInconsistentMask = 1 << kHFSVolumeInconsistentBit, kHFSVolumeSoftwareLockMask = 1 << kHFSVolumeSoftwareLockBit, kHFSMDBAttributesMask = 0x8380 }; @@ -509,6 +518,14 @@ typedef struct HFSMasterDirectoryBlock HFSMasterDirectoryBlock; +#ifdef __APPLE_API_UNSTABLE +#define SET_HFS_TEXT_ENCODING(hint) \ + (0x656e6300 | ((hint) & 0xff)) +#define GET_HFS_TEXT_ENCODING(hint) \ + (((hint) & 0xffffff00) == 0x656e6300 ? (hint) & 0x000000ff : 0xffffffffU) +#endif /* __APPLE_API_UNSTABLE */ + + /* HFS Plus Volume Header - 512 bytes */ /* Stored at sector #2 (3rd sector) and second-to-last sector. 
*/ struct HFSPlusVolumeHeader { @@ -516,7 +533,6 @@ u_int16_t version; /* == kHFSPlusVersion */ u_int32_t attributes; /* volume attributes */ u_int32_t lastMountedVersion; /* implementation version which last mounted volume */ -//XXXdbg u_int32_t reserved; /* reserved - initialized as zero */ u_int32_t journalInfoBlock; /* block addr of journal info (if volume is journaled, zero otherwise) */ u_int32_t createDate; /* date and time of volume creation */ @@ -596,7 +612,7 @@ u_int16_t reserved1; /* unused */ u_int32_t clumpSize; /* reserved */ u_int8_t btreeType; /* reserved */ - u_int8_t reserved2; /* reserved */ + u_int8_t keyCompareType; /* Key string Comparison Type */ u_int32_t attributes; /* persistent attributes about the tree */ u_int32_t reserved3[16]; /* reserved */ }; @@ -609,6 +625,13 @@ kBTVariableIndexKeysMask = 0x00000004 /* keys in index nodes are variable length */ }; + +/* Catalog Key Name Comparison Type */ +enum { + kHFSCaseFolding = 0xCF, /* case folding (case-insensitive) */ + kHFSBinaryCompare = 0xBC, /* binary compare (case-sensitive) */ +}; + /* JournalInfoBlock - Structure that describes where our journal lives */ struct JournalInfoBlock { u_int32_t flags; @@ -631,5 +654,9 @@ #ifdef __cplusplus } #endif + +#else +#warning hfs_format.h is not compatible with HFSVolumes.h (include only one) +#endif /* __HFSVOLUMES__ */ #endif /* __HFS_FORMAT__ */ diff -urN xnu-344.49/bsd/hfs/hfs_hotfiles.c xnu-517/bsd/hfs/hfs_hotfiles.c --- xnu-344.49/bsd/hfs/hfs_hotfiles.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/hfs/hfs_hotfiles.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,2156 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "hfscommon/headers/BTreeScanner.h" + + +#define HFC_DEBUG 0 +#define HFC_VERBOSE 0 + + + +/* + * Hot File List (runtime). + */ +typedef struct hotfileinfo { + u_int32_t hf_fileid; + u_int32_t hf_temperature; + u_int32_t hf_blocks; +} hotfileinfo_t; + +typedef struct hotfilelist { + u_int32_t hfl_magic; + u_int32_t hfl_version; + time_t hfl_duration; /* duration of sample period */ + int hfl_count; /* count of hot files recorded */ + int hfl_next; /* next file to move */ + int hfl_totalblocks; /* total hot file blocks */ + int hfl_reclaimblks; /* blocks to reclaim in HFV */ + u_int32_t hfl_spare[2]; + hotfileinfo_t hfl_hotfile[1]; /* array of hot files */ +} hotfilelist_t; + + +/* + * Hot File Entry (runtime). + */ +typedef struct hotfile_entry { + struct hotfile_entry *left; + struct hotfile_entry *right; + u_int32_t fileid; + u_int32_t temperature; + u_int32_t blocks; +} hotfile_entry_t; + +/* + * Hot File Recording Data (runtime). 
+ */ +typedef struct hotfile_data { + struct hfsmount *hfsmp; + long refcount; + int activefiles; /* active number of hot files */ + u_int32_t threshold; + u_int32_t maxblocks; + hotfile_entry_t *rootentry; + hotfile_entry_t *freelist; + hotfile_entry_t *coldest; + hotfile_entry_t entries[1]; +} hotfile_data_t; + + + +/* + * Hot File Data recording functions (in-memory binary tree). + */ +static void hf_insert (hotfile_data_t *, hotfile_entry_t *); +static void hf_delete (hotfile_data_t *, u_int32_t, u_int32_t); +static hotfile_entry_t * hf_lookup (hotfile_data_t *, u_int32_t, u_int32_t); +static hotfile_entry_t * hf_coldest (hotfile_data_t *); +static hotfile_entry_t * hf_getnewentry (hotfile_data_t *); +static int hf_getsortedlist (hotfile_data_t *, hotfilelist_t *); +static void hf_printtree (hotfile_entry_t *); + +/* + * Hot File misc support functions. + */ +static int hotfiles_collect (struct hfsmount *, struct proc *); +static int hotfiles_age (struct hfsmount *, struct proc *); +static int hotfiles_adopt (struct hfsmount *, struct proc *); +static int hotfiles_evict (struct hfsmount *, struct proc *); +static int hotfiles_refine (struct hfsmount *, struct proc *); +static int hotextents(struct hfsmount *, HFSPlusExtentDescriptor *); + +/* + * Hot File Cluster B-tree (on disk) functions. + */ +static int hfc_btree_create (struct hfsmount *, int, int); +static int hfc_btree_open (struct hfsmount *, struct vnode **); +static int hfc_btree_close (struct hfsmount *, struct vnode *); +static int hfc_comparekeys (HotFileKey *, HotFileKey *); + + +char hfc_tag[] = "CLUSTERED HOT FILES B-TREE "; + + +/* + *======================================================================== + * HOT FILE INTERFACE ROUTINES + *======================================================================== + */ + +/* + * Start recording the hotest files on a file system. 
+ * + */ +__private_extern__ +int +hfs_recording_start(struct hfsmount *hfsmp, struct proc *p) +{ + hotfile_data_t *hotdata; + int maxentries; + size_t size; + int i; + int error; + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || + (hfsmp->jnl == NULL) || + (hfsmp->hfs_flags & HFS_METADATA_ZONE) == 0) { + return (EPERM); + } + if (HFSTOVCB(hfsmp)->freeBlocks < (2 * hfsmp->hfs_hotfile_maxblks)) { + return (ENOSPC); + } + if (hfsmp->hfc_stage != HFC_IDLE) { + return (EBUSY); + } + hfsmp->hfc_stage = HFC_BUSY; + + /* + * Dump previous recording data. + */ + if (hfsmp->hfc_recdata) { + void * tmp; + + tmp = hfsmp->hfc_recdata; + hfsmp->hfc_recdata = NULL; + FREE(tmp, M_TEMP); + } + + /* + * On first startup check for suspended recording. + */ + if (hfsmp->hfc_timebase == 0 && + hfc_btree_open(hfsmp, &hfsmp->hfc_filevp) == 0) { + HotFilesInfo hotfileinfo; + + if ((BTGetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo, + sizeof(hotfileinfo)) == 0) && + (SWAP_BE32 (hotfileinfo.magic) == HFC_MAGIC) && + (SWAP_BE32 (hotfileinfo.timeleft) > 0) && + (SWAP_BE32 (hotfileinfo.timebase) > 0)) { + hfsmp->hfc_maxfiles = SWAP_BE32 (hotfileinfo.maxfilecnt); + hfsmp->hfc_timeout = SWAP_BE32 (hotfileinfo.timeleft) + time.tv_sec ; + hfsmp->hfc_timebase = SWAP_BE32 (hotfileinfo.timebase); +#if HFC_VERBOSE + printf("HFS: resume recording hot files (%d left)\n", SWAP_BE32 (hotfileinfo.timeleft)); +#endif + } else { + hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; + hfsmp->hfc_timebase = time.tv_sec + 1; + hfsmp->hfc_timeout = hfsmp->hfc_timebase + HFC_DEFAULT_DURATION; + } + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } else { + struct cat_attr cattr; + u_int32_t cnid; + + /* + * Make sure a btree file exists. 
+ */ + cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, HFC_FILENAME, &cattr, NULL); + if ((cnid == 0) && + !S_ISREG(cattr.ca_mode) && + (error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT))) { + hfsmp->hfc_stage = HFC_IDLE; + wakeup((caddr_t)&hfsmp->hfc_stage); + return (error); + } +#if HFC_VERBOSE + printf("HFS: begin recording hot files\n"); +#endif + hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; + hfsmp->hfc_timeout = time.tv_sec + HFC_DEFAULT_DURATION; + + /* Reset time base. */ + if (hfsmp->hfc_timebase == 0) { + hfsmp->hfc_timebase = time.tv_sec + 1; + } else { + u_int32_t cumulativebase; + u_int32_t oldbase = hfsmp->hfc_timebase; + + cumulativebase = hfsmp->hfc_timeout - (HFC_CUMULATIVE_CYCLES * HFC_DEFAULT_DURATION); + hfsmp->hfc_timebase = MAX(hfsmp->hfc_timebase, cumulativebase); + } + } + + if ((hfsmp->hfc_maxfiles == 0) || + (hfsmp->hfc_maxfiles > HFC_MAXIMUM_FILE_COUNT)) { + hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; + } + maxentries = hfsmp->hfc_maxfiles; + + size = sizeof(hotfile_data_t) + (maxentries * sizeof(hotfile_entry_t)); + MALLOC(hotdata, hotfile_data_t *, size, M_TEMP, M_WAITOK); + bzero(hotdata, size); + + for (i = 1; i < maxentries ; i++) + hotdata->entries[i-1].right = &hotdata->entries[i]; + + hotdata->freelist = &hotdata->entries[0]; + /* + * Establish minimum temperature and maximum file size. + */ + hotdata->threshold = HFC_MINIMUM_TEMPERATURE; + hotdata->maxblocks = HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize; + hotdata->hfsmp = hfsmp; + + hfsmp->hfc_recdata = hotdata; +out: + hfsmp->hfc_stage = HFC_RECORDING; + wakeup((caddr_t)&hfsmp->hfc_stage); + return (0); +} + +/* + * Stop recording the hotest files on a file system. 
+ *
+ * Returns EPERM unless the stage is HFC_RECORDING.  Otherwise the
+ * active vnodes are collected, the previous B-tree records are aged,
+ * and the sample data is converted into a sorted list stored in
+ * hfc_recdata.  The stage is left at HFC_EVICTION, HFC_ADOPTION or
+ * HFC_IDLE depending on how much hot file space has to be reclaimed.
+ */
+__private_extern__
+int
+hfs_recording_stop(struct hfsmount *hfsmp, struct proc *p)
+{
+	hotfile_data_t *hotdata;
+	hotfilelist_t *listp;
+	size_t size;
+	enum hfc_stage newstage = HFC_IDLE;
+	int error;
+
+
+	if (hfsmp->hfc_stage != HFC_RECORDING)
+		return (EPERM);
+
+	hotfiles_collect(hfsmp, p);
+
+	/* Re-check: the stage may have changed while collecting. */
+	if (hfsmp->hfc_stage != HFC_RECORDING)
+		return (0);
+
+	hfsmp->hfc_stage = HFC_BUSY;
+
+	/*
+	 * Convert hot file data into a simple file id list....
+	 *
+	 * then dump the sample data
+	 */
+#if HFC_VERBOSE
+	printf("HFS: end of hot file recording\n");
+#endif
+	hotdata = (hotfile_data_t *)hfsmp->hfc_recdata;
+	if (hotdata == NULL) {
+		/* Nothing was recorded; do not leave the stage stuck at HFC_BUSY. */
+		hfsmp->hfc_stage = newstage;
+		wakeup((caddr_t)&hfsmp->hfc_stage);
+		return (0);
+	}
+	hfsmp->hfc_recdata = NULL;
+	hfsmp->hfc_stage = HFC_EVALUATION;
+	wakeup((caddr_t)&hfsmp->hfc_stage);
+
+#if HFC_VERBOSE
+	printf("  curentries: %d\n", hotdata->activefiles);
+#endif
+	/*
+	 * If no hot files recorded then we're done.
+	 */
+	if (hotdata->rootentry == NULL) {
+		error = 0;
+		goto out;
+	}
+
+	/* Open the B-tree file for writing... */
+	if (hfsmp->hfc_filevp)
+		panic("hfs_recording_stop: hfc_filevp exists (vp = 0x%08x)", hfsmp->hfc_filevp);
+
+	error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp);
+	if (error) {
+		goto out;
+	}
+
+	/*
+	 * Age the previous set of clustered hot files.
+	 */
+	error = hotfiles_age(hfsmp, p);
+	if (error) {
+		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
+		hfsmp->hfc_filevp = NULL;
+		goto out;
+	}
+
+	/*
+	 * Create a sorted list of hotest files.
+	 */
+	size = sizeof(hotfilelist_t);
+	size += sizeof(hotfileinfo_t) * (hotdata->activefiles - 1);
+	MALLOC(listp, hotfilelist_t *, size, M_TEMP, M_WAITOK);
+	bzero(listp, size);
+
+	hf_getsortedlist(hotdata, listp);
+	listp->hfl_duration = time.tv_sec - hfsmp->hfc_timebase;
+	hfsmp->hfc_recdata = listp;
+
+	/*
+	 * Account for duplicates.
+	 */
+	error = hotfiles_refine(hfsmp, p);
+	if (error) {
+		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
+		hfsmp->hfc_filevp = NULL;
+		goto out;
+	}
+
+	/*
+	 * Compute the amount of space to reclaim...
+	 */
+	if (listp->hfl_totalblocks > hfsmp->hfs_hotfile_freeblks) {
+		listp->hfl_reclaimblks =
+			MIN(listp->hfl_totalblocks, hfsmp->hfs_hotfile_maxblks) -
+			hfsmp->hfs_hotfile_freeblks;
+#if HFC_VERBOSE
+		printf("hfs_recording_stop: need to reclaim %d blocks\n", listp->hfl_reclaimblks);
+#endif
+		if (listp->hfl_reclaimblks)
+			newstage = HFC_EVICTION;
+		else
+			newstage = HFC_ADOPTION;
+	} else {
+		newstage = HFC_ADOPTION;
+	}
+
+	/* Nothing left to adopt: close the B-tree and go straight back to idle. */
+	if (newstage == HFC_ADOPTION && listp->hfl_totalblocks == 0) {
+		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
+		hfsmp->hfc_filevp = NULL;
+		newstage = HFC_IDLE;
+	}
+out:
+#if HFC_VERBOSE
+	if (newstage == HFC_EVICTION)
+		printf("HFS: evicting coldest files\n");
+	else if (newstage == HFC_ADOPTION)
+		printf("HFS: adopting hotest files\n");
+#endif
+	/* The raw sample data is no longer needed on any path. */
+	FREE(hotdata, M_TEMP);
+
+	hfsmp->hfc_stage = newstage;
+	wakeup((caddr_t)&hfsmp->hfc_stage);
+	return (error);
+}
+
+/*
+ * Suspend recording the hotest files on a file system.
+ *
+ * Does nothing unless the stage is HFC_RECORDING.  Otherwise the
+ * recording parameters (time base, time left, threshold, ...) are
+ * written big-endian into the hot file B-tree's user data, the sample
+ * data is released and the stage becomes HFC_DISABLED.  Returns 0 or
+ * the error from opening the B-tree / starting the transaction.
+ */
+__private_extern__
+int
+hfs_recording_suspend(struct hfsmount *hfsmp, struct proc *p)
+{
+	HotFilesInfo hotfileinfo;
+	hotfile_data_t *hotdata;
+	int error;
+
+	if (hfsmp->hfc_stage != HFC_RECORDING)
+		return (0);
+
+	hotdata = (hotfile_data_t *)hfsmp->hfc_recdata;
+	if (hotdata == NULL) {
+		hfsmp->hfc_stage = HFC_DISABLED;
+		return (0);
+	}
+	hfsmp->hfc_stage = HFC_BUSY;
+
+#if HFC_VERBOSE
+	printf("HFS: suspend hot file recording\n");
+#endif
+	/* hfc_btree_open clears hfc_filevp first, so it is NULL on failure. */
+	error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp);
+	if (error) {
+		printf("hfs_recording_suspend: err %d opening btree\n", error);
+		goto out;
+	}
+
+	hfs_global_shared_lock_acquire(hfsmp);
+	if (hfsmp->jnl) {
+		if (journal_start_transaction(hfsmp->jnl) != 0) {
+			hfs_global_shared_lock_release(hfsmp);
+			error = EINVAL;
+			goto out;
+		}
+	}
+	vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p);
+
+	hotfileinfo.magic       = SWAP_BE32 (HFC_MAGIC);
+	hotfileinfo.version     = SWAP_BE32 (HFC_VERSION);
+	hotfileinfo.duration    = SWAP_BE32 (HFC_DEFAULT_DURATION);
+	hotfileinfo.timebase    = SWAP_BE32 (hfsmp->hfc_timebase);
+	hotfileinfo.timeleft    = SWAP_BE32 (hfsmp->hfc_timeout - time.tv_sec);
+	hotfileinfo.threshold   = SWAP_BE32 (hotdata->threshold);
+	hotfileinfo.maxfileblks = SWAP_BE32 (hotdata->maxblocks);
+	hotfileinfo.maxfilecnt  = SWAP_BE32 (HFC_DEFAULT_FILE_COUNT);
+	strcpy(hotfileinfo.tag, hfc_tag);
+	(void) BTSetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo, sizeof(hotfileinfo));
+
+	(void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p);
+	if (hfsmp->jnl) {
+		journal_end_transaction(hfsmp->jnl);
+	}
+	hfs_global_shared_lock_release(hfsmp);
+out:
+	/*
+	 * Close the B-tree on every path, including a failed transaction
+	 * start, so the vnode reference taken above is never leaked.
+	 */
+	if (hfsmp->hfc_filevp) {
+		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
+		hfsmp->hfc_filevp = NULL;
+	}
+	/* Drop the mount's pointer too, so the freed table cannot be freed again. */
+	hfsmp->hfc_recdata = NULL;
+	FREE(hotdata, M_TEMP);
+
+	hfsmp->hfc_stage = HFC_DISABLED;
+	wakeup((caddr_t)&hfsmp->hfc_stage);
+	return (error);
+}
+
+/*
+ * Abort a hot file recording session.
+ *
+ * Frees any recording data and leaves the stage at HFC_DISABLED.
+ * Always returns 0.
+ */
+__private_extern__
+int
+hfs_recording_abort(struct hfsmount *hfsmp, struct proc *p)
+{
+	void * tmp;
+
+	if (hfsmp->hfc_stage == HFC_DISABLED)
+		return (0);
+
+	/*
+	 * Another stage transition is in progress; sleep on the stage
+	 * word (the other routines wakeup() on it when they finish).
+	 * NOTE(review): the stage is not re-checked after waking.
+	 */
+	if (hfsmp->hfc_stage == HFC_BUSY) {
+		(void) tsleep((caddr_t)&hfsmp->hfc_stage, PINOD, "hfs_recording_abort", 0);
+	}
+	hfsmp->hfc_stage = HFC_BUSY;
+
+	printf("HFS: terminate hot file recording\n");
+
+	if (hfsmp->hfc_recdata) {
+		/* Clear the mount's pointer before the memory is released. */
+		tmp = hfsmp->hfc_recdata;
+		hfsmp->hfc_recdata = NULL;
+		FREE(tmp, M_TEMP);
+	}
+	hfsmp->hfc_stage = HFC_DISABLED;
+	wakeup((caddr_t)&hfsmp->hfc_stage);
+	return (0);
+}
+
+/*
+ * Prepare a volume for hot file recording.
+ *
+ * If the hot file B-tree already exists as a regular file, a stage of
+ * HFC_DISABLED is moved to HFC_IDLE and 0 is returned.  Otherwise, on
+ * the root file system only (EPERM elsewhere), the B-tree is created
+ * and the catalog is scanned: every file whose data fork has extents
+ * in the hot file area (per hotextents) gets a hot file record plus a
+ * thread record.  The stage becomes HFC_IDLE when no error occurred.
+ */
+__private_extern__
+int
+hfs_recording_init(struct hfsmount *hfsmp, struct proc *p)
+{
+	CatalogKey * keyp;
+	CatalogRecord * datap;
+	u_int32_t dataSize;
+	HFSPlusCatalogFile *filep;
+	BTScanState scanstate;
+	BTreeIterator * iterator;
+	FSBufferDescriptor record;
+	HotFileKey * key;
+	filefork_t * filefork;
+	u_int32_t data;
+	struct cat_attr cattr;
+	u_int32_t cnid;
+	int error = 0;
+
+	int inserted = 0;  /* debug variables */
+	int filecount = 0;
+
+	/*
+	 * If the Hot File btree exists then metadata zone is ready.
+	 */
+	cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, HFC_FILENAME, &cattr, NULL);
+	if (cnid != 0 && S_ISREG(cattr.ca_mode)) {
+		if (hfsmp->hfc_stage == HFC_DISABLED)
+			hfsmp->hfc_stage = HFC_IDLE;
+		return (0);
+	}
+	/*
+	 * For now, only the boot volume is supported.
+	 */
+	if ((HFSTOVFS(hfsmp)->mnt_flag & MNT_ROOTFS) == 0) {
+		hfsmp->hfs_flags &= ~HFS_METADATA_ZONE;
+		return (EPERM);
+	}
+	error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT);
+	if (error) {
+		return (error);
+	}
+	/*
+	 * Open the Hot File B-tree file for writing.
+	 */
+	if (hfsmp->hfc_filevp)
+		panic("hfs_recording_init: hfc_filevp exists (vp = 0x%08x)", hfsmp->hfc_filevp);
+	error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp);
+	if (error) {
+		return (error);
+	}
+	MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
+	bzero(iterator, sizeof(*iterator));
+	key = (HotFileKey*) &iterator->key;
+	key->keyLength = HFC_KEYLENGTH;
+
+	/* Every record inserted below carries a single 32-bit value taken from "data". */
+	record.bufferAddress = &data;
+	record.itemSize = sizeof(u_int32_t);
+	record.itemCount = 1;
+#if HFC_VERBOSE
+	printf("Evaluating space for \"%s\" metadata zone...\n", HFSTOVCB(hfsmp)->vcbVN);
+#endif
+	/*
+	 * Get ready to scan the Catalog file.
+	 */
+	error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0,
+	                       kCatSearchBufferSize, &scanstate);
+	if (error) {
+		printf("hfs_recording_init: err %d BTScanInit\n", error);
+		goto out2;
+	}
+
+	/*
+	 * The writes to Hot File B-tree file are journaled.
+	 */
+	hfs_global_shared_lock_acquire(hfsmp);
+	if (hfsmp->jnl) {
+		if (journal_start_transaction(hfsmp->jnl) != 0) {
+			hfs_global_shared_lock_release(hfsmp);
+			error = EINVAL;
+			goto out1;
+		}
+	}
+	vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p);
+	filefork = VTOF(hfsmp->hfc_filevp);
+
+	/*
+	 * Visit all the catalog btree leaf records.
+	 */
+	for (;;) {
+		error = BTScanNextRecord(&scanstate, 0, (void **)&keyp, (void **)&datap, &dataSize);
+		if (error) {
+			/* btNotFound marks the normal end of the scan. */
+			if (error == btNotFound)
+				error = 0;
+			else
+				printf("hfs_recording_init: err %d BTScanNext\n", error);
+			break;
+		}
+		/* Only full-sized HFS Plus file records are of interest. */
+		if ((datap->recordType != kHFSPlusFileRecord) ||
+		    (dataSize != sizeof(HFSPlusCatalogFile))) {
+			continue;
+		}
+		filep = (HFSPlusCatalogFile *)datap;
+		filecount++;
+		if (filep->dataFork.totalBlocks == 0) {
+			continue;
+		}
+		/*
+		 * Any file that has blocks inside the hot file
+		 * space is recorded for later eviction.
+		 *
+		 * For now, resource forks are ignored.
+		 */
+		if (!hotextents(hfsmp, &filep->dataFork.extents[0])) {
+			continue;
+		}
+		cnid = filep->fileID;
+
+		/* Skip over journal files. */
+		if (cnid == hfsmp->hfs_jnlfileid || cnid == hfsmp->hfs_jnlinfoblkid) {
+			continue;
+		}
+		/*
+		 * XXX - need to skip quota files as well.
+		 */
+
+		/* Insert a hot file entry. */
+		key->keyLength   = HFC_KEYLENGTH;
+		key->temperature = HFC_MINIMUM_TEMPERATURE;
+		key->fileID      = cnid;
+		key->forkType    = 0;
+		data = 0x3f3f3f3f;	/* placeholder record data ("????" in ASCII) */
+		error = BTInsertRecord(filefork, iterator, &record, sizeof(data));
+		if (error) {
+			printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+			break;
+		}
+
+		/* Insert the corresponding thread record. */
+		key->keyLength = HFC_KEYLENGTH;
+		key->temperature = HFC_LOOKUPTAG;
+		key->fileID = cnid;
+		key->forkType = 0;
+		data = HFC_MINIMUM_TEMPERATURE;	/* thread record data is the file's temperature */
+		error = BTInsertRecord(filefork, iterator, &record, sizeof(data));
+		if (error) {
+			printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+			break;
+		}
+		inserted++;
+	}
+	/* Reached on success and on error alike: flush, unlock and end the transaction. */
+	(void) BTFlushPath(filefork);
+	(void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p);
+
+	if (hfsmp->jnl) {
+		journal_end_transaction(hfsmp->jnl);
+	}
+	hfs_global_shared_lock_release(hfsmp);
+#if HFC_VERBOSE
+	printf("%d files identified out of %d\n", inserted, filecount);
+#endif
+
+out1:
+	(void) BTScanTerminate(&scanstate, &data, &data, &data);
+out2:
+	FREE(iterator, M_TEMP);
+	if (hfsmp->hfc_filevp) {
+		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
+		hfsmp->hfc_filevp = NULL;
+	}
+	if (error == 0)
+		hfsmp->hfc_stage = HFC_IDLE;
+
+	return (error);
+}
+
+/*
+ * Use sync to perform occasional background work.
+ *
+ * Runs one step of the hot file state machine for the current stage,
+ * unless the volume is being unmounted or the stage is zero.  Errors
+ * from the individual steps are ignored; always returns 0.
+ */
+__private_extern__
+int
+hfs_hotfilesync(struct hfsmount *hfsmp, struct proc *p)
+{
+	if ((HFSTOVFS(hfsmp)->mnt_kern_flag & MNTK_UNMOUNT) == 0 && hfsmp->hfc_stage) {
+		switch (hfsmp->hfc_stage) {
+		case HFC_IDLE:
+			(void) hfs_recording_start(hfsmp, p);
+			break;
+
+		case HFC_RECORDING:
+			/* Recording period is over once the timeout has passed. */
+			if (time.tv_sec > hfsmp->hfc_timeout)
+				(void) hfs_recording_stop(hfsmp, p);
+			break;
+
+		case HFC_EVICTION:
+			(void) hotfiles_evict(hfsmp, p);
+			break;
+
+		case HFC_ADOPTION:
+			(void) hotfiles_adopt(hfsmp, p);
+			break;
+
+		default:
+			/* Any other stage: nothing to do on this pass. */
+			break;
+		}
+	}
+	return (0);
+}
+
+/*
+ * Add a hot file to the recording list.
+ *
+ * This can happen when a hot file gets reclaimed or at the
+ * end of the recording period for any active hot file.
+ *
+ * NOTE: Since both the data and resource fork can be hot,
+ * there can be two entries for the same file id.
+ *
+ * The file's temperature is ff_bytesread / ff_size.  Files that were
+ * never read, are empty, are larger than hotdata->maxblocks, are
+ * deleted, have UF_NODUMP set, or were last accessed before the
+ * recording time base are ignored.  Always returns 0.
+ */
+__private_extern__
+int
+hfs_addhotfile(struct vnode *vp)
+{
+	hotfile_data_t *hotdata;
+	hotfile_entry_t *entry;
+	hfsmount_t *hfsmp;
+	cnode_t *cp;
+	filefork_t *ffp;
+	u_int32_t temperature;
+
+	hfsmp = VTOHFS(vp);
+	if (hfsmp->hfc_stage != HFC_RECORDING)
+		return (0);
+
+	/* Only regular files and symlinks that are not system or swap vnodes. */
+	if (!(vp->v_type == VREG || vp->v_type == VLNK) ||
+	     (vp->v_flag & (VSYSTEM | VSWAP))) {
+		return (0);
+	}
+	/* Skip resource forks for now. */
+	if (VNODE_IS_RSRC(vp)) {
+		return (0);
+	}
+	if ((hotdata = (hotfile_data_t *)hfsmp->hfc_recdata) == NULL) {
+		return (0);
+	}
+	ffp = VTOF(vp);
+	cp = VTOC(vp);
+
+	/* ff_size is the divisor below, so a zero-length fork must be rejected here. */
+	if ((ffp->ff_bytesread == 0) ||
+	    (ffp->ff_blocks == 0) ||
+	    (ffp->ff_size == 0) ||
+	    (ffp->ff_blocks > hotdata->maxblocks) ||
+	    (cp->c_flag & (C_DELETED | C_NOEXISTS)) ||
+	    (cp->c_flags & UF_NODUMP) ||
+	    (cp->c_atime < hfsmp->hfc_timebase)) {
+		return (0);
+	}
+
+	temperature = ffp->ff_bytesread / ffp->ff_size;
+	if (temperature < hotdata->threshold) {
+		return (0);
+	}
+	/*
+	 * If there is room or this file is hotter than
+	 * the coldest one then add it to the list.
+	 *
+	 */
+	if ((hotdata->activefiles < hfsmp->hfc_maxfiles) ||
+	    (hotdata->coldest == NULL) ||
+	    (temperature > hotdata->coldest->temperature)) {
+		++hotdata->refcount;
+		entry = hf_getnewentry(hotdata);
+		entry->temperature = temperature;
+		entry->fileid = cp->c_fileid;
+		entry->blocks = ffp->ff_blocks;
+		hf_insert(hotdata, entry);
+		--hotdata->refcount;
+	}
+
+	return (0);
+}
+
+/*
+ * Remove a hot file from the recording list.
+ *
+ * This can happen when a hot file becomes
+ * an active vnode (active hot files are
+ * not kept in the recording list until the
+ * end of the recording period).
+ *
+ * The entry is looked up by file id and by the temperature computed
+ * from the fork's current ff_bytesread / ff_size.  Always returns 0.
+ */
+__private_extern__
+int
+hfs_removehotfile(struct vnode *vp)
+{
+	hotfile_data_t *hotdata;
+	hfsmount_t *hfsmp;
+	cnode_t *cp;
+	filefork_t *ffp;
+	u_int32_t temperature;
+
+	hfsmp = VTOHFS(vp);
+	if (hfsmp->hfc_stage != HFC_RECORDING)
+		return (0);
+
+	if (!(vp->v_type == VREG || vp->v_type == VLNK) ||
+	     (vp->v_flag & (VSYSTEM | VSWAP))) {
+		return (0);
+	}
+	if ((hotdata = (hotfile_data_t *)hfsmp->hfc_recdata) == NULL)
+		return (0);
+
+	ffp = VTOF(vp);
+	cp = VTOC(vp);
+
+	/* ff_size is the divisor below, so a zero-length fork must be rejected here. */
+	if ((ffp->ff_bytesread == 0) || (ffp->ff_blocks == 0) ||
+	    (ffp->ff_size == 0) ||
+	    (cp->c_atime < hfsmp->hfc_timebase)) {
+		return (0);
+	}
+
+	temperature = ffp->ff_bytesread / ffp->ff_size;
+	if (temperature < hotdata->threshold)
+		return (0);
+
+	/* Files colder than the coldest entry cannot be in the list. */
+	if (hotdata->coldest && (temperature >= hotdata->coldest->temperature)) {
+		++hotdata->refcount;
+		hf_delete(hotdata, VTOC(vp)->c_fileid, temperature);
+		--hotdata->refcount;
+	}
+
+	return (0);
+}
+
+
+/*
+ *========================================================================
+ *                     HOT FILE MAINTENANCE ROUTINES
+ *========================================================================
+ */
+
+/*
+ * Add all active hot files to the recording list.
+ *
+ * Walks the mount's vnode list and calls hfs_addhotfile on every
+ * regular file or symlink vnode that can be acquired without
+ * waiting.  Returns 0 in all cases, including when the mount cannot
+ * be marked busy.
+ */
+static int
+hotfiles_collect(struct hfsmount *hfsmp, struct proc *p)
+{
+	struct mount *mp = HFSTOVFS(hfsmp);
+	struct vnode *nvp, *vp;
+	struct cnode *cp;
+	int error;
+
+	if (vfs_busy(mp, LK_NOWAIT, 0, p))
+		return (0);
+loop:
+	simple_lock(&mntvnode_slock);
+	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
+		/* The vnode no longer belongs to this mount; restart the walk. */
+		if (vp->v_mount != mp) {
+			simple_unlock(&mntvnode_slock);
+			goto loop;
+		}
+		simple_lock(&vp->v_interlock);
+		/* Pick up the successor now, before mntvnode_slock is dropped below. */
+		nvp = vp->v_mntvnodes.le_next;
+
+		if ((vp->v_flag & VSYSTEM) ||
+		    !(vp->v_type == VREG || vp->v_type == VLNK)) {
+			simple_unlock(&vp->v_interlock);
+			continue;
+		}
+
+		cp = VTOC(vp);
+		if (cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) {
+			simple_unlock(&vp->v_interlock);
+			continue;
+		}
+
+		simple_unlock(&mntvnode_slock);
+		/* LK_INTERLOCK: the interlock taken above is handed over to vget. */
+		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p);
+		if (error) {
+			if (error == ENOENT)
+				goto loop;
+			/* Re-take the list lock before moving on to the next vnode. */
+			simple_lock(&mntvnode_slock);
+			continue;
+		}
+		(void) hfs_addhotfile(vp);
+		vput(vp);
+
+		simple_lock(&mntvnode_slock);
+	}
+
+	simple_unlock(&mntvnode_slock);
+
+	vfs_unbusy(mp, p);
+
+	return (0);
+}
+
+
+/*
+ * Update the data of a btree record
+ * This is called from within BTUpdateRecord.
+ *
+ * Only thread records (temperature == HFC_LOOKUPTAG) are modified;
+ * their data is overwritten with the value that "state" points to.
+ * Always returns 0.
+ */
+static int
+update_callback(const HotFileKey *key, u_int32_t *data, u_int16_t datalen, u_int32_t *state)
+{
+	if (key->temperature == HFC_LOOKUPTAG)
+		*data = *state;
+	return (0);
+}
+
+/*
+ * Identify files already in hot area.
+ *
+ * For every file in the sorted list (hfc_recdata) that already has a
+ * thread record in the hot file B-tree, the thread record and the
+ * temperature-keyed record are updated to the newly measured
+ * temperature, and the list entry is invalidated (temperature 0) with
+ * its blocks subtracted from hfl_totalblocks.  Returns 0 or an errno.
+ */
+static int
+hotfiles_refine(struct hfsmount *hfsmp, struct proc *p)
+{
+	BTreeIterator * iterator;
+	struct mount *mp;
+	struct vnode *vp;
+	filefork_t * filefork;
+	hotfilelist_t *listp;
+	FSBufferDescriptor record;
+	HotFileKey * key;
+	u_int32_t data;
+	int i;
+	int error = 0;
+
+
+	if ((listp = (hotfilelist_t *)hfsmp->hfc_recdata) == NULL)
+		return (0);
+
+	mp = HFSTOVFS(hfsmp);
+
+	MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
+	bzero(iterator, sizeof(*iterator));
+	key = (HotFileKey*) &iterator->key;
+
+	/* Record data is a single 32-bit value read into / written from "data". */
+	record.bufferAddress = &data;
+	record.itemSize = sizeof(u_int32_t);
+	record.itemCount = 1;
+
+	hfs_global_shared_lock_acquire(hfsmp);
+	if (hfsmp->jnl) {
+		if (journal_start_transaction(hfsmp->jnl) != 0) {
+			hfs_global_shared_lock_release(hfsmp);
+			error = EINVAL;
+			goto out;
+		}
+	}
+	vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p);
+	filefork = VTOF(hfsmp->hfc_filevp);
+
+	for (i = 0; i < listp->hfl_count; ++i) {
+		/*
+		 * Check if entry (thread) is already in hot area.
+		 */
+		key->keyLength = HFC_KEYLENGTH;
+		key->temperature = HFC_LOOKUPTAG;
+		key->fileID = listp->hfl_hotfile[i].hf_fileid;
+		key->forkType = 0;
+		(void) BTInvalidateHint(iterator);
+		/* On success "data" holds the thread record's data, i.e. the old temperature. */
+		if (BTSearchRecord(filefork, iterator, &record, NULL, iterator) != 0) {
+			continue;  /* not in hot area, so skip */
+		}
+
+		/*
+		 * Update thread entry with latest temperature.
+		 */
+		error = BTUpdateRecord(filefork, iterator,
+				(IterateCallBackProcPtr)update_callback,
+				&listp->hfl_hotfile[i].hf_temperature);
+		if (error) {
+			printf("hotfiles_refine: BTUpdateRecord failed %d (file %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+		//	break;
+		}
+		/*
+		 * Re-key entry with latest temperature.
+		 */
+		key->keyLength = HFC_KEYLENGTH;
+		key->temperature = data;	/* old temperature, from the thread lookup above */
+		key->fileID = listp->hfl_hotfile[i].hf_fileid;
+		key->forkType = 0;
+		/* Pick up record data. */
+		(void) BTInvalidateHint(iterator);
+		(void) BTSearchRecord(filefork, iterator, &record, NULL, iterator);
+		error = BTDeleteRecord(filefork, iterator);
+		if (error) {
+			printf("hotfiles_refine: BTDeleteRecord failed %d (file %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+			break;
+		}
+		/* Re-insert the same record data under the new temperature key. */
+		key->keyLength = HFC_KEYLENGTH;
+		key->temperature = listp->hfl_hotfile[i].hf_temperature;
+		key->fileID = listp->hfl_hotfile[i].hf_fileid;
+		key->forkType = 0;
+		error = BTInsertRecord(filefork, iterator, &record, sizeof(data));
+		if (error) {
+			printf("hotfiles_refine: BTInsertRecord failed %d (file %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+			break;
+		}
+
+		/*
+		 * Invalidate this entry in the list.
+		 */
+		listp->hfl_hotfile[i].hf_temperature = 0;
+		listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks;
+
+	} /* end for */
+
+	/* Reached on success and after a break: flush, unlock and end the transaction. */
+	(void) BTFlushPath(filefork);
+	(void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p);
+
+	if (hfsmp->jnl) {
+		journal_end_transaction(hfsmp->jnl);
+	}
+	hfs_global_shared_lock_release(hfsmp);
+out:
+	FREE(iterator, M_TEMP);
+	return (error);
+}
+
+/*
+ * Move new hot files into hot area.
+ *
+ * Processes at most HFC_FILESPERSYNC list entries / HFC_BLKSPERSYNC
+ * blocks per call, starting at listp->hfl_next.  Each adopted file is
+ * relocated to hfs_hotfile_start and gets a hot file record plus a
+ * thread record, all inside its own journal transaction.  The stage
+ * stays HFC_ADOPTION until the list or the free hot file space is
+ * exhausted (or a B-tree insert fails), then becomes HFC_IDLE.
+ * Returns EBUSY if the stage is not HFC_ADOPTION, else 0 or an errno.
+ */
+static int
+hotfiles_adopt(struct hfsmount *hfsmp, struct proc *p)
+{
+	BTreeIterator * iterator;
+	struct mount *mp;
+	struct vnode *vp;
+	filefork_t * filefork;
+	hotfilelist_t *listp;
+	FSBufferDescriptor record;
+	HotFileKey * key;
+	u_int32_t data;
+	enum hfc_stage stage;
+	int fileblocks;
+	int blksmoved;
+	int i;
+	int last;
+	int error = 0;
+	int startedtrans = 0;
+	int aquiredlock = 0;
+
+	if ((listp = (hotfilelist_t *)hfsmp->hfc_recdata) == NULL)
+		return (0);
+
+	if (hfsmp->hfc_stage != HFC_ADOPTION) {
+		return (EBUSY);
+	}
+	stage = hfsmp->hfc_stage;
+	hfsmp->hfc_stage = HFC_BUSY;
+
+	mp = HFSTOVFS(hfsmp);
+	blksmoved = 0;
+	last = listp->hfl_next + HFC_FILESPERSYNC;
+	if (last > listp->hfl_count)
+		last = listp->hfl_count;
+
+	MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
+	bzero(iterator, sizeof(*iterator));
+	key = (HotFileKey*) &iterator->key;
+	key->keyLength = HFC_KEYLENGTH;
+
+	record.bufferAddress = &data;
+	record.itemSize = sizeof(u_int32_t);
+	record.itemCount = 1;
+
+	vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p);
+	filefork = VTOF(hfsmp->hfc_filevp);
+
+	for (i = listp->hfl_next; (i < last) && (blksmoved < HFC_BLKSPERSYNC); ++i) {
+		/*
+		 * Skip invalid entries (already in hot area).
+		 */
+		if (listp->hfl_hotfile[i].hf_temperature == 0) {
+			listp->hfl_next++;
+			continue;
+		}
+		/*
+		 * Acquire a vnode for this file.
+		 */
+		error = VFS_VGET(mp, &listp->hfl_hotfile[i].hf_fileid, &vp);
+		if (error) {
+			if (error == ENOENT) {
+				error = 0;
+				listp->hfl_next++;
+				continue;  /* stale entry, go to next */
+			}
+			break;
+		}
+		if (vp->v_type != VREG && vp->v_type != VLNK) {
+			printf("hotfiles_adopt: huh, not a file %d (%d)\n", listp->hfl_hotfile[i].hf_fileid, VTOC(vp)->c_cnid);
+			vput(vp);
+			/* Invalidate the entry (this was a no-op "==" comparison). */
+			listp->hfl_hotfile[i].hf_temperature = 0;
+			listp->hfl_next++;
+			continue;  /* stale entry, go to next */
+		}
+		if (hotextents(hfsmp, &VTOF(vp)->ff_extents[0])) {
+			vput(vp);
+			/* Invalidate the entry (this was a no-op "==" comparison). */
+			listp->hfl_hotfile[i].hf_temperature = 0;
+			listp->hfl_next++;
+			listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks;
+			continue;  /* stale entry, go to next */
+		}
+		fileblocks = VTOF(vp)->ff_blocks;
+		if (fileblocks > hfsmp->hfs_hotfile_freeblks) {
+			vput(vp);
+			listp->hfl_next++;
+			listp->hfl_totalblocks -= fileblocks;
+			continue;  /* entry too big, go to next */
+		}
+
+		/* Stop when this file would exceed the per-sync block budget. */
+		if ((blksmoved > 0) &&
+		    (blksmoved + fileblocks) > HFC_BLKSPERSYNC) {
+			vput(vp);
+			break;
+		}
+		/* Start a new transaction. */
+		hfs_global_shared_lock_acquire(hfsmp);
+		aquiredlock = 1;
+		if (hfsmp->jnl) {
+			if (journal_start_transaction(hfsmp->jnl) != 0) {
+				error = EINVAL;
+				vput(vp);
+				break;
+			}
+			startedtrans = 1;
+		}
+
+		error = hfs_relocate(vp, hfsmp->hfs_hotfile_start, p->p_ucred, p);
+		/*
+		 * Pick up the hot file record data (the first four bytes
+		 * of the file name) while the vnode is still held; it
+		 * must not be touched after the vput below.
+		 */
+		if (error == 0 && VTOC(vp)->c_desc.cd_nameptr)
+			data = *(u_int32_t *)(VTOC(vp)->c_desc.cd_nameptr);
+		else
+			data = 0x3f3f3f3f;
+		vput(vp);
+		if (error)
+			break;
+
+		/* Keep hot file free space current. */
+		hfsmp->hfs_hotfile_freeblks -= fileblocks;
+		listp->hfl_totalblocks -= fileblocks;
+
+		/* Insert hot file entry */
+		key->keyLength   = HFC_KEYLENGTH;
+		key->temperature = listp->hfl_hotfile[i].hf_temperature;
+		key->fileID      = listp->hfl_hotfile[i].hf_fileid;
+		key->forkType    = 0;
+
+		error = BTInsertRecord(filefork, iterator, &record, sizeof(data));
+		if (error) {
+			printf("hotfiles_adopt: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+			stage = HFC_IDLE;
+			break;
+		}
+
+		/* Insert thread record */
+		key->keyLength = HFC_KEYLENGTH;
+		key->temperature = HFC_LOOKUPTAG;
+		key->fileID = listp->hfl_hotfile[i].hf_fileid;
+		key->forkType = 0;
+		data = listp->hfl_hotfile[i].hf_temperature;
+		error = BTInsertRecord(filefork, iterator, &record, sizeof(data));
+		if (error) {
+			printf("hotfiles_adopt: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+			stage = HFC_IDLE;
+			break;
+		}
+		(void) BTFlushPath(filefork);
+
+		/* Transaction complete. */
+		if (startedtrans) {
+			journal_end_transaction(hfsmp->jnl);
+			startedtrans = 0;
+		}
+		hfs_global_shared_lock_release(hfsmp);
+		aquiredlock = 0;
+
+		blksmoved += fileblocks;
+		listp->hfl_next++;
+		if (listp->hfl_next >= listp->hfl_count) {
+			break;
+		}
+		if (hfsmp->hfs_hotfile_freeblks <= 0) {
+#if HFC_VERBOSE
+			printf("hotfiles_adopt: free space exhausted (%d)\n", hfsmp->hfs_hotfile_freeblks);
+#endif
+			break;
+		}
+	} /* end for */
+
+#if HFC_VERBOSE
+	printf("hotfiles_adopt: [%d] adopted %d blocks (%d left)\n", listp->hfl_next, blksmoved, listp->hfl_totalblocks);
+#endif
+	/* Finish any outstanding transactions. */
+	if (startedtrans) {
+		(void) BTFlushPath(filefork);
+		journal_end_transaction(hfsmp->jnl);
+		startedtrans = 0;
+	}
+	if (aquiredlock) {
+		hfs_global_shared_lock_release(hfsmp);
+		aquiredlock = 0;
+	}
+	(void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p);
+
+	if ((listp->hfl_next >= listp->hfl_count) || (hfsmp->hfs_hotfile_freeblks <= 0)) {
+#if HFC_VERBOSE
+		printf("hotfiles_adopt: all done relocating %d files\n", listp->hfl_count);
+		printf("hotfiles_adopt: %d blocks free in hot file band\n", hfsmp->hfs_hotfile_freeblks);
+#endif
+		stage = HFC_IDLE;
+	}
+	FREE(iterator, M_TEMP);
+
+	/* The B-tree stays open only while more adoption passes are pending. */
+	if (stage != HFC_ADOPTION && hfsmp->hfc_filevp) {
+		(void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp);
+		hfsmp->hfc_filevp = NULL;
+	}
+	hfsmp->hfc_stage = stage;
+	wakeup((caddr_t)&hfsmp->hfc_stage);
+	return (error);
+}
+
+/*
+ * Reclaim space by evicting the coldest files.
+ *
+ * Repeatedly takes the first (coldest) record of the hot file B-tree,
+ * relocates that file to the volume's nextAllocation and deletes its
+ * hot file and thread records, until hfl_reclaimblks is satisfied or
+ * the per-sync limits (HFC_BLKSPERSYNC / HFC_FILESPERSYNC) are hit.
+ * The stage advances to HFC_ADOPTION once nothing is left to reclaim.
+ * Returns EBUSY if the stage is not HFC_EVICTION, else 0 or an errno.
+ */
+static int
+hotfiles_evict(struct hfsmount *hfsmp, struct proc *p)
+{
+	BTreeIterator * iterator;
+	struct mount *mp;
+	struct vnode *vp;
+	HotFileKey * key;
+	filefork_t * filefork;
+	hotfilelist_t *listp;
+	enum hfc_stage stage;
+	int blksmoved;
+	int filesmoved;
+	int fileblocks;
+	int error = 0;
+	int startedtrans = 0;
+	int aquiredlock = 0;
+
+	if (hfsmp->hfc_stage != HFC_EVICTION) {
+		return (EBUSY);
+	}
+
+	if ((listp = (hotfilelist_t *)hfsmp->hfc_recdata) == NULL)
+		return (0);
+
+	stage = hfsmp->hfc_stage;
+	hfsmp->hfc_stage = HFC_BUSY;
+
+	mp = HFSTOVFS(hfsmp);
+	filesmoved = blksmoved = 0;
+
+	MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
+	bzero(iterator, sizeof(*iterator));
+	key = (HotFileKey*) &iterator->key;
+
+	vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p);
+	filefork = VTOF(hfsmp->hfc_filevp);
+
+	while (listp->hfl_reclaimblks > 0 &&
+	       blksmoved < HFC_BLKSPERSYNC &&
+	       filesmoved < HFC_FILESPERSYNC) {
+
+		/*
+		 * Obtain the first record (ie the coldest one).
+		 */
+		if (BTIterateRecord(filefork, kBTreeFirstRecord, iterator, NULL, NULL) != 0) {
+#if HFC_VERBOSE
+			printf("hotfiles_evict: no more records\n");
+#endif
+			error = 0;
+			stage = HFC_ADOPTION;
+			break;
+		}
+		if (key->keyLength != HFC_KEYLENGTH) {
+			printf("hotfiles_evict: invalid key length %d\n", key->keyLength);
+			error = EFTYPE;
+			break;
+		}
+		if (key->temperature == HFC_LOOKUPTAG) {
+#if HFC_VERBOSE
+			printf("hotfiles_evict: ran into thread records\n");
+#endif
+			error = 0;
+			stage = HFC_ADOPTION;
+			break;
+		}
+		/*
+		 * Acquire the vnode for this file.  The result is examined
+		 * only after the transaction has been started, because a
+		 * stale entry is removed from the B-tree inside it.
+		 */
+		error = VFS_VGET(mp, &key->fileID, &vp);
+
+		/* Start a new transaction. */
+		hfs_global_shared_lock_acquire(hfsmp);
+		aquiredlock = 1;
+		if (hfsmp->jnl) {
+			if (journal_start_transaction(hfsmp->jnl) != 0) {
+				if (error == 0)
+					vput(vp);
+				error = EINVAL;
+				break;
+			}
+			startedtrans = 1;
+		}
+		if (error) {
+			if (error == ENOENT) {
+				(void) BTDeleteRecord(filefork, iterator);
+				key->temperature = HFC_LOOKUPTAG;
+				(void) BTDeleteRecord(filefork, iterator);
+				goto next;  /* stale entry, go to next */
+			} else {
+				printf("hotfiles_evict: err %d getting file %d\n",
+				       error, key->fileID);
+			}
+			break;
+		}
+		if (vp->v_type != VREG && vp->v_type != VLNK) {
+			printf("hotfiles_evict: huh, not a file %d\n", key->fileID);
+			vput(vp);
+			(void) BTDeleteRecord(filefork, iterator);
+			key->temperature = HFC_LOOKUPTAG;
+			(void) BTDeleteRecord(filefork, iterator);
+			goto next;  /* invalid entry, go to next */
+		}
+		fileblocks = VTOF(vp)->ff_blocks;
+		if ((blksmoved > 0) &&
+		    (blksmoved + fileblocks) > HFC_BLKSPERSYNC) {
+			vput(vp);
+			break;
+		}
+		/*
+		 * Make sure file is in the hot area.
+		 */
+		if (!hotextents(hfsmp, &VTOF(vp)->ff_extents[0])) {
+#if HFC_VERBOSE
+			printf("hotfiles_evict: file %d isn't hot!\n", key->fileID);
+#endif
+			vput(vp);
+			(void) BTDeleteRecord(filefork, iterator);
+			key->temperature = HFC_LOOKUPTAG;
+			(void) BTDeleteRecord(filefork, iterator);
+			goto next;  /* go to next */
+		}
+
+		/*
+		 * Relocate file out of hot area.
+		 */
+		error = hfs_relocate(vp, HFSTOVCB(hfsmp)->nextAllocation, p->p_ucred, p);
+		if (error) {
+			/* XXX skip to next record here! */
+			printf("hotfiles_evict: err %d relocating file %d\n", error, key->fileID);
+			vput(vp);
+			break;
+		}
+		(void) VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p);
+
+		vput(vp);
+
+		hfsmp->hfs_hotfile_freeblks += fileblocks;
+		listp->hfl_reclaimblks -= fileblocks;
+		if (listp->hfl_reclaimblks < 0)
+			listp->hfl_reclaimblks = 0;
+		blksmoved += fileblocks;
+		filesmoved++;
+
+		/* Remove the hot file record, then its thread record. */
+		error = BTDeleteRecord(filefork, iterator);
+		if (error) {
+			printf("hotfiles_evict: BTDeleteRecord failed %d (fileid %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+			break;
+		}
+		key->temperature = HFC_LOOKUPTAG;
+		error = BTDeleteRecord(filefork, iterator);
+		if (error) {
+			printf("hotfiles_evict: BTDeleteRecord thread failed %d (fileid %d)\n", error, key->fileID);
+			error = MacToVFSError(error);
+			break;
+		}
+next:
+		(void) BTFlushPath(filefork);
+
+		/* Transaction complete. */
+		if (startedtrans) {
+			journal_end_transaction(hfsmp->jnl);
+			startedtrans = 0;
+		}
+		hfs_global_shared_lock_release(hfsmp);
+		aquiredlock = 0;
+
+	} /* end while */
+
+#if HFC_VERBOSE
+	printf("hotfiles_evict: moved %d files (%d blks, %d to go)\n", filesmoved, blksmoved, listp->hfl_reclaimblks);
+#endif
+	/* Finish any outstanding transactions. */
+	if (startedtrans) {
+		(void) BTFlushPath(filefork);
+		journal_end_transaction(hfsmp->jnl);
+		startedtrans = 0;
+	}
+	if (aquiredlock) {
+		hfs_global_shared_lock_release(hfsmp);
+		aquiredlock = 0;
+	}
+	(void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p);
+
+	/*
+	 * Move to next stage when finished.
+	 */
+	if (listp->hfl_reclaimblks <= 0) {
+		stage = HFC_ADOPTION;
+#if HFC_VERBOSE
+		printf("hotfiles_evict: %d blocks free in hot file band\n", hfsmp->hfs_hotfile_freeblks);
+#endif
+	}
+	FREE(iterator, M_TEMP);
+	hfsmp->hfc_stage = stage;
+	wakeup((caddr_t)&hfsmp->hfc_stage);
+	return (error);
+}
+
+/*
+ * Age the existing records in the hot files b-tree.
+ *
+ * Walks the first half of the leaf records (the temperature-keyed
+ * ones), re-keying each with half its temperature (minimum 4) and
+ * updating the matching thread record.  Two iterators are used so
+ * that the next record is located before the previous one is deleted
+ * and re-inserted.  Returns 0 or an errno.
+ */
+static int
+hotfiles_age(struct hfsmount *hfsmp, struct proc *p)
+{
+	BTreeInfoRec btinfo;
+	BTreeIterator * iterator;
+	BTreeIterator * prev_iterator;
+	FSBufferDescriptor record;
+	FSBufferDescriptor prev_record;
+	HotFileKey * key;
+	HotFileKey * prev_key;
+	filefork_t * filefork;
+	u_int32_t data;
+	u_int32_t prev_data;
+	u_int32_t newtemp;
+	int error;
+	int i;
+	int numrecs;
+	int aged = 0;
+	u_int16_t reclen;
+
+
+	/* One allocation holds both iterators: [0] current, [1] previous. */
+	MALLOC(iterator, BTreeIterator *, 2 * sizeof(*iterator), M_TEMP, M_WAITOK);
+	bzero(iterator, 2 * sizeof(*iterator));
+	key = (HotFileKey*) &iterator->key;
+
+	prev_iterator = &iterator[1];
+	prev_key = (HotFileKey*) &prev_iterator->key;
+
+	record.bufferAddress = &data;
+	record.itemSize = sizeof(data);
+	record.itemCount = 1;
+	prev_record.bufferAddress = &prev_data;
+	prev_record.itemSize = sizeof(prev_data);
+	prev_record.itemCount = 1;
+
+	/*
+	 * Capture b-tree changes inside a transaction
+	 */
+	hfs_global_shared_lock_acquire(hfsmp);
+	if (hfsmp->jnl) {
+		if (journal_start_transaction(hfsmp->jnl) != 0) {
+			hfs_global_shared_lock_release(hfsmp);
+			error = EINVAL;
+			goto out2;
+		}
+	}
+	vn_lock(hfsmp->hfc_filevp, LK_EXCLUSIVE | LK_RETRY, p);
+	filefork = VTOF(hfsmp->hfc_filevp);
+
+	error = BTGetInformation(filefork, 0, &btinfo);
+	if (error) {
+		error = MacToVFSError(error);
+		goto out;
+	}
+	if (btinfo.numRecords < 2) {
+		error = 0;
+		goto out;
+	}
+
+	/* Only want 1st half of leaf records */
+	numrecs = (btinfo.numRecords /= 2) - 1;
+
+	error = BTIterateRecord(filefork, kBTreeFirstRecord, iterator, &record, &reclen);
+	if (error) {
+		printf("hfs_agehotfiles: BTIterateRecord: %d\n", error);
+		error = MacToVFSError(error);
+		goto out;
+	}
+	bcopy(iterator, prev_iterator, sizeof(BTreeIterator));
+	prev_data = data;
+
+	for (i = 0; i < numrecs; ++i) {
+		error = BTIterateRecord(filefork, kBTreeNextRecord, iterator, &record, &reclen);
+		if (error == 0) {
+			if (key->temperature < prev_key->temperature) {
+				printf("hfs_agehotfiles: out of order keys!\n");
+				error = EFTYPE;
+				break;
+			}
+			if (reclen != sizeof(data)) {
+				printf("hfs_agehotfiles: invalid record length %d\n", reclen);
+				error = EFTYPE;
+				break;
+			}
+			if (key->keyLength != HFC_KEYLENGTH) {
+				printf("hfs_agehotfiles: invalid key length %d\n", key->keyLength);
+				error = EFTYPE;
+				break;
+			}
+		} else if ((error == fsBTEndOfIterationErr || error == fsBTRecordNotFoundErr) &&
+		    (i == (numrecs - 1))) {
+			/* Running off the end is expected on the last pass. */
+			error = 0;
+		} else if (error) {
+			printf("hfs_agehotfiles: %d of %d BTIterateRecord: %d\n", i, numrecs, error);
+			error = MacToVFSError(error);
+			break;
+		}
+		if (prev_key->temperature == HFC_LOOKUPTAG) {
+#if HFC_VERBOSE
+			printf("hfs_agehotfiles: ran into thread record\n");
+#endif
+			error = 0;
+			break;
+		}
+		error = BTDeleteRecord(filefork, prev_iterator);
+		if (error) {
+			printf("hfs_agehotfiles: BTDeleteRecord failed %d (file %d)\n", error, prev_key->fileID);
+			error = MacToVFSError(error);
+			break;
+		}
+
+		/* Age by halving the temperature (floor = 4) */
+		newtemp = MAX(prev_key->temperature >> 1, 4);
+		prev_key->temperature = newtemp;
+
+		error = BTInsertRecord(filefork, prev_iterator, &prev_record, sizeof(data));
+		if (error) {
+			printf("hfs_agehotfiles: BTInsertRecord failed %d (file %d)\n", error, prev_key->fileID);
+			error = MacToVFSError(error);
+			break;
+		}
+		++aged;
+		/*
+		 * Update thread entry with latest temperature.
+		 */
+		prev_key->temperature = HFC_LOOKUPTAG;
+		error = BTUpdateRecord(filefork, prev_iterator,
+				(IterateCallBackProcPtr)update_callback,
+				&newtemp);
+		if (error) {
+			printf("hfs_agehotfiles: %d of %d BTUpdateRecord failed %d (file %d, %d)\n",
+				i, numrecs, error, prev_key->fileID, newtemp);
+			error = MacToVFSError(error);
+		//	break;
+		}
+
+		/* The record just visited becomes the one to age on the next pass. */
+		bcopy(iterator, prev_iterator, sizeof(BTreeIterator));
+		prev_data = data;
+
+	} /* end for */
+
+#if HFC_VERBOSE
+	if (error == 0)
+		printf("hfs_agehotfiles: aged %d records out of %d\n", aged, btinfo.numRecords);
+#endif
+	(void) BTFlushPath(filefork);
+out:
+	(void) VOP_UNLOCK(hfsmp->hfc_filevp, 0, p);
+
+	if (hfsmp->jnl) {
+	//	hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+		journal_end_transaction(hfsmp->jnl);
+	}
+	hfs_global_shared_lock_release(hfsmp);
+out2:
+	FREE(iterator, M_TEMP);
+	return (error);
+}
+
+/*
+ * Return true if one of the given extents (at most
+ * kHFSPlusExtentDensity, stopping at the first with startBlock 0)
+ * either lies entirely inside the hot file region or starts before
+ * the region's end and extends past it.
+ *
+ * NOTE(review): an extent that begins before hfs_hotfile_start and
+ * ends inside the region is not reported - confirm this is intended.
+ */
+static int
+hotextents(struct hfsmount *hfsmp, HFSPlusExtentDescriptor * extents)
+{
+	u_int32_t b1, b2;
+	int i;
+	int inside = 0;
+
+	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
+		b1 = extents[i].startBlock;
+		if (b1 == 0)
+			break;
+		/* b1..b2 is the inclusive block range of this extent. */
+		b2 = b1 + extents[i].blockCount - 1;
+		if ((b1 >= hfsmp->hfs_hotfile_start &&
+		     b2 <= hfsmp->hfs_hotfile_end) ||
+		    (b1 < hfsmp->hfs_hotfile_end &&
+		     b2 > hfsmp->hfs_hotfile_end)) {
+			inside = 1;
+			break;
+		}
+	}
+	return (inside);
+}
+
+
+/*
+ *========================================================================
+ *                       HOT FILE B-TREE ROUTINES
+ *========================================================================
+ */
+
+/*
+ * Open the hot files b-tree for writing.
+ *
+ * On successful exit the vnode has a reference but is unlocked.
+ *
+ * The file is looked up by name (HFC_FILENAME) in the root directory
+ * under the catalog lock, a vnode is obtained with CD_ISMETA set, and
+ * the b-tree is opened with hfc_comparekeys as the key comparator.
+ * If the vnode comes back without VSYSTEM it is discarded and the
+ * lookup is retried once; a second failure returns EBUSY.
+ *
+ * On failure *vpp is NULL and no reference is retained.
+ */
+static int
+hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp)
+{
+	struct proc *p;
+	struct vnode *vp;
+	struct cat_desc  cdesc = {0};
+	struct cat_attr  cattr;
+	struct cat_fork  cfork;
+	static char filename[] = HFC_FILENAME;
+	int  error;
+	int  retry = 0;
+
+	*vpp = NULL;
+	p = current_proc();
+
+	cdesc.cd_parentcnid = kRootDirID;
+	cdesc.cd_nameptr = filename;
+	cdesc.cd_namelen = strlen(filename);
+
+	/* Lock catalog b-tree */
+	error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p);
+	if (error)
+		return (error);
+
+	error = cat_lookup(hfsmp, &cdesc, 0, &cdesc, &cattr, &cfork);
+
+	/* Unlock catalog b-tree */
+	(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
+
+	if (error) {
+		printf("hfc_btree_open: cat_lookup error %d\n", error);
+		return (error);
+	}
+again:
+	cdesc.cd_flags |= CD_ISMETA;
+	error = hfs_getnewvnode(hfsmp, NULL, &cdesc, 0, &cattr, &cfork, &vp);
+	if (error) {
+		printf("hfc_btree_open: hfs_getnewvnode error %d\n", error);
+		cat_releasedesc(&cdesc);
+		return (error);
+	}
+	if ((vp->v_flag & VSYSTEM) == 0) {
+#if HFC_VERBOSE
+		printf("hfc_btree_open: file has UBC, try again\n");
+#endif
+		vput(vp);
+		vgone(vp);
+		if (retry++ == 0)
+			goto again;
+		else
+			return (EBUSY);
+	}
+
+	/* Open the B-tree file for writing... */
+	error = BTOpenPath(VTOF(vp), (KeyCompareProcPtr) hfc_comparekeys);
+	if (error) {
+		printf("hfc_btree_open: BTOpenPath error %d\n", error);
+		error = MacToVFSError(error);
+	} else {
+#if HFC_VERBOSE
+		struct BTreeInfoRec btinfo;
+
+		if (BTGetInformation(VTOF(vp), 0, &btinfo) == 0) {
+			printf("btinfo: nodeSize %d\n", btinfo.nodeSize);
+			printf("btinfo: maxKeyLength %d\n", btinfo.maxKeyLength);
+			printf("btinfo: treeDepth %d\n", btinfo.treeDepth);
+			printf("btinfo: numRecords %d\n", btinfo.numRecords);
+			printf("btinfo: numNodes %d\n", btinfo.numNodes);
+			printf("btinfo: numFreeNodes %d\n", btinfo.numFreeNodes);
+		}
+#endif
+	}
+
+	VOP_UNLOCK(vp, 0, p);	/* unlocked with a single reference */
+
+	/*
+	 * Sanity-check the vnode while our reference is still held.  These
+	 * checks used to run after the error-path vrele(), i.e. on a vnode
+	 * this function no longer had any claim on.
+	 */
+	if ((vp->v_flag & VSYSTEM) == 0)
+		panic("hfc_btree_open: not a system file (vp = 0x%08x)", vp);
+
+	if (UBCINFOEXISTS(vp))
+		panic("hfc_btree_open: has UBCInfo (vp = 0x%08x)", vp);
+
+	if (error)
+		vrele(vp);
+	else
+		*vpp = vp;
+
+	return (error);
+}
+
+/*
+ * Close the hot files b-tree.
+ *
+ * On entry the vnode is not locked but has a reference.
+ *
+ * The journal (if any) is flushed first.  When the vnode can be locked
+ * it is fsync'ed and the b-tree path is closed; a BTClosePath failure is
+ * logged but not propagated.  The caller's reference is then dropped
+ * and the vnode is passed to vgone().  Always returns 0.
+ */
+static int
+hfc_btree_close(struct hfsmount *hfsmp, struct vnode *vp)
+{
+	struct proc *p = current_proc();
+	int  error = 0;
+
+
+	if (hfsmp->jnl) {
+		journal_flush(hfsmp->jnl);
+	}
+
+	if (vget(vp, LK_EXCLUSIVE, p) == 0) {
+		(void) VOP_FSYNC(vp, NOCRED, MNT_WAIT, p);
+		error = BTClosePath(VTOF(vp));
+		if (error)
+			printf("hfc_btree_close: BTClosePath error %d\n", error);
+		vput(vp);
+	}
+	vrele(vp);
+	vgone(vp);
+
+	return (0);
+}
+
+/*
+ *  Create a hot files btree file.
+ *
+ * The file is opened (created if needed) as <mount point>/HFC_FILENAME.
+ * If it is smaller than one node, a header node is built in a
+ * nodesize-byte kernel buffer (node descriptor, header record, user
+ * record, map record, plus four record offsets at the end of the node),
+ * the file is truncated to the computed file size and the header node
+ * is written at offset 0.
+ *
+ * The vnode is always unlocked, closed and passed to vgone() before
+ * returning.  Returns 0 or an errno value (EFTYPE when the file is not
+ * a regular file with a single link, ENOMEM when the node buffer
+ * cannot be allocated).
+ */
+static int
+hfc_btree_create(struct hfsmount *hfsmp, int nodesize, int entries)
+{
+	struct proc *p;
+	struct nameidata nd;
+	struct vnode *vp;
+	char path[128];
+	int  error;
+
+
+	if (hfsmp->hfc_filevp)
+		panic("hfc_btree_create: hfc_filevp exists (vp = 0x%08x)", hfsmp->hfc_filevp);
+
+	p = current_proc();
+	snprintf(path, sizeof(path), "%s/%s",
+	         hfsmp->hfs_mp->mnt_stat.f_mntonname, HFC_FILENAME);
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p);
+	if ((error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IWUSR)) != 0) {
+		return (error);
+	}
+	vp = nd.ni_vp;
+
+	/* Don't use non-regular files or files with links. */
+	if (vp->v_type != VREG || VTOC(vp)->c_nlink != 1) {
+		error = EFTYPE;
+		goto out;
+	}
+
+	printf("HFS: created HFBT on %s\n", HFSTOVCB(hfsmp)->vcbVN);
+
+	if (VTOF(vp)->ff_size < nodesize) {
+		caddr_t  buffer;
+		u_int16_t *index;
+		u_int16_t  offset;
+		BTNodeDescriptor  *ndp;
+		BTHeaderRec  *bthp;
+		HotFilesInfo *hotfileinfo;
+		int  nodecnt;
+		int  filesize;
+		int  entriespernode;
+
+		/*
+		 * Mark it invisible (truncate will pull these changes).
+		 */
+		((FndrFileInfo *)&VTOC(vp)->c_finderinfo[0])->fdFlags |=
+			SWAP_BE16 (kIsInvisible + kNameLocked);
+
+		if (kmem_alloc(kernel_map, (vm_offset_t *)&buffer, nodesize)) {
+			error = ENOMEM;
+			goto out;
+		}
+		bzero(buffer, nodesize);
+		/* Record offsets are 16-bit slots filled from the end of the node. */
+		index = (u_int16_t *)buffer;
+
+		entriespernode = (nodesize - sizeof(BTNodeDescriptor) - 2) /
+				 (sizeof(HotFileKey) + 6);
+		nodecnt = 2 + howmany(entries * 2, entriespernode);
+		nodecnt = roundup(nodecnt, 8);
+		filesize = nodecnt * nodesize;
+
+		/* FILL IN THE NODE DESCRIPTOR:  */
+		ndp = (BTNodeDescriptor *)buffer;
+		ndp->kind = kBTHeaderNode;
+		ndp->numRecords = SWAP_BE16 (3);
+		offset = sizeof(BTNodeDescriptor);
+		index[(nodesize / 2) - 1] = SWAP_BE16 (offset);
+
+		/* FILL IN THE HEADER RECORD:  */
+		bthp = (BTHeaderRec *)((UInt8 *)buffer + offset);
+		bthp->nodeSize     = SWAP_BE16 (nodesize);
+		bthp->totalNodes   = SWAP_BE32 (filesize / nodesize);
+		bthp->freeNodes    = SWAP_BE32 (nodecnt - 1);
+		bthp->clumpSize    = SWAP_BE32 (filesize);
+		bthp->btreeType    = kUserBTreeType; /* non-metadata */
+		bthp->attributes  |= SWAP_BE32 (kBTBigKeysMask);
+		bthp->maxKeyLength = SWAP_BE16 (HFC_KEYLENGTH);
+		offset += sizeof(BTHeaderRec);
+		index[(nodesize / 2) - 2] = SWAP_BE16 (offset);
+
+		/* FILL IN THE USER RECORD:  */
+		hotfileinfo = (HotFilesInfo *)((UInt8 *)buffer + offset);
+		hotfileinfo->magic       = SWAP_BE32 (HFC_MAGIC);
+		hotfileinfo->version     = SWAP_BE32 (HFC_VERSION);
+		hotfileinfo->duration    = SWAP_BE32 (HFC_DEFAULT_DURATION);
+		hotfileinfo->timebase    = 0;
+		hotfileinfo->timeleft    = 0;
+		hotfileinfo->threshold   = SWAP_BE32 (HFC_MINIMUM_TEMPERATURE);
+		hotfileinfo->maxfileblks = SWAP_BE32 (HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize);
+		hotfileinfo->maxfilecnt  = SWAP_BE32 (HFC_DEFAULT_FILE_COUNT);
+		/* NOTE(review): unbounded copy; relies on hfc_tag fitting in tag[] - confirm. */
+		strcpy(hotfileinfo->tag, hfc_tag);
+		offset += kBTreeHeaderUserBytes;
+		index[(nodesize / 2) - 3] = SWAP_BE16 (offset);
+
+		/* FILL IN THE MAP RECORD (only one node in use). */
+		*((u_int8_t *)buffer + offset) = 0x80;
+		offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec)
+				   - kBTreeHeaderUserBytes - (4 * sizeof(int16_t));
+		index[(nodesize / 2) - 4] = SWAP_BE16 (offset);
+
+		vp->v_flag |= VNOFLUSH;
+		error = VOP_TRUNCATE(vp, (off_t)filesize, IO_NDELAY, NOCRED, p);
+		if (error == 0) {
+			struct iovec aiov;
+			struct uio auio;
+
+			auio.uio_iov = &aiov;
+			auio.uio_iovcnt = 1;
+			aiov.iov_base = buffer;
+			/*
+			 * Only the header node is written: the buffer holds
+			 * nodesize bytes, so the iovec length must match it
+			 * (and uio_resid), not the whole file size.
+			 */
+			aiov.iov_len = nodesize;
+			auio.uio_resid = nodesize;
+			auio.uio_offset = (off_t)(0);
+			auio.uio_segflg = UIO_SYSSPACE;
+			auio.uio_rw = UIO_WRITE;
+			auio.uio_procp = (struct proc *)0;
+			error = VOP_WRITE(vp, &auio, 0, kernproc->p_ucred);
+		}
+		kmem_free(kernel_map, (vm_offset_t)buffer, nodesize);
+	}
+out:
+	(void) VOP_UNLOCK(vp, 0, p);
+	(void) vn_close(vp, FWRITE, kernproc->p_ucred, p);
+	vgone(vp);
+	return (error);
+}
+
+/*
+ * Compare two hot file b-tree keys.
+ *
+ * Keys are ordered by temperature, then file ID, then fork type.
+ *
+ * Result:   +n  search key > trial key
+ *            0  search key = trial key
+ *           -n  search key < trial key
+ */
+static int
+hfc_comparekeys(HotFileKey *searchKey, HotFileKey *trialKey)
+{
+	/*
+	 * Compare temperatures first.
+	 */
+	if (searchKey->temperature != trialKey->temperature)
+		return ((searchKey->temperature > trialKey->temperature) ? 1 : -1);
+
+	/*
+	 * Temperatures are equal so compare file ids.
+	 */
+	if (searchKey->fileID != trialKey->fileID)
+		return ((searchKey->fileID > trialKey->fileID) ? 1 : -1);
+
+	/*
+	 * File ids are equal so compare fork types.
+	 */
+	if (searchKey->forkType != trialKey->forkType)
+		return ((searchKey->forkType > trialKey->forkType) ? 1 : -1);
+
+	return (0);
+}
+
+
+/*
+ *========================================================================
+ *               HOT FILE DATA COLLECTING ROUTINES
+ *========================================================================
+ */
+
+/*
+ * Lookup a hot file entry in the tree.
+ *
+ * The tree is ordered by temperature, with fileid breaking ties.
+ * Returns the entry whose temperature and fileid both match, or NULL
+ * when the descent runs off the tree.
+ */
+static hotfile_entry_t *
+hf_lookup(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature)
+{
+	hotfile_entry_t *entry = hotdata->rootentry;
+
+	/*
+	 * Keep descending until BOTH key fields match.  (Testing the two
+	 * fields with && stopped at the first node that matched either one
+	 * and returned the wrong entry.)
+	 */
+	while (entry &&
+	       (entry->temperature != temperature ||
+	        entry->fileid != fileid)) {
+
+		if (temperature > entry->temperature)
+			entry = entry->right;
+		else if (temperature < entry->temperature)
+			entry = entry->left;
+		else if (fileid > entry->fileid)
+			entry = entry->right;
+		else
+			entry = entry->left;
+	}
+	return (entry);
+}
+
+/*
+ * Insert a hot file entry into the tree.
+ *
+ * Ordering: higher temperature goes right, lower goes left; for equal
+ * temperatures a higher fileid goes right, otherwise left.
+ * activefiles is incremented only when newentry is linked into the tree.
+ *
+ * NOTE(review): when a leaf with the same temperature and fileid is
+ * reached, newentry is not linked and is not returned to the free
+ * list here - confirm what callers expect in that case.
+ */
+static void
+hf_insert(hotfile_data_t *hotdata, hotfile_entry_t *newentry)
+{
+	hotfile_entry_t *entry = hotdata->rootentry;
+	u_int32_t fileid = newentry->fileid;
+	u_int32_t temperature = newentry->temperature;
+
+	if (entry == NULL) {
+		hotdata->rootentry = newentry;
+		hotdata->coldest = newentry;
+		hotdata->activefiles++;
+		return;
+	}
+
+	while (entry) {
+		if (temperature > entry->temperature) {
+			if (entry->right)
+				entry = entry->right;
+			else {
+				entry->right = newentry;
+				break;
+			}
+		} else if (temperature < entry->temperature) {
+			if (entry->left)
+				entry = entry->left;
+			else {
+				entry->left = newentry;
+				break;
+			}
+		} else if (fileid > entry->fileid) {
+			if (entry->right)
+				entry = entry->right;
+			else {
+				/* fileid > entry->fileid here, so this is never a duplicate */
+				entry->right = newentry;
+				break;
+			}
+		} else {
+			if (entry->left)
+				entry = entry->left;
+			else {
+				/*
+				 * Same temperature and same fileid: nothing is
+				 * linked, so the entry must not be counted.
+				 */
+				if (entry->fileid == fileid)
+					return;
+				entry->left = newentry;
+				break;
+			}
+		}
+	}
+
+	hotdata->activefiles++;
+}
+
+/*
+ * Find the coldest entry in the tree.
+ *
+ * This is the leftmost node; returns NULL when the tree is empty.
+ */
+static hotfile_entry_t *
+hf_coldest(hotfile_data_t *hotdata)
+{
+	hotfile_entry_t *entry = hotdata->rootentry;
+
+	if (entry) {
+		while (entry->left)
+			entry = entry->left;
+	}
+	return (entry);
+}
+
+/*
+ * Delete a hot file entry from the tree.
+ *
+ * The entry whose temperature and fileid both match is unlinked,
+ * cleared, and pushed onto the free list (chained through ->right).
+ * Nothing happens when no such entry exists.
+ */
+static void
+hf_delete(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature)
+{
+	hotfile_entry_t *entry, *parent, *next;
+
+	parent = NULL;
+	entry = hotdata->rootentry;
+
+	/*
+	 * Descend until BOTH key fields match.  (Testing the two fields
+	 * with && stopped at the first node that matched either one, so a
+	 * different entry could be unlinked.)
+	 */
+	while (entry &&
+	       (entry->temperature != temperature ||
+	        entry->fileid != fileid)) {
+
+		parent = entry;
+		if (temperature > entry->temperature)
+			entry = entry->right;
+		else if (temperature < entry->temperature)
+			entry = entry->left;
+		else if (fileid > entry->fileid)
+			entry = entry->right;
+		else
+			entry = entry->left;
+	}
+
+	if (entry) {
+		/*
+		 * Reorganize the sub-trees spanning from our entry.
+		 */
+		if ((next = entry->right)) {
+			hotfile_entry_t *pnextl, *psub;
+			/*
+			 * Tree pruning: take the left branch of the
+			 * current entry and place it at the lowest
+			 * left branch of the current right branch
+			 */
+			psub = next;
+
+			/* Walk the Right/Left sub tree from current entry */
+			while ((pnextl = psub->left))
+				psub = pnextl;
+
+			/* Plug the old left tree to the new ->Right leftmost entry */
+			psub->left = entry->left;
+
+		} else /* only left sub-tree, simple case */ {
+			next = entry->left;
+		}
+		/*
+		 * Now, plug the current entry sub tree to
+		 * the good pointer of our parent entry.
+		 */
+		if (parent == NULL)
+			hotdata->rootentry = next;
+		else if (parent->left == entry)
+			parent->left = next;
+		else
+			parent->right = next;
+
+		/* Place entry back on the free-list */
+		entry->left = 0;
+		entry->fileid = 0;
+		entry->temperature = 0;
+
+		entry->right = hotdata->freelist;
+		hotdata->freelist = entry;
+		hotdata->activefiles--;
+
+		/* Recompute the cached coldest entry if it was just removed. */
+		if (hotdata->coldest == entry || hotdata->coldest == NULL) {
+			hotdata->coldest = hf_coldest(hotdata);
+		}
+
+	}
+}
+
+/*
+ * Get a free hot file entry.
+ *
+ * Entries come from the free list (chained through ->right).  When the
+ * list is empty the coldest tree entry is deleted and reused.
+ * Returns NULL when there is neither a free entry nor a tree entry
+ * to steal.
+ */
+static hotfile_entry_t *
+hf_getnewentry(hotfile_data_t *hotdata)
+{
+	hotfile_entry_t * entry;
+
+	/*
+	 * When the free list is empty then steal the coldest one
+	 */
+	if (hotdata->freelist == NULL) {
+		entry = hf_coldest(hotdata);
+		if (entry == NULL)
+			return (NULL);	/* empty tree: nothing to steal */
+		hf_delete(hotdata, entry->fileid, entry->temperature);
+	}
+	entry = hotdata->freelist;
+	if (entry == NULL)
+		return (NULL);	/* the delete did not free anything */
+	hotdata->freelist = entry->right;
+	entry->right = 0;
+
+	return (entry);
+}
+
+
+/*
+ * Visit the tree in descending order.
+ *
+ * Right subtree first, then the node, then the left subtree; each
+ * visited node is copied into sortedlist->hfl_hotfile[*index] and its
+ * block count is added to hfl_totalblocks.
+ */
+static void
+hf_sortlist(hotfile_entry_t * root, int *index, hotfilelist_t *sortedlist)
+{
+	if (root) {
+		int i;
+
+		hf_sortlist(root->right, index, sortedlist);
+		i = *index;
+		++(*index);
+		sortedlist->hfl_hotfile[i].hf_fileid = root->fileid;
+		sortedlist->hfl_hotfile[i].hf_temperature = root->temperature;
+		sortedlist->hfl_hotfile[i].hf_blocks = root->blocks;
+		sortedlist->hfl_totalblocks += root->blocks;
+		hf_sortlist(root->left, index, sortedlist);
+	}
+}
+
+/*
+ * Generate a sorted list of hot files.
+ *
+ * Returns the number of entries written; hfl_count is set from
+ * hotdata->activefiles.
+ */
+static int
+hf_getsortedlist(hotfile_data_t * hotdata, hotfilelist_t *sortedlist)
+{
+	int index = 0;
+
+	hf_sortlist(hotdata->rootentry, &index, sortedlist);
+
+	sortedlist->hfl_count = hotdata->activefiles;
+
+	return (index);
+}
+
+
+#if HFC_DEBUG
+/* Record in *maxdepth the depth of the deepest node under root. */
+static void
+hf_maxdepth(hotfile_entry_t * root, int depth, int *maxdepth)
+{
+	if (root) {
+		depth++;
+		if (depth > *maxdepth)
+			*maxdepth = depth;
+		hf_maxdepth(root->left, depth, maxdepth);
+		hf_maxdepth(root->right, depth, maxdepth);
+	}
+}
+
+/* Print every entry, left subtree first (ascending order). */
+static void
+hf_printtree(hotfile_entry_t * root)
+{
+	if (root) {
+		hf_printtree(root->left);
+		printf("temperature: % 8d, fileid %d\n", root->temperature, root->fileid);
+		hf_printtree(root->right);
+	}
+}
+#endif
diff -urN xnu-344.49/bsd/hfs/hfs_hotfiles.h xnu-517/bsd/hfs/hfs_hotfiles.h
--- xnu-344.49/bsd/hfs/hfs_hotfiles.h Thu Jan 1 01:00:00 1970
+++ xnu-517/bsd/hfs/hfs_hotfiles.h Sat Oct 25 00:25:25 2003
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#ifndef __HFS_HOTFILES__ +#define __HFS_HOTFILES__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + + +#define HFC_FILENAME ".hotfiles.btree" + + +/* + * Temperature measurement constraints. + */ +#define HFC_DEFAULT_FILE_COUNT 1000 +#define HFC_DEFAULT_DURATION (3600 * 60) +#define HFC_CUMULATIVE_CYCLES 4 +#define HFC_MAXIMUM_FILE_COUNT 5000 +#define HFC_MAXIMUM_FILESIZE (10 * 1024 * 1024) +#define HFC_MINIMUM_TEMPERATURE 16 + + +/* + * Sync constraints. + */ +#define HFC_BLKSPERSYNC 300 +#define HFC_FILESPERSYNC 50 + + +/* + * Hot file clustering stages. + */ +enum hfc_stage { + HFC_DISABLED, + HFC_IDLE, + HFC_BUSY, + HFC_RECORDING, + HFC_EVALUATION, + HFC_EVICTION, + HFC_ADOPTION, +}; + + +/* + * B-tree file key format (on-disk). 
+ */ +struct HotFileKey { + u_int16_t keyLength; /* length of key, excluding this field */ + u_int8_t forkType; /* 0 = data fork, FF = resource fork */ + u_int8_t pad; /* make the other fields align on 32-bit boundary */ + u_int32_t temperature; /* temperature recorded */ + u_int32_t fileID; /* file ID */ +}; +typedef struct HotFileKey HotFileKey; + +#define HFC_LOOKUPTAG 0xFFFFFFFF +#define HFC_KEYLENGTH (sizeof(HotFileKey) - sizeof(u_int16_t)) + +/* + * B-tree header node user info (on-disk). + */ +struct HotFilesInfo { + u_int32_t magic; + u_int32_t version; + u_int32_t duration; /* duration of sample period */ + u_int32_t timebase; /* recording period start time */ + u_int32_t timeleft; /* recording period stop time */ + u_int32_t threshold; + u_int32_t maxfileblks; + u_int32_t maxfilecnt; + u_int8_t tag[32]; +}; +typedef struct HotFilesInfo HotFilesInfo; + +#define HFC_MAGIC 0xFF28FF26 +#define HFC_VERSION 1 + + +struct hfsmount; +struct proc; +struct vnode; + +/* + * Hot File interface functions. + */ +int hfs_hotfilesync (struct hfsmount *, struct proc *); + +int hfs_recording_init(struct hfsmount *, struct proc *); +int hfs_recording_start (struct hfsmount *, struct proc *); +int hfs_recording_stop (struct hfsmount *, struct proc *); +int hfs_recording_suspend (struct hfsmount *, struct proc *); +int hfs_recording_abort (struct hfsmount *, struct proc *); + +int hfs_addhotfile (struct vnode *); +int hfs_removehotfile (struct vnode *); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* __HFS_HOTFILES__ */ diff -urN xnu-344.49/bsd/hfs/hfs_link.c xnu-517/bsd/hfs/hfs_link.c --- xnu-344.49/bsd/hfs/hfs_link.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_link.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -116,6 +116,7 @@ struct cat_desc to_desc; int newlink = 0; int retval; + cat_cookie_t cookie = {0}; /* We don't allow link nodes in our Private Meta Data folder! */ @@ -125,10 +126,15 @@ if (hfs_freeblks(hfsmp, 0) == 0) return (ENOSPC); + /* Reserve some space in the Catalog file. */ + if ((retval = cat_preflight(hfsmp, (2 * CAT_CREATE)+ CAT_RENAME, &cookie, p))) { + return (retval); + } + /* Lock catalog b-tree */ retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (retval) { - return retval; + goto out2; } /* @@ -219,7 +225,8 @@ out: /* Unlock catalog b-tree */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - +out2: + cat_postflight(hfsmp, &cookie, p); return (retval); } @@ -235,6 +242,7 @@ IN struct componentname *cnp; */ +__private_extern__ int hfs_link(ap) struct vop_link_args /* { @@ -267,7 +275,7 @@ if (VTOVCB(tdvp)->vcbSigWord != kHFSPlusSigWord) return err_link(ap); /* hfs disks don't support hard links */ - if (hfsmp->hfs_private_metadata_dir == 0) + if (hfsmp->hfs_privdir_desc.cd_cnid == 0) return err_link(ap); /* no private metadata dir, no links possible */ if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) { @@ -329,12 +337,22 @@ // XXXdbg - need to do this here as well because cp could have changed error = VOP_UPDATE(vp, &tv, &tv, 1); - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); if (hfsmp->jnl) { journal_end_transaction(hfsmp->jnl); } hfs_global_shared_lock_release(hfsmp); + + /* free the pathname buffer */ + { + char *tmp = cnp->cn_pnbuf; + cnp->cn_pnbuf = NULL; + cnp->cn_flags &= ~HASBUF; + FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); + } + + HFS_KNOTE(vp, NOTE_LINK); + HFS_KNOTE(tdvp, NOTE_WRITE); out1: if (tdvp != vp) diff -urN xnu-344.49/bsd/hfs/hfs_lookup.c xnu-517/bsd/hfs/hfs_lookup.c --- xnu-344.49/bsd/hfs/hfs_lookup.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_lookup.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2002 
Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -274,16 +274,15 @@ retval = EJUSTRETURN; goto exit; } - + /* * Insert name into cache (as non-existent) if appropriate. * - * Disable negative caching since HFS is case-insensitive. + * Only done for case-sensitive HFS+ volumes. */ -#if 0 - if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + if ((hfsmp->hfs_flags & HFS_CASE_SENSITIVE) && + (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(dvp, *vpp, cnp); -#endif retval = ENOENT; goto exit; } @@ -456,6 +455,34 @@ cache_enter(dvp, *vpp, cnp); } + + // + // have to patch up the resource fork name because + // it won't happen properly in the layers above us. + // + if (wantrsrc) { + if (VTOC(*vpp)->c_vp == NULL) { + if (VNAME(*vpp) == NULL) { + VNAME(*vpp) = add_name(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); + } + if (VPARENT(*vpp) == NULL) { + vget(dvp, 0, p); + VPARENT(*vpp) = dvp; + } + } else { + if (VNAME(*vpp) == NULL) { + // the +1/-2 thing is to skip the leading "/" on the rsrc fork spec + // and to not count the trailing null byte at the end of the string. 
+ VNAME(*vpp) = add_name(_PATH_RSRCFORKSPEC+1, sizeof(_PATH_RSRCFORKSPEC)-2, 0, 0); + } + if (VPARENT(*vpp) == NULL && *vpp != VTOC(*vpp)->c_vp) { + VPARENT(*vpp) = VTOC(*vpp)->c_vp; + VTOC(*vpp)->c_flag |= C_VPREFHELD; + vget(VTOC(*vpp)->c_vp, 0, p); + } + } + } + exit: cat_releasedesc(&desc); return (retval); @@ -483,6 +510,8 @@ * */ +#define S_IXALL 0000111 + __private_extern__ int hfs_cache_lookup(ap) @@ -495,16 +524,15 @@ struct vnode *dvp; struct vnode *vp; struct cnode *cp; + struct cnode *dcp; int lockparent; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; - struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; struct proc *p = cnp->cn_proc; u_long vpid; /* capability number of vnode */ - *vpp = NULL; dvp = ap->a_dvp; lockparent = flags & LOCKPARENT; @@ -514,11 +542,17 @@ if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && - (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) - return (EROFS); - if ((error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_proc))) - return (error); + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto err_exit; + } + dcp = VTOC(dvp); + if (((dcp->c_mode & S_IXALL) != S_IXALL) && (cnp->cn_cred->cr_uid != 0)) { + if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p))) { + goto err_exit; + } + } /* * Lookup an entry in the cache * If the lookup succeeds, the vnode is returned in *vpp, and a status of -1 is @@ -527,14 +561,15 @@ * fails, a status of zero is returned. 
*/ error = cache_lookup(dvp, vpp, cnp); - if (error == 0) { /* Unsuccessfull */ - error = hfs_lookup(ap); - return (error); + if (error != -1) { + if (error == 0) { /* Unsuccessfull */ + goto lookup; + } + + if (error == ENOENT) { + goto err_exit; + } } - - if (error == ENOENT) - return (error); - /* We have a name that matched */ vp = *vpp; vpid = vp->v_id; @@ -583,20 +618,51 @@ int wantrsrc = 0; cnp->cn_consume = forkcomponent(cnp, &wantrsrc); - - /* Fork names are only for lookups */ - if (cnp->cn_consume && - (cnp->cn_nameiop != LOOKUP && cnp->cn_nameiop != CREATE)) - return (EPERM); - /* - * We only store data forks in the name cache. - */ - if (wantrsrc) - return (hfs_lookup(ap)); + if (cnp->cn_consume) { + flags |= ISLASTCN; + /* Fork names are only for lookups */ + if (cnp->cn_nameiop != LOOKUP && + cnp->cn_nameiop != CREATE) { + error = EPERM; + + goto err_exit; + } + } + + if (wantrsrc) { + /* Use cnode's rsrcfork vnode (if available) */ + if (cp->c_rsrc_vp != NULL) { + *vpp = vp = cp->c_rsrc_vp; + if (VNAME(vp) == NULL) { + // the +1/-2 thing is to skip the leading "/" on the rsrc fork spec + // and to not count the trailing null byte at the end of the string. + VNAME(vp) = add_name(_PATH_RSRCFORKSPEC+1, sizeof(_PATH_RSRCFORKSPEC)-2, 0, 0); + } + if (VPARENT(vp) == NULL) { + vget(cp->c_vp, 0, p); + VPARENT(vp) = cp->c_vp; + } + vpid = vp->v_id; + } else { + goto lookup; + } + } + } + error = vget(vp, 0, p); + if (error == 0) { + if (VTOC(vp) == NULL || vp->v_data != (void *)cp) { + panic("hfs: cache lookup: my cnode disappeared/went bad! 
vp 0x%x 0x%x 0x%x\n", + vp, vp->v_data, cp); + } + if (cnp->cn_nameiop == LOOKUP && + (!(flags & ISLASTCN) || (flags & SHAREDLEAF))) + error = lockmgr(&VTOC(vp)->c_lock, LK_SHARED, NULL, p); + else + error = lockmgr(&VTOC(vp)->c_lock, LK_EXCLUSIVE, NULL, p); + } + if (!lockparent || error || !(flags & ISLASTCN)) { + (void) lockmgr(&dcp->c_lock, LK_RELEASE, NULL, p); } - error = vget(vp, LK_EXCLUSIVE, p); - if (!lockparent || error || !(flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); } /* * Check that the capability number did not change @@ -616,8 +682,12 @@ if ((error = vn_lock(dvp, LK_EXCLUSIVE, p))) return (error); - +lookup: return (hfs_lookup(ap)); + +err_exit: + *vpp = NULL; + return (error); } diff -urN xnu-344.49/bsd/hfs/hfs_mount.h xnu-517/bsd/hfs/hfs_mount.h --- xnu-344.49/bsd/hfs/hfs_mount.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_mount.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -63,6 +63,17 @@ #define HFSFSMNT_NOXONFILES 0x1 /* disable execute permissions for files */ #define HFSFSMNT_WRAPPER 0x2 /* mount HFS wrapper (if it exists) */ #define HFSFSMNT_EXTENDED_ARGS 0x4 /* indicates new fields after "flags" are valid */ + +/* + * Sysctl values for HFS + */ +#define HFS_ENCODINGBIAS 1 /* encoding matching CJK bias */ +#define HFS_EXTEND_FS 2 +#define HFS_ENCODINGHINT 3 /* guess encoding for string */ +#define HFS_ENABLE_JOURNALING 0x082969 +#define HFS_DISABLE_JOURNALING 0x031272 +#define HFS_GET_JOURNAL_INFO 0x6a6e6c69 +#define HFS_SET_PKG_EXTENSIONS 0x121031 #endif /* __APPLE_API_UNSTABLE */ diff -urN xnu-344.49/bsd/hfs/hfs_notification.c xnu-517/bsd/hfs/hfs_notification.c --- xnu-344.49/bsd/hfs/hfs_notification.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/hfs/hfs_notification.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2003 Apple Computer, Inc. 
All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_cnode.h" +#include "hfs_lockf.h" +#include "hfs_dbg.h" +#include "hfs_mount.h" +#include "hfs_quota.h" +#include "hfs_endian.h" + +#include "hfscommon/headers/BTreesInternal.h" +#include "hfscommon/headers/FileMgrInternal.h" + + + +void hfs_generate_volume_notifications(struct hfsmount *hfsmp) { + ExtendedVCB *vcb = HFSTOVCB(hfsmp); + + if (hfsmp->hfs_notification_conditions & VQ_LOWDISK) { + /* Check to see whether the free space is back above the minimal level: */ + if (hfs_freeblks(hfsmp, 1) > hfsmp->hfs_freespace_notify_desiredlevel) { + hfsmp->hfs_notification_conditions &= ~VQ_LOWDISK; + vfs_event_signal(&HFSTOVFS(hfsmp)->mnt_stat.f_fsid, hfsmp->hfs_notification_conditions, NULL); + } + } else { + /* Check to see whether the free space fell below the requested 
limit: */ + if (hfs_freeblks(hfsmp, 1) < hfsmp->hfs_freespace_notify_warninglimit) { + hfsmp->hfs_notification_conditions |= VQ_LOWDISK; + vfs_event_signal(&HFSTOVFS(hfsmp)->mnt_stat.f_fsid, hfsmp->hfs_notification_conditions, NULL); + } + }; +} diff -urN xnu-344.49/bsd/hfs/hfs_quota.c xnu-517/bsd/hfs/hfs_quota.c --- xnu-344.49/bsd/hfs/hfs_quota.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_quota.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -707,11 +707,9 @@ for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { if (vp->v_mount != mp) goto again; - nextvp = vp->v_mntvnodes.le_next; simple_lock(&vp->v_interlock); simple_unlock(&mntvnode_slock); - error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); if (error) { simple_lock(&mntvnode_slock); @@ -720,13 +718,11 @@ continue; } - // Make sure that this is really an hfs vnode. - // - if ( vp->v_mount != mp - || vp->v_type == VNON - || vp->v_tag != VT_HFS - || VTOC(vp) == NULL) { - + /* Make sure that this is really an hfs vnode. */ + if (vp->v_mount != mp || + vp->v_type == VNON || + vp->v_tag != VT_HFS || + VTOC(vp) == NULL) { vput(vp); simple_lock(&mntvnode_slock); goto again; diff -urN xnu-344.49/bsd/hfs/hfs_readwrite.c xnu-517/bsd/hfs/hfs_readwrite.c --- xnu-344.49/bsd/hfs/hfs_readwrite.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_readwrite.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,10 @@ extern u_int32_t GetLogicalBlockSize(struct vnode *vp); +static int hfs_clonelink(struct vnode *, int, struct ucred *, struct proc *); +static int hfs_clonefile(struct vnode *, int, int, int, struct ucred *, struct proc *); +static int hfs_clonesysfile(struct vnode *, int, int, int, struct ucred *, struct proc *); + /***************************************************************************** * @@ -97,18 +102,16 @@ register struct vnode *vp = ap->a_vp; struct cnode *cp; struct filefork *fp; - struct buf *bp; - daddr_t logBlockNo; - u_long fragSize, moveSize, startOffset, ioxfersize; int devBlockSize = 0; - off_t bytesRemaining; int retval = 0; off_t filesize; off_t filebytes; + off_t start_resid = uio->uio_resid; + /* Preflight checks */ - if (vp->v_type != VREG && vp->v_type != VLNK) - return (EISDIR); /* HFS can only read files */ + if ((vp->v_type != VREG) || !UBCINFOEXISTS(vp)) + return (EPERM); /* can only read regular files */ if (uio->uio_resid == 0) return (0); /* Nothing left to do */ if (uio->uio_offset < 0) @@ -130,105 +133,29 @@ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START, (int)uio->uio_offset, uio->uio_resid, (int)filesize, (int)filebytes, 0); - if (UBCISVALID(vp)) { - retval = cluster_read(vp, uio, filesize, devBlockSize, 0); - } else { - - for (retval = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { - - if ((bytesRemaining = (filesize - uio->uio_offset)) <= 0) - break; - - logBlockNo = (daddr_t)(uio->uio_offset / PAGE_SIZE_64); - startOffset = (u_long) (uio->uio_offset & PAGE_MASK_64); - fragSize = PAGE_SIZE; - - if (((logBlockNo * PAGE_SIZE) + fragSize) < filesize) - ioxfersize = fragSize; - else { - ioxfersize = filesize - (logBlockNo * PAGE_SIZE); - ioxfersize = (ioxfersize + (devBlockSize - 1)) & ~(devBlockSize - 1); - } - moveSize = ioxfersize; - moveSize -= startOffset; - - if (bytesRemaining < 
moveSize) - moveSize = bytesRemaining; - - if (uio->uio_resid < moveSize) { - moveSize = uio->uio_resid; - }; - if (moveSize == 0) { - break; - }; - - if (( uio->uio_offset + fragSize) >= filesize) { - retval = bread(vp, logBlockNo, ioxfersize, NOCRED, &bp); - - } else if (logBlockNo - 1 == vp->v_lastr && !(vp->v_flag & VRAOFF)) { - daddr_t nextLogBlockNo = logBlockNo + 1; - int nextsize; - - if (((nextLogBlockNo * PAGE_SIZE) + - (daddr_t)fragSize) < filesize) - nextsize = fragSize; - else { - nextsize = filesize - (nextLogBlockNo * PAGE_SIZE); - nextsize = (nextsize + (devBlockSize - 1)) & ~(devBlockSize - 1); - } - retval = breadn(vp, logBlockNo, ioxfersize, &nextLogBlockNo, &nextsize, 1, NOCRED, &bp); - } else { - retval = bread(vp, logBlockNo, ioxfersize, NOCRED, &bp); - }; - - if (retval != E_NONE) { - if (bp) { - brelse(bp); - bp = NULL; - } - break; - }; - vp->v_lastr = logBlockNo; - - /* - * We should only get non-zero b_resid when an I/O retval - * has occurred, which should cause us to break above. - * However, if the short read did not cause an retval, - * then we want to ensure that we do not uiomove bad - * or uninitialized data. - */ - ioxfersize -= bp->b_resid; - - if (ioxfersize < moveSize) { /* XXX PPD This should take the offset into account, too! */ - if (ioxfersize == 0) - break; - moveSize = ioxfersize; - } - if ((startOffset + moveSize) > bp->b_bcount) - panic("hfs_read: bad startOffset or moveSize\n"); - - if ((retval = uiomove((caddr_t)bp->b_data + startOffset, (int)moveSize, uio))) - break; - - if (S_ISREG(cp->c_mode) && - (((startOffset + moveSize) == fragSize) || (uio->uio_offset == filesize))) { - bp->b_flags |= B_AGE; - }; - - brelse(bp); - /* Start of loop resets bp to NULL before reaching outside this block... 
*/ - } - - if (bp != NULL) { - brelse(bp); - } - } + retval = cluster_read(vp, uio, filesize, devBlockSize, 0); cp->c_flag |= C_ACCESS; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END, (int)uio->uio_offset, uio->uio_resid, (int)filesize, (int)filebytes, 0); + /* + * Keep track blocks read + */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING && retval == 0) { + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { + fp->ff_bytesread = start_resid - uio->uio_resid; + cp->c_atime = time.tv_sec; + } else { + fp->ff_bytesread += start_resid - uio->uio_resid; + } + } + return (retval); } @@ -256,37 +183,32 @@ struct uio *uio = ap->a_uio; struct cnode *cp; struct filefork *fp; - struct buf *bp; struct proc *p; struct timeval tv; ExtendedVCB *vcb; - int devBlockSize = 0; - daddr_t logBlockNo; - long fragSize; - off_t origFileSize, currOffset, writelimit, bytesToAdd; - off_t actualBytesAdded; - u_long blkoffset, resid, xfersize, clearSize; - int eflags, ioflag; - int retval; + int devBlockSize = 0; + off_t origFileSize, writelimit, bytesToAdd; + off_t actualBytesAdded; + u_long resid; + int eflags, ioflag; + int retval; off_t filebytes; - u_long fileblocks; struct hfsmount *hfsmp; int started_tr = 0, grabbed_lock = 0; - ioflag = ap->a_ioflag; if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (E_NONE); - if (vp->v_type != VREG && vp->v_type != VLNK) - return (EISDIR); /* Can only write files */ + if ((vp->v_type != VREG) || !UBCINFOEXISTS(vp)) + return (EPERM); /* Can only write regular files */ + ioflag = ap->a_ioflag; cp = VTOC(vp); fp = VTOF(vp); vcb = VTOVCB(vp); - fileblocks = fp->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)vcb->blockSize; + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; if (ioflag & IO_APPEND) uio->uio_offset = fp->ff_size; @@ -297,7 +219,7 @@ if (VTOHFS(vp)->jnl && cp->c_datafork) { struct 
HFSPlusExtentDescriptor *extd; - extd = &cp->c_datafork->ff_data.cf_extents[0]; + extd = &cp->c_datafork->ff_extents[0]; if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { return EPERM; } @@ -324,19 +246,6 @@ eflags = kEFDeferMask; /* defer file block allocations */ filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - /* - * NOTE: In the following loop there are two positions tracked: - * currOffset is the current I/O starting offset. currOffset - * is never >LEOF; the LEOF is nudged along with currOffset as - * data is zeroed or written. uio->uio_offset is the start of - * the current I/O operation. It may be arbitrarily beyond - * currOffset. - * - * The following is true at all times: - * currOffset <= LEOF <= uio->uio_offset <= writelimit - */ - currOffset = MIN(uio->uio_offset, fp->ff_size); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); retval = 0; @@ -356,6 +265,20 @@ #endif /* QUOTA */ hfsmp = VTOHFS(vp); + +#ifdef HFS_SPARSE_DEV + /* + * When the underlying device is sparse and space + * is low (< 8MB), stop doing delayed allocations + * and begin doing synchronous I/O. + */ + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && + (hfs_freeblks(hfsmp, 0) < 2048)) { + eflags &= ~kEFDeferMask; + ioflag |= IO_SYNC; + } +#endif /* HFS_SPARSE_DEV */ + if (writelimit > filebytes) { hfs_global_shared_lock_acquire(hfsmp); grabbed_lock = 1; @@ -369,16 +292,19 @@ } while (writelimit > filebytes) { - bytesToAdd = writelimit - filebytes; - if (suser(ap->a_cred, NULL) != 0) + if (ap->a_cred && suser(ap->a_cred, NULL) != 0) eflags |= kEFReserveMask; /* lock extents b-tree (also protects volume bitmap) */ retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, current_proc()); if (retval != E_NONE) break; - + + /* Files that are changing size are not hot file candidates. 
*/ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } retval = MacToVFSError(ExtendFileC (vcb, (FCB*)fp, bytesToAdd, 0, eflags, &actualBytesAdded)); @@ -394,6 +320,9 @@ // XXXdbg if (started_tr) { + tv = time; + VOP_UPDATE(vp, &tv, &tv, 1); + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); journal_end_transaction(hfsmp->jnl); started_tr = 0; @@ -403,7 +332,7 @@ grabbed_lock = 0; } - if (UBCISVALID(vp) && retval == E_NONE) { + if (retval == E_NONE) { off_t filesize; off_t zero_off; off_t tail_off; @@ -427,8 +356,10 @@ of the transfer to see whether is invalid and should be zero-filled as part of the transfer: */ - if (rl_scan(&fp->ff_invalidranges, zero_off, uio->uio_offset - 1, &invalid_range) != RL_NOOVERLAP) - lflag |= IO_HEADZEROFILL; + if (uio->uio_offset > zero_off) { + if (rl_scan(&fp->ff_invalidranges, zero_off, uio->uio_offset - 1, &invalid_range) != RL_NOOVERLAP) + lflag |= IO_HEADZEROFILL; + } } else { off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64; @@ -528,105 +459,10 @@ } if (resid > uio->uio_resid) cp->c_flag |= C_CHANGE | C_UPDATE; - } else { - while (retval == E_NONE && uio->uio_resid > 0) { - logBlockNo = currOffset / PAGE_SIZE; - blkoffset = currOffset & PAGE_MASK; - - if ((filebytes - currOffset) < PAGE_SIZE_64) - fragSize = filebytes - ((off_t)logBlockNo * PAGE_SIZE_64); - else - fragSize = PAGE_SIZE; - xfersize = fragSize - blkoffset; - - /* Make any adjustments for boundary conditions */ - if (currOffset + (off_t)xfersize > writelimit) - xfersize = writelimit - currOffset; - - /* - * There is no need to read into bp if: - * We start on a block boundary and will overwrite the whole block - * - * OR - */ - if ((blkoffset == 0) && (xfersize >= fragSize)) { - bp = getblk(vp, logBlockNo, fragSize, 0, 0, BLK_READ); - retval = 0; - - if (bp->b_blkno == -1) { - brelse(bp); - retval = EIO; /* XXX */ - break; - } - } else { - - if (currOffset == fp->ff_size && blkoffset == 0) { - bp = getblk(vp, logBlockNo, fragSize, 0, 0, BLK_READ); 
- retval = 0; - if (bp->b_blkno == -1) { - brelse(bp); - retval = EIO; /* XXX */ - break; - } - } else { - /* - * This I/O transfer is not sufficiently aligned, - * so read the affected block into a buffer: - */ - retval = bread(vp, logBlockNo, fragSize, ap->a_cred, &bp); - if (retval != E_NONE) { - if (bp) - brelse(bp); - break; - } - } - } - - /* See if we are starting to write within file boundaries: - * If not, then we need to present a "hole" for the area - * between the current EOF and the start of the current - * I/O operation: - * - * Note that currOffset is only less than uio_offset if - * uio_offset > LEOF... - */ - if (uio->uio_offset > currOffset) { - clearSize = MIN(uio->uio_offset - currOffset, xfersize); - bzero(bp->b_data + blkoffset, clearSize); - currOffset += clearSize; - blkoffset += clearSize; - xfersize -= clearSize; - } - - if (xfersize > 0) { - retval = uiomove((caddr_t)bp->b_data + blkoffset, (int)xfersize, uio); - currOffset += xfersize; - } - - if (ioflag & IO_SYNC) { - (void)VOP_BWRITE(bp); - } else if ((xfersize + blkoffset) == fragSize) { - bp->b_flags |= B_AGE; - bawrite(bp); - } else { - bdwrite(bp); - } - - /* Update the EOF if we just extended the file - * (the PEOF has already been moved out and the - * block mapping table has been updated): - */ - if (currOffset > fp->ff_size) { - fp->ff_size = currOffset; - if (UBCISVALID(vp)) - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ - } - if (retval || (resid == 0)) - break; - cp->c_flag |= C_CHANGE | C_UPDATE; - } /* endwhile */ } + HFS_KNOTE(vp, NOTE_WRITE); + ioerr_exit: /* * If we successfully wrote any data, and we are not the superuser @@ -648,6 +484,7 @@ tv = time; retval = VOP_UPDATE(vp, &tv, &tv, 1); } + vcb->vcbWrCnt++; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END, (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); @@ -656,6 +493,22 @@ } +#ifdef HFS_SPARSE_DEV +struct hfs_backingstoreinfo { + int signature; /* == 3419115 */ + int 
version; /* version of this struct (1) */ + int backingfd; /* disk image file (on backing fs) */ + int bandsize; /* sparse disk image band size */ +}; + +#define HFSIOC_SETBACKINGSTOREINFO _IOW('h', 7, struct hfs_backingstoreinfo) +#define HFSIOC_CLRBACKINGSTOREINFO _IO('h', 8) + +#define HFS_SETBACKINGSTOREINFO IOCBASECMD(HFSIOC_SETBACKINGSTOREINFO) +#define HFS_CLRBACKINGSTOREINFO IOCBASECMD(HFSIOC_CLRBACKINGSTOREINFO) + +#endif /* HFS_SPARSE_DEV */ + /* #% ioctl vp U U U @@ -684,10 +537,127 @@ } */ *ap; { switch (ap->a_command) { - case 1: { + +#ifdef HFS_SPARSE_DEV + case HFS_SETBACKINGSTOREINFO: { + struct hfsmount * hfsmp; + struct vnode * bsfs_rootvp; + struct vnode * di_vp; + struct file * di_fp; + struct hfs_backingstoreinfo *bsdata; + int error = 0; + + hfsmp = VTOHFS(ap->a_vp); + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + return (EALREADY); + } + if (ap->a_p->p_ucred->cr_uid != 0 && + ap->a_p->p_ucred->cr_uid != (HFSTOVFS(hfsmp))->mnt_stat.f_owner) { + return (EACCES); /* must be owner of file system */ + } + bsdata = (struct hfs_backingstoreinfo *)ap->a_data; + if (bsdata == NULL) { + return (EINVAL); + } + if (error = fdgetf(ap->a_p, bsdata->backingfd, &di_fp)) { + return (error); + } + if (fref(di_fp) == -1) { + return (EBADF); + } + if (di_fp->f_type != DTYPE_VNODE) { + frele(di_fp); + return (EINVAL); + } + di_vp = (struct vnode *)di_fp->f_data; + if (ap->a_vp->v_mount == di_vp->v_mount) { + frele(di_fp); + return (EINVAL); + } + + /* + * Obtain the backing fs root vnode and keep a reference + * on it. This reference will be dropped in hfs_unmount. 
+ */ + error = VFS_ROOT(di_vp->v_mount, &bsfs_rootvp); + if (error) { + frele(di_fp); + return (error); + } + VOP_UNLOCK(bsfs_rootvp, 0, ap->a_p); /* Hold on to the reference */ + + hfsmp->hfs_backingfs_rootvp = bsfs_rootvp; + hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; + hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize; + hfsmp->hfs_sparsebandblks *= 4; + + frele(di_fp); + return (0); + } + case HFS_CLRBACKINGSTOREINFO: { + struct hfsmount * hfsmp; + struct vnode * tmpvp; + + hfsmp = VTOHFS(ap->a_vp); + if (ap->a_p->p_ucred->cr_uid != 0 && + ap->a_p->p_ucred->cr_uid != (HFSTOVFS(hfsmp))->mnt_stat.f_owner) { + return (EACCES); /* must be owner of file system */ + } + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && + hfsmp->hfs_backingfs_rootvp) { + + hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; + tmpvp = hfsmp->hfs_backingfs_rootvp; + hfsmp->hfs_backingfs_rootvp = NULLVP; + hfsmp->hfs_sparsebandblks = 0; + vrele(tmpvp); + } + return (0); + } +#endif /* HFS_SPARSE_DEV */ + + case 6: { + int error; + + ap->a_vp->v_flag |= VFULLFSYNC; + error = VOP_FSYNC(ap->a_vp, ap->a_cred, MNT_NOWAIT, ap->a_p); + ap->a_vp->v_flag &= ~VFULLFSYNC; + + return error; + } + case 5: { + register struct vnode *vp; register struct cnode *cp; + struct filefork *fp; + int error; + + vp = ap->a_vp; + cp = VTOC(vp); + fp = VTOF(vp); + + if (vp->v_type != VREG) + return EINVAL; + + VOP_LEASE(vp, ap->a_p, ap->a_cred, LEASE_READ); + error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); + if (error) + return (error); + + /* + * used by regression test to determine if + * all the dirty pages (via write) have been cleaned + * after a call to 'fsysnc'. 
+ */ + error = is_file_clean(vp, fp->ff_size); + VOP_UNLOCK(vp, 0, ap->a_p); + + return (error); + } + + case 1: { register struct vnode *vp; register struct radvisory *ra; + register struct cnode *cp; struct filefork *fp; int devBlockSize = 0; int error; @@ -992,6 +962,7 @@ struct rl_entry *invalid_range; enum rl_overlaptype overlaptype; int started_tr = 0, grabbed_lock = 0; + struct timeval tv; /* * Check for underlying vnode requests and ensure that logical @@ -1001,6 +972,17 @@ return (0); p = current_proc(); + + if (ISSET(VTOC(ap->a_vp)->c_flag, C_NOBLKMAP)) { + /* + * File blocks are getting remapped. Wait until its finished. + */ + SET(VTOC(ap->a_vp)->c_flag, C_WBLKMAP); + (void) tsleep((caddr_t)VTOC(ap->a_vp), PINOD, "hfs_cmap", 0); + if (ISSET(VTOC(ap->a_vp)->c_flag, C_NOBLKMAP)) + panic("hfs_cmap: no mappable blocks"); + } + retry: if (fp->ff_unallocblocks) { lockExtBtree = 1; @@ -1040,7 +1022,7 @@ if (fp->ff_unallocblocks) { SInt64 reqbytes, actbytes; - // + // // Make sure we have a transaction. It's possible // that we came in and fp->ff_unallocblocks was zero // but during the time we blocked acquiring the extents @@ -1052,7 +1034,7 @@ (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); lockExtBtree = 0; } - + goto retry; } @@ -1071,6 +1053,10 @@ fp->ff_blocks -= fp->ff_unallocblocks; fp->ff_unallocblocks = 0; + /* Files that are changing size are not hot file candidates. 
*/ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } while (retval == 0 && reqbytes > 0) { retval = MacToVFSError(ExtendFileC(HFSTOVCB(hfsmp), (FCB*)fp, reqbytes, 0, @@ -1090,7 +1076,11 @@ if (retval) { (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); + VTOC(ap->a_vp)->c_flag |= C_MODIFIED; if (started_tr) { + tv = time; + VOP_UPDATE(ap->a_vp, &tv, &tv, 1); + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); journal_end_transaction(hfsmp->jnl); } @@ -1099,7 +1089,6 @@ } return (retval); } - VTOC(ap->a_vp)->c_flag |= C_MODIFIED; } retval = MacToVFSError( @@ -1115,6 +1104,9 @@ // XXXdbg if (started_tr) { + tv = time; + retval = VOP_UPDATE(ap->a_vp, &tv, &tv, 1); + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); journal_end_transaction(hfsmp->jnl); started_tr = 0; @@ -1361,21 +1353,7 @@ } -/* -# -#% truncate vp L L L -# -vop_truncate { - IN struct vnode *vp; - IN off_t length; - IN int flags; (IO_SYNC) - IN struct ucred *cred; - IN struct proc *p; -}; - * Truncate a cnode to at most length size, freeing (or adding) the - * disk blocks. - */ -int hfs_truncate(ap) +static int do_hfs_truncate(ap) struct vop_truncate_args /* { struct vnode *a_vp; off_t a_length; @@ -1420,6 +1398,11 @@ tv = time; retval = E_NONE; + /* Files that are changing size are not hot file candidates. */ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + /* * We cannot just check if fp->ff_size == length (as an optimization) * since there may be extra physical blocks that also need truncation. @@ -1447,13 +1430,23 @@ */ if (length > filebytes) { int eflags; + u_long blockHint = 0; /* All or nothing and don't round up to clumpsize. */ eflags = kEFAllMask | kEFNoClumpMask; - if (suser(ap->a_cred, NULL) != 0) + if (ap->a_cred && suser(ap->a_cred, NULL) != 0) eflags |= kEFReserveMask; /* keep a reserve */ + /* + * Allocate Journal and Quota files in metadata zone. 
+ */ + if (filebytes == 0 && + hfsmp->hfs_flags & HFS_METADATA_ZONE && + hfs_virtualmetafile(cp)) { + eflags |= kEFMetadataMask; + blockHint = hfsmp->hfs_metazone_start; + } // XXXdbg hfs_global_shared_lock_acquire(hfsmp); if (hfsmp->jnl) { @@ -1479,7 +1472,7 @@ retval = MacToVFSError(ExtendFileC(VTOVCB(vp), (FCB*)fp, bytesToAdd, - 0, + blockHint, eflags, &actualBytesAdded)); @@ -1495,6 +1488,9 @@ // XXXdbg if (hfsmp->jnl) { + tv = time; + VOP_UPDATE(vp, &tv, &tv, 1); + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); journal_end_transaction(hfsmp->jnl); } @@ -1642,6 +1638,9 @@ // XXXdbg if (hfsmp->jnl) { + tv = time; + VOP_UPDATE(vp, &tv, &tv, 1); + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); journal_end_transaction(hfsmp->jnl); } @@ -1676,6 +1675,83 @@ } +/* +# +#% truncate vp L L L +# +vop_truncate { + IN struct vnode *vp; + IN off_t length; + IN int flags; (IO_SYNC) + IN struct ucred *cred; + IN struct proc *p; +}; + * Truncate a cnode to at most length size, freeing (or adding) the + * disk blocks. A size change larger than HFS_BIGFILE_SIZE is applied in HFS_BIGFILE_SIZE steps, one do_hfs_truncate() call per step; returns 0 or the first error from do_hfs_truncate(). + */ +int hfs_truncate(ap) + struct vop_truncate_args /* { + struct vnode *a_vp; + off_t a_length; + int a_flags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct cnode *cp = VTOC(vp); + struct filefork *fp = VTOF(vp); + off_t length; + off_t filebytes; + u_long fileblocks; + int blksize, error; + u_int64_t nsize; + + if (vp->v_type != VREG && vp->v_type != VLNK) + return (EISDIR); /* cannot truncate an HFS directory! */ + + length = ap->a_length; + blksize = VTOVCB(vp)->blockSize; + fileblocks = fp->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + // have to loop truncating or growing files that are + // really big because otherwise transactions can get + // enormous and consume too many kernel resources. 
+ if (length < filebytes && (filebytes - length) > HFS_BIGFILE_SIZE) { + while (filebytes > length) { + if ((filebytes - length) > HFS_BIGFILE_SIZE) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = length; + } + + ap->a_length = filebytes; + error = do_hfs_truncate(ap); + if (error) + break; + } + } else if (length > filebytes && (length - filebytes) > HFS_BIGFILE_SIZE) { + while (filebytes < length) { + if ((length - filebytes) > HFS_BIGFILE_SIZE) { + filebytes += HFS_BIGFILE_SIZE; + } else { + filebytes = length; /* last step targets the requested size, not the remaining delta */ + } + + ap->a_length = filebytes; + error = do_hfs_truncate(ap); + if (error) + break; + } + } else { + error = do_hfs_truncate(ap); + } + + return error; +} + + /* # @@ -1706,6 +1782,7 @@ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); struct filefork *fp = VTOF(vp); + ExtendedVCB *vcb = VTOVCB(vp); off_t length = ap->a_length; off_t startingPEOF; off_t moreBytesRequested; @@ -1716,31 +1793,30 @@ struct timeval tv; int retval, retval2; UInt32 blockHint; - UInt32 extendFlags =0; /* For call to ExtendFileC */ + UInt32 extendFlags; /* For call to ExtendFileC */ struct hfsmount *hfsmp; hfsmp = VTOHFS(vp); *(ap->a_bytesallocated) = 0; fileblocks = fp->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)VTOVCB(vp)->blockSize; + filebytes = (off_t)fileblocks * (off_t)vcb->blockSize; if (length < (off_t)0) return (EINVAL); - if (vp->v_type != VREG && vp->v_type != VLNK) + if (vp->v_type != VREG) return (EISDIR); - if ((ap->a_flags & ALLOCATEFROMVOL) && (length <= filebytes)) + if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) return (EINVAL); /* Fill in the flags word for the call to Extend the file */ + extendFlags = kEFNoClumpMask; if (ap->a_flags & ALLOCATECONTIG) extendFlags |= kEFContigMask; - if (ap->a_flags & ALLOCATEALL) extendFlags |= kEFAllMask; - - if (suser(ap->a_cred, NULL) != 0) + if (ap->a_cred && suser(ap->a_cred, NULL) != 0) extendFlags |= kEFReserveMask; 
tv = time; @@ -1767,12 +1843,31 @@ #if QUOTA 
retval = hfs_chkdq(cp, - (int64_t)(roundup(moreBytesRequested, VTOVCB(vp)->blockSize)), + (int64_t)(roundup(moreBytesRequested, vcb->blockSize)), ap->a_cred, 0); if (retval) return (retval); #endif /* QUOTA */ + /* + * Metadata zone checks. + */ + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { + /* + * Allocate Journal and Quota files in metadata zone. + */ + if (hfs_virtualmetafile(cp)) { + extendFlags |= kEFMetadataMask; + blockHint = hfsmp->hfs_metazone_start; + } else if ((blockHint >= hfsmp->hfs_metazone_start) && + (blockHint <= hfsmp->hfs_metazone_end)) { + /* + * Move blockHint outside metadata zone. + */ + blockHint = hfsmp->hfs_metazone_end + 1; + } + } + // XXXdbg hfs_global_shared_lock_acquire(hfsmp); if (hfsmp->jnl) { @@ -1792,7 +1887,7 @@ goto Err_Exit; } - retval = MacToVFSError(ExtendFileC(VTOVCB(vp), + retval = MacToVFSError(ExtendFileC(vcb, (FCB*)fp, moreBytesRequested, blockHint, @@ -1800,12 +1895,15 @@ &actualBytesAdded)); *(ap->a_bytesallocated) = actualBytesAdded; - filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize; + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); // XXXdbg if (hfsmp->jnl) { + tv = time; + VOP_UPDATE(vp, &tv, &tv, 1); + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); journal_end_transaction(hfsmp->jnl); } @@ -1827,7 +1925,7 @@ */ if ((actualBytesAdded != 0) && (moreBytesRequested < actualBytesAdded)) *(ap->a_bytesallocated) = - roundup(moreBytesRequested, (off_t)VTOVCB(vp)->blockSize); + roundup(moreBytesRequested, (off_t)vcb->blockSize); } else { /* Shorten the size of the file */ @@ -1863,14 +1961,17 @@ retval = MacToVFSError( TruncateFileC( - VTOVCB(vp), + vcb, (FCB*)fp, length, false)); (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); - filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize; + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; if (hfsmp->jnl) { + tv = time; + 
VOP_UPDATE(vp, &tv, &tv, 1); + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); journal_end_transaction(hfsmp->jnl); } @@ -1925,7 +2026,7 @@ int devBlockSize = 0; int error; - if (vp->v_type != VREG && vp->v_type != VLNK) + if (vp->v_type != VREG) panic("hfs_pagein: vp not UBC type\n"); VOP_DEVBLOCKSIZE(VTOC(vp)->c_devvp, &devBlockSize); @@ -1933,6 +2034,25 @@ error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, ap->a_size, (off_t)VTOF(vp)->ff_size, devBlockSize, ap->a_flags); + /* + * Keep track of blocks read + */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { + struct cnode *cp; + + cp = VTOC(vp); + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) + VTOF(vp)->ff_bytesread = ap->a_size; + else + VTOF(vp)->ff_bytesread += ap->a_size; + + cp->c_flag |= C_ACCESS; + } + return (error); } @@ -1966,10 +2086,18 @@ filesize = fp->ff_size; end_of_range = ap->a_f_offset + ap->a_size - 1; + if (cp->c_flag & C_RELOCATING) { + if (end_of_range < (filesize / 2)) { + return (EBUSY); + } + } + if (end_of_range >= filesize) end_of_range = (off_t)(filesize - 1); - if (ap->a_f_offset < filesize) + if (ap->a_f_offset < filesize) { rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges); + cp->c_flag |= C_MODIFIED; /* leof is dirty */ + } retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, ap->a_size, filesize, devBlockSize, ap->a_flags); @@ -2036,3 +2164,470 @@ return (retval); } + +/* + * Relocate a file to a new location on disk + * cnode must be locked on entry + * + * Relocation occurs by cloning the file's data from its + * current set of blocks to a new set of blocks. During + * the relocation all of the blocks (old and new) are + * owned by the file. 
+ * + * ----------------- + * |///////////////| + * ----------------- + * 0 N (file offset) + * + * ----------------- ----------------- + * |///////////////| | | STEP 1 (acquire new blocks) + * ----------------- ----------------- + * 0 N N+1 2N + * + * ----------------- ----------------- + * |///////////////| |///////////////| STEP 2 (clone data) + * ----------------- ----------------- + * 0 N N+1 2N + * + * ----------------- + * |///////////////| STEP 3 (head truncate blocks) + * ----------------- + * 0 N + * + * During steps 2 and 3 page-outs to file offsets less + * than or equal to N are suspended. + * + * During step 3 page-ins to the file get suspended. + */ +__private_extern__ +int +hfs_relocate(vp, blockHint, cred, p) + struct vnode *vp; + u_int32_t blockHint; + struct ucred *cred; + struct proc *p; +{ + struct filefork *fp; + struct hfsmount *hfsmp; + ExtendedVCB *vcb; + + u_int32_t headblks; + u_int32_t datablks; + u_int32_t blksize; + u_int32_t realsize; + u_int32_t growsize; + u_int32_t nextallocsave; + u_int32_t sector_a; + u_int32_t sector_b; + int eflags; + u_int32_t oldstart; /* debug only */ + off_t newbytes; + int retval; + + if (vp->v_type != VREG && vp->v_type != VLNK) { + return (EPERM); + } + + hfsmp = VTOHFS(vp); + if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) { + return (ENOSPC); + } + + fp = VTOF(vp); + if (fp->ff_unallocblocks) + return (EINVAL); + vcb = VTOVCB(vp); + blksize = vcb->blockSize; + if (blockHint == 0) + blockHint = vcb->nextAllocation; + + if ((fp->ff_size > (u_int64_t)0x7fffffff) || + (vp->v_type == VLNK && fp->ff_size > blksize)) { + return (EFBIG); + } + + headblks = fp->ff_blocks; + datablks = howmany(fp->ff_size, blksize); + growsize = datablks * blksize; + realsize = fp->ff_size; + eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask; + if (blockHint >= hfsmp->hfs_metazone_start && + blockHint <= hfsmp->hfs_metazone_end) + eflags |= kEFMetadataMask; + + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if 
(journal_start_transaction(hfsmp->jnl) != 0) { + /* Don't leak the global shared lock acquired just above. */ + hfs_global_shared_lock_release(hfsmp); + return (EINVAL); + } + } + + /* Lock extents b-tree (also protects volume bitmap) */ + retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE, p); + if (retval) + goto out2; + + retval = MapFileBlockC(vcb, (FCB *)fp, 1, growsize - 1, &sector_a, NULL); + if (retval) { + retval = MacToVFSError(retval); + goto out; + } + + /* + * STEP 1 - acquire new allocation blocks. + */ + nextallocsave = vcb->nextAllocation; + retval = ExtendFileC(vcb, (FCB*)fp, growsize, blockHint, eflags, &newbytes); + if (eflags & kEFMetadataMask) + vcb->nextAllocation = nextallocsave; + + retval = MacToVFSError(retval); + if (retval == 0) { + VTOC(vp)->c_flag |= C_MODIFIED; + if (newbytes < growsize) { + retval = ENOSPC; + goto restore; + } else if (fp->ff_blocks < (headblks + datablks)) { + printf("hfs_relocate: allocation failed\n"); + retval = ENOSPC; + goto restore; + } + + retval = MapFileBlockC(vcb, (FCB *)fp, 1, growsize, &sector_b, NULL); + if (retval) { + retval = MacToVFSError(retval); + } else if ((sector_a + 1) == sector_b) { + retval = ENOSPC; + goto restore; + } else if ((eflags & kEFMetadataMask) && + ((((u_int64_t)sector_b * hfsmp->hfs_phys_block_size) / blksize) > + hfsmp->hfs_metazone_end)) { + printf("hfs_relocate: didn't move into metadata zone\n"); + retval = ENOSPC; + goto restore; + } + } + if (retval) { + /* + * Check to see if failure is due to excessive fragmentation. + */ + if (retval == ENOSPC && + hfs_freeblks(hfsmp, 0) > (datablks * 2)) { + hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE; + } + goto out; + } + + fp->ff_size = fp->ff_blocks * blksize; + if (UBCISVALID(vp)) + (void) ubc_setsize(vp, fp->ff_size); + + /* + * STEP 2 - clone data into the new allocation blocks. 
+ */ + + if (vp->v_type == VLNK) + retval = hfs_clonelink(vp, blksize, cred, p); + else if (vp->v_flag & VSYSTEM) + retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p); + else + retval = hfs_clonefile(vp, headblks, datablks, blksize, cred, p); + + if (retval) + goto restore; + + oldstart = fp->ff_extents[0].startBlock; + + /* + * STEP 3 - switch to clone and remove old blocks. + */ + SET(VTOC(vp)->c_flag, C_NOBLKMAP); /* suspend page-ins */ + + retval = HeadTruncateFile(vcb, (FCB*)fp, headblks); + + CLR(VTOC(vp)->c_flag, C_NOBLKMAP); /* resume page-ins */ + if (ISSET(VTOC(vp)->c_flag, C_WBLKMAP)) + wakeup(VTOC(vp)); + if (retval) + goto restore; + + fp->ff_size = realsize; + if (UBCISVALID(vp)) { + (void) ubc_setsize(vp, realsize); + (void) vinvalbuf(vp, V_SAVE, cred, p, 0, 0); + } + + CLR(VTOC(vp)->c_flag, C_RELOCATING); /* Resume page-outs for this file. */ +out: + (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p); + + /* Always sync, but never let a successful sync hide an earlier failure. */ + if (retval == 0) + retval = VOP_FSYNC(vp, cred, MNT_WAIT, p); + else + (void) VOP_FSYNC(vp, cred, MNT_WAIT, p); +out2: + if (hfsmp->jnl) { + if (VTOC(vp)->c_cnid < kHFSFirstUserCatalogNodeID) + (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); + else + (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + + return (retval); + +restore: + /* + * Give back any newly allocated space. + */ + if (fp->ff_size != realsize) + fp->ff_size = realsize; + (void) TruncateFileC(vcb, (FCB*)fp, fp->ff_size, false); + if (UBCISVALID(vp)) + (void) ubc_setsize(vp, fp->ff_size); + CLR(VTOC(vp)->c_flag, C_RELOCATING); + goto out; +} + + +/* + * Clone a symlink. 
+ * Copies logical block 0 to logical block 1 through the buffer cache; returns 0 or an errno. + */ +static int +hfs_clonelink(struct vnode *vp, int blksize, struct ucred *cred, struct proc *p) +{ + struct buf *head_bp = NULL; + struct buf *tail_bp = NULL; + int error; + + + error = meta_bread(vp, 0, blksize, cred, &head_bp); + if (error) + goto out; + + tail_bp = getblk(vp, 1, blksize, 0, 0, BLK_META); + if (tail_bp == NULL) { + error = EIO; + goto out; + } + bcopy(head_bp->b_data, tail_bp->b_data, blksize); + error = bwrite(tail_bp); +out: + if (head_bp) { + head_bp->b_flags |= B_INVAL; + brelse(head_bp); + } + (void) vinvalbuf(vp, V_SAVE, cred, p, 0, 0); + + return (error); +} + +/* + * Clone a file's data within the file. + * Copies blkcnt blocks from file offset 0 to block offset blkstart using cluster_read/cluster_write and a temporary kernel buffer. + */ +static int +hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize, + struct ucred *cred, struct proc *p) +{ + caddr_t bufp; + size_t writebase; + size_t bufsize; + size_t copysize; + size_t iosize; + size_t filesize; + size_t offset; + struct uio auio; + struct iovec aiov; + int devblocksize; + int didhold; + int error; + + + if ((error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0))) { + printf("hfs_clonefile: vinvalbuf failed - %d\n", error); + return (error); + } + + if (!ubc_clean(vp, 1)) { + printf("hfs_clonefile: not ubc_clean\n"); + return (EIO); /* XXX error code */ + } + + /* + * Suspend page-outs for this file. 
+ */ + SET(VTOC(vp)->c_flag, C_RELOCATING); + + filesize = VTOF(vp)->ff_size; + writebase = blkstart * blksize; + copysize = blkcnt * blksize; + iosize = bufsize = MIN(copysize, 4096 * 16); + offset = 0; + + if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { + return (ENOMEM); + } + + VOP_DEVBLOCKSIZE(VTOC(vp)->c_devvp, &devblocksize); + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = p; + + while (offset < copysize) { + iosize = MIN(copysize - offset, iosize); + + aiov.iov_base = bufp; + aiov.iov_len = iosize; + auio.uio_resid = iosize; + auio.uio_offset = offset; + auio.uio_rw = UIO_READ; + + error = cluster_read(vp, &auio, copysize, devblocksize, 0); + if (error) { + printf("hfs_clonefile: cluster_read failed - %d\n", error); + break; + } + if (auio.uio_resid != 0) { + printf("clonedata: cluster_read: uio_resid = %d\n", (int)auio.uio_resid); + error = EIO; + break; + } + + + aiov.iov_base = bufp; + aiov.iov_len = iosize; + auio.uio_resid = iosize; + auio.uio_offset = writebase + offset; + auio.uio_rw = UIO_WRITE; + + error = cluster_write(vp, &auio, filesize + offset, + filesize + offset + iosize, + auio.uio_offset, 0, devblocksize, 0); + if (error) { + printf("hfs_clonefile: cluster_write failed - %d\n", error); + break; + } + if (auio.uio_resid != 0) { + printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n"); + error = EIO; + break; + } + offset += iosize; + } + if (error == 0) { + /* Clean the pages in VM. */ + didhold = ubc_hold(vp); + if (didhold) + (void) ubc_clean(vp, 1); + + /* + * Clean out all associated buffers. + */ + (void) vinvalbuf(vp, V_SAVE, cred, p, 0, 0); + + if (didhold) + ubc_rele(vp); + } + kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); + + return (error); +} + +/* + * Clone a system (metadata) file. 
+ * Copies the blocks through the buffer cache (meta_bread/getblk/bwrite), up to a megabyte per pass, to block offset blkstart; a copy failure is returned in preference to the final sync result. + */ +static int +hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize, + struct ucred *cred, struct proc *p) +{ + caddr_t bufp; + char * offset; + size_t bufsize; + size_t iosize; + struct buf *bp = NULL; + daddr_t blkno; + daddr_t blk; + int breadcnt; + int i; + int error = 0; + + + iosize = GetLogicalBlockSize(vp); + bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1); + breadcnt = bufsize / iosize; + + if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { + return (ENOMEM); + } + blkstart = (blkstart * blksize) / iosize; + blkcnt = (blkcnt * blksize) / iosize; + blkno = 0; + + while (blkno < blkcnt) { + /* + * Read up to a megabyte + */ + offset = bufp; + for (i = 0, blk = blkno; (i < breadcnt) && (blk < blkcnt); ++i, ++blk) { + error = meta_bread(vp, blk, iosize, cred, &bp); + if (error) { + printf("hfs_clonesysfile: meta_bread error %d\n", error); + goto out; + } + if (bp->b_bcount != iosize) { + printf("hfs_clonesysfile: b_bcount is only %d\n", bp->b_bcount); + error = EIO; /* short read: the clone would be incomplete */ + goto out; + } + + bcopy(bp->b_data, offset, iosize); + bp->b_flags |= B_INVAL; + brelse(bp); + bp = NULL; + offset += iosize; + } + + /* + * Write up to a megabyte + */ + offset = bufp; + for (i = 0; (i < breadcnt) && (blkno < blkcnt); ++i, ++blkno) { + bp = getblk(vp, blkstart + blkno, iosize, 0, 0, BLK_META); + if (bp == NULL) { + printf("hfs_clonesysfile: getblk failed on blk %d\n", blkstart + blkno); + error = EIO; + goto out; + } + bcopy(offset, bp->b_data, iosize); + error = bwrite(bp); + bp = NULL; + if (error) + goto out; + offset += iosize; + } + } +out: + if (bp) { + brelse(bp); + } + + kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); + + /* Always sync, but report a copy failure in preference to the sync 
result. */ + if (error == 0) + error = VOP_FSYNC(vp, cred, MNT_WAIT, p); + else + (void) VOP_FSYNC(vp, cred, MNT_WAIT, p); + + return (error); +} + diff -urN xnu-344.49/bsd/hfs/hfs_search.c xnu-517/bsd/hfs/hfs_search.c --- xnu-344.49/bsd/hfs/hfs_search.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_search.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2002 Apple 
Computer, Inc. All rights reserved. + * Copyright (c) 1997-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -102,7 +102,7 @@ searchinfospec_t *searchInfo2, Boolean lookForDup ); -static int CheckAccess(ExtendedVCB *vcb, CatalogKey *key, struct proc *p); +static int CheckAccess(ExtendedVCB *vcb, u_long searchBits, CatalogKey *key, struct proc *p); static int InsertMatch(struct vnode *vp, struct uio *a_uio, CatalogRecord *rec, CatalogKey *key, struct attrlist *returnAttrList, @@ -161,6 +161,7 @@ }; */ +__private_extern__ int hfs_search( ap ) struct vop_searchfs_args *ap; /* @@ -198,6 +199,7 @@ BTScanState myBTScanState; void *user_start = NULL; int user_len; + int32_t searchTime; /* XXX Parameter check a_searchattrs? */ @@ -206,10 +208,32 @@ if (ap->a_options & ~SRCHFS_VALIDOPTIONSMASK) return (EINVAL); + /* SRCHFS_SKIPLINKS requires root access. + * This option cannot be used with either + * the ATTR_CMN_NAME or ATTR_CMN_PAROBJID + * attributes. + */ + if (ap->a_options & SRCHFS_SKIPLINKS) { + attrgroup_t attrs; + + attrs = ap->a_searchattrs->commonattr | ap->a_returnattrs->commonattr; + if (attrs & (ATTR_CMN_NAME | ATTR_CMN_PAROBJID)) + return (EINVAL); + if ((err = suser(p->p_ucred, &p->p_acflag))) + return (err); + } + if (ap->a_uio->uio_resid <= 0) return (EINVAL); isHFSPlus = (vcb->vcbSigWord == kHFSPlusSigWord); + + searchTime = kMaxMicroSecsInKernel; + if (ap->a_timelimit->tv_sec == 0 && + ap->a_timelimit->tv_usec > 0 && + ap->a_timelimit->tv_usec < kMaxMicroSecsInKernel) { + searchTime = ap->a_timelimit->tv_usec; + } /* UnPack the search boundries, searchInfo1, searchInfo2 */ err = UnpackSearchAttributeBlock(ap->a_vp, ap->a_searchattrs, @@ -256,6 +280,10 @@ /* Starting a new search. 
*/ /* Make sure the on-disk Catalog file is current */ (void) VOP_FSYNC(vcb->catalogRefNum, NOCRED, MNT_WAIT, p); + if (VTOHFS(ap->a_vp)->jnl) { + journal_flush(VTOHFS(ap->a_vp)->jnl); + } + ap->a_options &= ~SRCHFS_START; bzero( (caddr_t)myCatPositionPtr, sizeof( *myCatPositionPtr ) ); err = BTScanInitialize(catalogFCB, 0, 0, 0, kCatSearchBufferSize, &myBTScanState); @@ -289,7 +317,7 @@ if ( result == E_NONE ) { if (CheckCriteria(vcb, ap->a_options, ap->a_searchattrs, &rec, keyp, &searchInfo1, &searchInfo2, false) && - CheckAccess(vcb, keyp, ap->a_uio->uio_procp)) { + CheckAccess(vcb, ap->a_options, keyp, ap->a_uio->uio_procp)) { result = InsertMatch(ap->a_vp, ap->a_uio, &rec, keyp, ap->a_returnattrs, @@ -340,12 +368,12 @@ break; /* Resolve any hardlinks */ - if (isHFSPlus) + if (isHFSPlus && (ap->a_options & SRCHFS_SKIPLINKS) == 0) ResolveHardlink(vcb, (HFSPlusCatalogFile *) myCurrentDataPtr); if (CheckCriteria( vcb, ap->a_options, ap->a_searchattrs, myCurrentDataPtr, myCurrentKeyPtr, &searchInfo1, &searchInfo2, true ) - && CheckAccess(vcb, myCurrentKeyPtr, ap->a_uio->uio_procp)) { + && CheckAccess(vcb, ap->a_options, myCurrentKeyPtr, ap->a_uio->uio_procp)) { err = InsertMatch(ap->a_vp, ap->a_uio, myCurrentDataPtr, myCurrentKeyPtr, ap->a_returnattrs, attributesBuffer, variableBuffer, @@ -373,7 +401,7 @@ timersub(&myCurrentTime, &myBTScanState.startTime, &myElapsedTime); /* Note: assumes kMaxMicroSecsInKernel is less than 1,000,000 */ if (myElapsedTime.tv_sec > 0 - || myElapsedTime.tv_usec >= kMaxMicroSecsInKernel) { + || myElapsedTime.tv_usec >= searchTime) { timerExpired = true; } } @@ -418,7 +446,12 @@ && (SWAP_BE32(recp->userInfo.fdCreator) == kHFSPlusCreator) && ((to_bsd_time(recp->createDate) == vcb->vcbCrDate) || (to_bsd_time(recp->createDate) == VCBTOHFS(vcb)->hfs_metadata_createdate))) { + cnid_t saved_cnid; + + /* Export link's cnid (a unique value) instead of inode's cnid */ + saved_cnid = recp->fileID; (void) resolvelink(VCBTOHFS(vcb), 
recp->bsdInfo.special.iNodeNum, recp); + recp->fileID = saved_cnid; } } @@ -484,12 +517,130 @@ }
+
+static char *extension_table=NULL;
+static int nexts;
+static int max_ext_width;
+
+//
+// qsort comparator for extension table slots: orders entries by
+// string length, shortest first.
+//
+static int
+extension_cmp(void *a, void *b)
+{
+	return (strlen((char *)a) - strlen((char *)b));
+}
+
+
+//
+// This is the api LaunchServices uses to inform the kernel
+// the list of package extensions to ignore.
+//
+// "data" is a user-space array of "nentries" fixed-width slots,
+// each "maxwidth" bytes wide and holding one extension string.
+// The table is copied in, every slot is forced to be NUL
+// terminated, and the slots are sorted by extension length
+// (shortest first) before the table replaces the previous one.
+// Lookups in is_package_name() walk every slot, so the order
+// does not affect matching.
+//
+// Returns 0 on success, EINVAL for out-of-range arguments, or
+// the error from copyin().
+//
+__private_extern__ int
+set_package_extensions_table(void *data, int nentries, int maxwidth)
+{
+	char *new_exts;
+	int error, i;
+
+	if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) {
+		return EINVAL;
+	}
+
+	MALLOC(new_exts, char *, nentries * maxwidth, M_TEMP, M_WAITOK);
+
+	error = copyin(data, new_exts, nentries * maxwidth);
+	if (error) {
+		FREE(new_exts, M_TEMP);
+		return error;
+	}
+
+	// The slots come from user space, so do not trust them to be
+	// terminated: strlen() is run on every slot by the comparator
+	// and by is_package_name(), and an unterminated slot would
+	// make it read past the end of the allocation.
+	for (i = 0; i < nentries; i++) {
+		new_exts[(i * maxwidth) + (maxwidth - 1)] = '\0';
+	}
+
+	// Sort the private copy first so the table is never visible
+	// in a half-sorted state.
+	qsort(new_exts, nentries, maxwidth, extension_cmp);
+
+	if (extension_table) {
+		FREE(extension_table, M_TEMP);
+	}
+	extension_table = new_exts;
+	nexts = nentries;
+	max_ext_width = maxwidth;
+
+	return 0;
+}
+
+
+//
+// Returns 1 if the text after the last "." in "name" exactly
+// matches one of the registered package extensions, 0 otherwise.
+// Names of three characters or less never match.
+//
+static int
+is_package_name(char *name, int len)
+{
+	int i, extlen;
+	char *ptr, *name_ext;
+
+	if (len <= 3) {
+		return 0;
+	}
+
+	// find the last "." in the name
+	name_ext = NULL;
+	for(ptr=name; *ptr != '\0'; ptr++) {
+		if (*ptr == '.') {
+			name_ext = ptr;
+		}
+	}
+
+	// if there is no "." extension, it can't match
+	if (name_ext == NULL) {
+		return 0;
+	}
+
+	// advance over the "."
+	name_ext++;
+
+	// now iterate over all the extensions to see if any match
+	ptr = &extension_table[0];
+	for(i=0; i < nexts; i++, ptr+=max_ext_width) {
+		extlen = strlen(ptr);
+		if (strncmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
+			// aha, a match!
+			return 1;
+		}
+	}
+
+	// if we get here, no extension matched
+	return 0;
+}
+
+//
+// Determine if a name is "inappropriate" where the definition
+// of "inappropriate" is up to higher level execs. Currently
+// that's limited to /System.
+//
+static int
+is_inappropriate_name(char *name, int len)
+{
+	char *bad_names[] = { "System" };
+	int bad_len[] = { 6 };
+	int i;
+
+	for(i=0; i < sizeof(bad_names) / sizeof(bad_names[0]); i++) {
+		// compare lengths first so strcmp only runs on candidates
+		if (len == bad_len[i] && strcmp(name, bad_names[i]) == 0) {
+			return 1;
+		}
+	}
+
+	// if we get here, no name matched
+	return 0;
+}
+
+
+
/* * Check to see if caller has access rights to this item */ static int -CheckAccess(ExtendedVCB *theVCBPtr, CatalogKey *theKeyPtr, struct proc *theProcPtr) +CheckAccess(ExtendedVCB *theVCBPtr, u_long searchBits, CatalogKey *theKeyPtr, struct proc *theProcPtr) { Boolean isHFSPlus; int myErr; @@ -499,6 +650,8 @@ hfsmount_t * my_hfsmountPtr; struct cat_desc my_cat_desc; struct cat_attr my_cat_attr; + struct FndrDirInfo *finder_info; + myResult = 0; /* default to "no access" */ my_cat_desc.cd_nameptr = NULL; @@ -527,10 +680,34 @@ if ( myErr ) goto ExitThisRoutine; /* no access */ + if (searchBits & SRCHFS_SKIPPACKAGES) { + if (is_package_name(my_cat_desc.cd_nameptr, my_cat_desc.cd_namelen)) { + myResult = 0; + goto ExitThisRoutine; + } + } + + if (searchBits & SRCHFS_SKIPINAPPROPRIATE) { + if ( my_cat_desc.cd_parentcnid == kRootDirID + && is_inappropriate_name(my_cat_desc.cd_nameptr, my_cat_desc.cd_namelen)) { + myResult = 0; + goto ExitThisRoutine; + } + } + + finder_info = (struct FndrDirInfo *)&my_cat_attr.ca_finderinfo[0]; + if ( (searchBits & SRCHFS_SKIPINVISIBLE) + && (SWAP_BE16(finder_info->frFlags) & kIsInvisible)) { + +
myResult = 0; + goto ExitThisRoutine; + } + myNodeID = my_cat_desc.cd_parentcnid; /* move up the hierarchy */ myPerms = DerivePermissionSummary(my_cat_attr.ca_uid, my_cat_attr.ca_gid, my_cat_attr.ca_mode, my_hfsmountPtr->hfs_mp, theProcPtr->p_ucred, theProcPtr ); + cat_releasedesc( &my_cat_desc ); if ( (myPerms & X_OK) == 0 ) @@ -574,7 +751,29 @@ break; case kHFSFileRecord: + if ( (searchBits & SRCHFS_MATCHFILES) == 0 ) { /* If we are NOT searching files */ + matched = false; + goto TestDone; + } + break; + case kHFSPlusFileRecord: + /* Check if hardlink links should be skipped. */ + if (searchBits & SRCHFS_SKIPLINKS) { + cnid_t parid = key->hfsPlus.parentID; + HFSPlusCatalogFile *filep = (HFSPlusCatalogFile *)rec; + + if ((SWAP_BE32(filep->userInfo.fdType) == kHardLinkFileType) && + (SWAP_BE32(filep->userInfo.fdCreator) == kHFSPlusCreator)) { + return (false); /* skip over link records */ + } else if ((parid == VCBTOHFS(vcb)->hfs_privdir_desc.cd_cnid) && + (filep->bsdInfo.special.linkCount == 0)) { + return (false); /* skip over unlinked files */ + } + } else if (key->hfsPlus.parentID == VCBTOHFS(vcb)->hfs_privdir_desc.cd_cnid) { + return (false); /* skip over private files */ + } + if ( (searchBits & SRCHFS_MATCHFILES) == 0 ) { /* If we are NOT searching files */ matched = false; goto TestDone; @@ -636,6 +835,42 @@ /* Convert catalog record into cat_attr format. 
*/ cat_convertattr(VCBTOHFS(vcb), rec, &c_attr, &datafork, &rsrcfork); + if (searchBits & SRCHFS_SKIPINVISIBLE) { + int flags; + + switch (rec->recordType) { + case kHFSFolderRecord: + case kHFSPlusFolderRecord: { + struct FndrDirInfo *finder_info; + + finder_info = (struct FndrDirInfo *)&c_attr.ca_finderinfo[0]; + flags = SWAP_BE16(finder_info->frFlags); + break; + } + + case kHFSFileRecord: + case kHFSPlusFileRecord: { + struct FndrFileInfo *finder_info; + + finder_info = (struct FndrFileInfo *)&c_attr.ca_finderinfo[0]; + flags = SWAP_BE16(finder_info->fdFlags); + break; + } + + default: { + flags = kIsInvisible; + break; + } + } + + if (flags & kIsInvisible) { + matched = false; + goto TestDone; + } + } + + + /* Now that we have a record worth searching, see if it matches the search attributes */ if (rec->recordType == kHFSFileRecord || rec->recordType == kHFSPlusFileRecord) { @@ -862,7 +1097,7 @@ u_long packedBufferSize; ExtendedVCB *vcb = VTOVCB(root_vp); Boolean isHFSPlus = vcb->vcbSigWord == kHFSPlusSigWord; - u_long privateDir = VTOHFS(root_vp)->hfs_private_metadata_dir; + u_long privateDir = VTOHFS(root_vp)->hfs_privdir_desc.cd_cnid; struct attrblock attrblk; struct cat_desc c_desc = {0}; struct cat_attr c_attr = {0}; @@ -899,19 +1134,13 @@ c_desc.cd_parentcnid = key->hfs.parentID; } - /* hide open files that have been deleted */ - if ((privateDir != 0) && (c_desc.cd_parentcnid == privateDir)) { - err = 0; - goto exit; - } - attrblk.ab_attrlist = returnAttrList; attrblk.ab_attrbufpp = &rovingAttributesBuffer; attrblk.ab_varbufpp = &rovingVariableBuffer; attrblk.ab_flags = 0; attrblk.ab_blocksize = 0; - hfs_packattrblk(&attrblk, VTOHFS(root_vp), NULL, &c_desc, &c_attr, &datafork, &rsrcfork); + hfs_packattrblk(&attrblk, VTOHFS(root_vp), NULL, &c_desc, &c_attr, &datafork, &rsrcfork, a_uio->uio_procp); packedBufferSize = (char*)rovingVariableBuffer - (char*)attributesBuffer; @@ -1014,12 +1243,8 @@ ++((struct timespec *)attributeBuffer); } if ( a & 
ATTR_CMN_FNDRINFO ) { - bcopy( attributeBuffer, searchInfo->finderInfo, sizeof(u_long) * 8 ); - (u_long *)attributeBuffer += 8; - } - if ( a & ATTR_CMN_BKUPTIME ) { - searchInfo->lastBackupDate = *((struct timespec *)attributeBuffer); - ++((struct timespec *)attributeBuffer); + bcopy( attributeBuffer, searchInfo->finderInfo, sizeof(u_long) * 8 ); + (u_long *)attributeBuffer += 8; } if ( a & ATTR_CMN_OWNERID ) { searchInfo->uid = *((uid_t *)attributeBuffer); diff -urN xnu-344.49/bsd/hfs/hfs_vfsops.c xnu-517/bsd/hfs/hfs_vfsops.c --- xnu-344.49/bsd/hfs/hfs_vfsops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_vfsops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -74,11 +74,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include // XXXdbg #include @@ -118,11 +121,14 @@ struct hfs_mount_args *args)); static int hfs_statfs __P((struct mount *mp, register struct statfs *sbp, struct proc *p)); +static int hfs_flushfiles __P((struct mount *, int, struct proc *)); +static int hfs_extendfs __P((struct mount *, u_int64_t, struct proc *)); /* * Called by vfs_mountroot when mounting HFS Plus as root. 
*/ +__private_extern__ int hfs_mountroot() { @@ -146,9 +152,13 @@ } if ((error = hfs_mountfs(rootvp, mp, p, NULL))) { mp->mnt_vfc->vfc_refcount--; + + if (mp->mnt_kern_flag & MNTK_IO_XINFO) + FREE(mp->mnt_xinfo_ptr, M_TEMP); vfs_unbusy(mp, p); + vrele(rootvp); /* release the reference from bdevvp() */ - _FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); + FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); return (error); } simple_lock(&mountlist_slock); @@ -208,7 +218,8 @@ if (mp->mnt_flag & MNT_UPDATE) { hfsmp = VFSTOHFS(mp); - if ((hfsmp->hfs_fs_ronly == 0) && (mp->mnt_flag & MNT_RDONLY)) { + if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && + (mp->mnt_flag & MNT_RDONLY)) { /* use VFS_SYNC to push out System (btree) files */ retval = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p); @@ -221,7 +232,7 @@ if ((retval = hfs_flushfiles(mp, flags, p))) goto error_exit; - hfsmp->hfs_fs_ronly = 1; + hfsmp->hfs_flags |= HFS_READ_ONLY; retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); /* also get the volume bitmap blocks */ @@ -229,16 +240,30 @@ retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p); if (retval) { - hfsmp->hfs_fs_ronly = 0; + hfsmp->hfs_flags &= ~HFS_READ_ONLY; goto error_exit; } + + if (hfsmp->jnl) { + hfs_global_exclusive_lock_acquire(hfsmp); + + journal_close(hfsmp->jnl); + hfsmp->jnl = NULL; + + // Note: we explicitly don't want to shutdown + // access to the jvp because we may need + // it later if we go back to being read-write. + + hfs_global_exclusive_lock_release(hfsmp); + } } if ((mp->mnt_flag & MNT_RELOAD) && (retval = hfs_reload(mp, ndp->ni_cnd.cn_cred, p))) goto error_exit; - if (hfsmp->hfs_fs_ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { + if ((hfsmp->hfs_flags & HFS_READ_ONLY) && + (mp->mnt_kern_flag & MNTK_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. 
@@ -257,16 +282,61 @@ if (retval != E_NONE) goto error_exit; - /* only change hfs_fs_ronly after a successfull write */ - hfsmp->hfs_fs_ronly = 0; + // If the journal was shut-down previously because we were + // asked to be read-only, let's start it back up again now + + if ( (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) + && hfsmp->jnl == NULL + && hfsmp->jvp != NULL) { + int flags; + + if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) { + flags = JOURNAL_RESET; + } else { + flags = 0; + } + + hfs_global_exclusive_lock_acquire(hfsmp); + + hfsmp->jnl = journal_open(hfsmp->jvp, + (hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, + hfsmp->jnl_size, + hfsmp->hfs_devvp, + hfsmp->hfs_phys_block_size, + flags, + 0, + hfs_sync_metadata, hfsmp->hfs_mp); + + hfs_global_exclusive_lock_release(hfsmp); + + if (hfsmp->jnl == NULL) { + retval = EINVAL; + goto error_exit; + } else { + hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET; + } + + } + + /* Only clear HFS_READ_ONLY after a successfull write */ + hfsmp->hfs_flags &= ~HFS_READ_ONLY; } - if ((hfsmp->hfs_fs_ronly == 0) && + if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)) { /* setup private/hidden directory for unlinked files */ - hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(HFSTOVCB(hfsmp)); + FindMetaDataDirectory(HFSTOVCB(hfsmp)); if (hfsmp->jnl) hfs_remove_orphans(hfsmp); + + /* + * Allow hot file clustering if conditions allow. 
+ */ + if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && + (mp->mnt_flag & MNT_RDONLY) && + (mp->mnt_kern_flag & MNTK_WANTRDWR)) { + (void) hfs_recording_init(hfsmp, p); + } } if (args.fspec == 0) { @@ -374,15 +444,22 @@ hfsmp = VFSTOHFS(mp); vcb = HFSTOVCB(hfsmp); - permswitch = (((hfsmp->hfs_unknownpermissions != 0) && ((mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) == 0)) || - ((hfsmp->hfs_unknownpermissions == 0) && ((mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) != 0))); + permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) && + ((mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) == 0)) || + (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) && + (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS))); + /* The root filesystem must operate with actual permissions: */ if (permswitch && (mp->mnt_flag & MNT_ROOTFS) && (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS)) { mp->mnt_flag &= ~MNT_UNKNOWNPERMISSIONS; /* Just say "No". */ return EINVAL; - }; - hfsmp->hfs_unknownpermissions = ((mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) != 0); - namefix = permfix = 0; + } + if (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) + hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS; + else + hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS; + + namefix = permfix = 0; /* Change the timezone (Note: this affects all hfs volumes and hfs+ volume create dates) */ if (args->hfs_timezone.tz_minuteswest != VNOVAL) { @@ -413,7 +490,7 @@ /* Change the hfs encoding value (hfs only) */ if ((HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) && - (hfsmp->hfs_encoding != (u_long)VNOVAL) && + (args->hfs_encoding != (u_long)VNOVAL) && (hfsmp->hfs_encoding != args->hfs_encoding)) { retval = hfs_getconverter(args->hfs_encoding, &get_unicode_func, &get_hfsname_func); @@ -482,6 +559,7 @@ continue; } + /* Get the real uid/gid and perm mask from disk. 
*/ if (permswitch || permfix) { cp->c_uid = cnattr.ca_uid; cp->c_gid = cnattr.ca_gid; @@ -614,7 +692,6 @@ return (error); } - /* update cnode's catalog descriptor */ (void) replace_desc(cp, &desc); } @@ -640,8 +717,10 @@ vhp = (HFSPlusVolumeHeader *) (bp->b_data + HFS_PRI_OFFSET(sectorsize)); /* Do a quick sanity check */ - if (SWAP_BE16(vhp->signature) != kHFSPlusSigWord || - SWAP_BE16(vhp->version) != kHFSPlusVersion || + if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord && + SWAP_BE16(vhp->signature) != kHFSXSigWord) || + (SWAP_BE16(vhp->version) != kHFSPlusVersion && + SWAP_BE16(vhp->version) != kHFSXVersion) || SWAP_BE32(vhp->blockSize) != vcb->blockSize) { brelse(bp); return (EIO); @@ -723,8 +802,11 @@ cat_releasedesc(&cndesc); /* Re-establish private/hidden directory for unlinked files */ - hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(vcb); + FindMetaDataDirectory(vcb); + /* In case any volume information changed to trigger a notification */ + hfs_generate_volume_notifications(hfsmp); + return (0); } @@ -924,8 +1006,6 @@ MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK); bzero(hfsmp, sizeof(struct hfsmount)); - - simple_lock_init(&hfsmp->hfs_renamelock); /* * Init the volume information structure @@ -937,9 +1017,11 @@ hfsmp->hfs_devvp = devvp; hfsmp->hfs_phys_block_size = blksize; hfsmp->hfs_phys_block_count = blkcnt; - hfsmp->hfs_media_writeable = 1; - hfsmp->hfs_fs_ronly = ronly; - hfsmp->hfs_unknownpermissions = ((mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) != 0); + hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; + if (ronly) + hfsmp->hfs_flags |= HFS_READ_ONLY; + if (mp->mnt_flag & MNT_UNKNOWNPERMISSIONS) + hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS; for (i = 0; i < MAXQUOTAS; i++) hfsmp->hfs_qfiles[i].qf_vp = NULLVP; @@ -974,9 +1056,9 @@ /* Find out if disk media is writable. 
*/ if (VOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, cred, p) == 0) { if (iswritable) - hfsmp->hfs_media_writeable = 1; + hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; else - hfsmp->hfs_media_writeable = 0; + hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA; } /* Mount a standard HFS disk */ @@ -1104,8 +1186,43 @@ if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) { mp->mnt_flag |= MNT_JOURNALED; } else { - retval = EINVAL; - goto error_exit; + // if the journal failed to open, then set the lastMountedVersion + // to be "FSK!" which fsck_hfs will see and force the fsck instead + // of just bailing out because the volume is journaled. + if (ronly != 0 || devvp == rootvp) { + HFSPlusVolumeHeader *vhp; + + hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; + + if (mdb_offset == 0) { + mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize); + } + + bp = NULL; + retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp); + if (retval == 0) { + vhp = (HFSPlusVolumeHeader *)(bp->b_data + HFS_PRI_OFFSET(blksize)); + + if (SWAP_BE16(vhp->signature) == kHFSPlusSigWord || SWAP_BE16(vhp->signature) == kHFSXSigWord) { + vhp->lastMountedVersion = SWAP_BE32('FSK!'); + bwrite(bp); + } else { + brelse(bp); + } + bp = NULL; + } else if (bp) { + brelse(bp); + } + } + + // if this isn't the root device just bail out. + // if it is the root device we just continue on + // in the hopes that fsck_hfs will be able to + // fix any damage that exists on the volume. 
+ if (devvp != rootvp) { + retval = EINVAL; + goto error_exit; + } } } // XXXdbg @@ -1134,6 +1251,15 @@ hfsmp->hfs_phys_block_count *= hfsmp->hfs_phys_block_size / blksize; hfsmp->hfs_phys_block_size = blksize; + if (hfsmp->jnl) { + // close and re-open this with the new block size + journal_close(hfsmp->jnl); + hfsmp->jnl = NULL; + if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) { + mp->mnt_flag |= MNT_JOURNALED; + } + } + /* Try again with a smaller block size... */ retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args); } @@ -1150,6 +1276,45 @@ mp->mnt_maxsymlinklen = 0; devvp->v_specflags |= SI_MOUNTEDON; + if (args) { + /* + * Set the free space warning levels for a non-root volume: + * + * Set the lower freespace limit (the level that will trigger a warning) + * to 5% of the volume size or 250MB, whichever is less, and the desired + * level (which will cancel the alert request) to 1/2 above that limit. + * Start looking for free space to drop below this level and generate a + * warning immediately if needed: + */ + hfsmp->hfs_freespace_notify_warninglimit = + MIN(HFS_LOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKTRIGGERFRACTION); + hfsmp->hfs_freespace_notify_desiredlevel = + MIN(HFS_LOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKSHUTOFFFRACTION); + } else { + /* + * Set the free space warning levels for the root volume: + * + * Set the lower freespace limit (the level that will trigger a warning) + * to 1% of the volume size or 50MB, whichever is less, and the desired + * level (which will cancel the alert request) to 2% or 75MB, whichever is less. 
+ */ + hfsmp->hfs_freespace_notify_warninglimit = + MIN(HFS_ROOTLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKTRIGGERFRACTION); + hfsmp->hfs_freespace_notify_desiredlevel = + MIN(HFS_ROOTLOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKSHUTOFFFRACTION); + }; + + /* + * Start looking for free space to drop below this level and generate a + * warning immediately if needed: + */ + hfsmp->hfs_notification_conditions = 0; + hfs_generate_volume_notifications(hfsmp); + if (ronly == 0) { (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); } @@ -1214,13 +1379,16 @@ if ((retval = hfs_flushfiles(mp, flags, p)) && !force) return (retval); + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) + (void) hfs_recording_suspend(hfsmp, p); + /* * Flush out the b-trees, volume bitmap and Volume Header */ - if (hfsmp->hfs_fs_ronly == 0) { + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { hfs_global_shared_lock_acquire(hfsmp); grabbed_lock = 1; - if (hfsmp->jnl) { + if (hfsmp->jnl) { journal_start_transaction(hfsmp->jnl); started_tr = 1; } @@ -1242,18 +1410,27 @@ } } + if (hfsmp->hfc_filevp && (hfsmp->hfc_filevp->v_flag & VSYSTEM)) { + retval = VOP_FSYNC(hfsmp->hfc_filevp, NOCRED, MNT_WAIT, p); + if (retval && !force) + goto err_exit; + } + if (retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p)) { if (!force) goto err_exit; } - + +#if 0 /* See if this volume is damaged, is so do not unmount cleanly */ if (HFSTOVCB(hfsmp)->vcbFlags & kHFS_DamagedVolume) { HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask; } else { HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask; } - +#else + HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask; +#endif retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); if (retval) { HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask; @@ -1280,26 +1457,45 @@ */ (void) hfsUnmount(hfsmp, p); + /* + * Last chance to dump unreferenced system files. 
+ */ + (void) vflush(mp, NULLVP, FORCECLOSE); + if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) (void) hfs_relconverter(hfsmp->hfs_encoding); // XXXdbg if (hfsmp->jnl) { journal_close(hfsmp->jnl); + hfsmp->jnl = NULL; } if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { - retval = VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, + retval = VOP_CLOSE(hfsmp->jvp, + hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, NOCRED, p); vrele(hfsmp->jvp); - hfsmp->jvp = NULL; + hfsmp->jvp = NULL; } // XXXdbg +#ifdef HFS_SPARSE_DEV + /* Drop our reference on the backing fs (if any). */ + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) { + struct vnode * tmpvp; + + hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; + tmpvp = hfsmp->hfs_backingfs_rootvp; + hfsmp->hfs_backingfs_rootvp = NULLVP; + vrele(tmpvp); + } +#endif /* HFS_SPARSE_DEV */ + hfsmp->hfs_devvp->v_specflags &= ~SI_MOUNTEDON; retval = VOP_CLOSE(hfsmp->hfs_devvp, - hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, - NOCRED, p); + hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, + NOCRED, p); if (retval && !force) return(retval); @@ -1551,9 +1747,8 @@ return (0); hfsmp = VFSTOHFS(mp); - if (hfsmp->hfs_fs_ronly != 0) { - panic("update: rofs mod"); - }; + if (hfsmp->hfs_flags & HFS_READ_ONLY) + return (EROFS); #if 0 // XXXdbg first go through and flush out any modified @@ -1590,12 +1785,7 @@ // restart our whole search if this guy is locked // or being reclaimed. - // XXXdbg - at some point this should go away or we - // need to change all file systems to have - // this same code. vget() should never return - // success if either of these conditions is - // true. - if (vp->v_tag != VT_HFS || cp == NULL) { + if (vp->v_tag != VT_HFS || cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) { simple_unlock(&vp->v_interlock); continue; } @@ -1619,9 +1809,15 @@ } didhold = ubc_hold(vp); + + // mark the cnode so that fsync won't flush + // the journal since we're going to do that... 
+ cp->c_flag |= C_FROMSYNC; if ((error = VOP_FSYNC(vp, cred, waitfor, p))) { allerror = error; }; + cp->c_flag &= ~C_FROMSYNC; + VOP_UNLOCK(vp, 0, p); if (didhold) ubc_rele(vp); @@ -1675,6 +1871,8 @@ #if QUOTA hfs_qsync(mp); #endif /* QUOTA */ + + hfs_hotfilesync(hfsmp, p); /* * Write back modified superblock. */ @@ -1731,7 +1929,7 @@ * Get the export permission structure for this tuple. */ np = vfs_export_lookup(mp, &VFSTOHFS(mp)->hfs_export, nam); - if (np == NULL) { + if (nam && (np == NULL)) { return EACCES; }; @@ -1755,9 +1953,23 @@ return (ESTALE); }; + if (VNAME(nvp) == NULL) { + struct cnode *cp = VTOC(nvp); + + if (nvp == cp->c_rsrc_vp) { + // the +1/-2 thing is to skip the leading "/" on the rsrc fork spec + // and to not count the trailing null byte at the end of the string. + VNAME(nvp) = add_name(_PATH_RSRCFORKSPEC+1, sizeof(_PATH_RSRCFORKSPEC)-2, 0, 0); + } else { + VNAME(nvp) = add_name(cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen, 0, 0); + } + } + *vpp = nvp; - *exflagsp = np->netc_exflags; - *credanonp = &np->netc_anon; + if (np) { + *exflagsp = np->netc_exflags; + *credanonp = &np->netc_anon; + } return (0); } @@ -1782,7 +1994,7 @@ hfsfhp = (struct hfsfid *)fhp; hfsfhp->hfsfid_len = sizeof(struct hfsfid); hfsfhp->hfsfid_pad = 0; - hfsfhp->hfsfid_cnid = cp->c_cnid; + hfsfhp->hfsfid_cnid = cp->c_fileid; hfsfhp->hfsfid_gen = cp->c_itime; return (0); @@ -1807,6 +2019,8 @@ dqinit(); #endif /* QUOTA */ + BTReserveSetup(); + /* * Allocate Catalog Iterator cache... 
*/ @@ -1815,6 +2029,31 @@ return (0); } +static int +hfs_getmountpoint(vp, hfsmpp) + struct vnode *vp; + struct hfsmount **hfsmpp; +{ + struct hfsmount * hfsmp; + + if (vp == NULL) + return (EINVAL); + + if ((vp->v_flag & VROOT) == 0) + return (EINVAL); + + if (strcmp(vp->v_mount->mnt_stat.f_fstypename, "hfs") != 0) + return (EINVAL); + + hfsmp = VTOHFS(vp); + + if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) + return (EINVAL); + + *hfsmpp = hfsmp; + + return (0); +} // XXXdbg #include @@ -1833,17 +2072,68 @@ size_t newlen; struct proc *p; { - extern u_int32_t hfs_encodingbias; + extern u_int32_t hfs_getencodingbias(void); + extern void hfs_setencodingbias(u_int32_t); + + int error; + struct sysctl_req *req; + struct vfsidctl vc; + struct mount *mp; + struct hfsmount *hfsmp; + struct vfsquery vq; /* all sysctl names at this level are terminal */ - if (name[0] == HFS_ENCODINGBIAS) - return (sysctl_int(oldp, oldlenp, newp, newlen, - &hfs_encodingbias)); - else if (name[0] == 0x082969) { + if (name[0] == HFS_ENCODINGBIAS) { + u_int32_t bias; + + bias = hfs_getencodingbias(); + error = sysctl_int(oldp, oldlenp, newp, newlen, &bias); + if (error == 0 && newp) + hfs_setencodingbias(bias); + return (error); + + } else if (name[0] == HFS_EXTEND_FS) { + u_int64_t newsize; + + if (newp == NULL) + return (EINVAL); + if ((error = hfs_getmountpoint(p->p_fd->fd_cdir, &hfsmp))) + return (error); + error = sysctl_quad(oldp, oldlenp, newp, newlen, &newsize); + if (error) + return (error); + + error = hfs_extendfs(HFSTOVFS(hfsmp), newsize, p); + return (error); + + } else if (name[0] == HFS_ENCODINGHINT) { + size_t bufsize; + size_t bytes; + u_int32_t hint; + u_int16_t *unicode_name; + char *filename; + + bufsize = MAX(newlen * 3, MAXPATHLEN); + MALLOC(filename, char *, newlen, M_TEMP, M_WAITOK); + MALLOC(unicode_name, u_int16_t *, bufsize, M_TEMP, M_WAITOK); + + error = copyin(newp, (caddr_t)filename, newlen); + if (error == 0) { + error = utf8_decodestr(filename, newlen - 1, 
unicode_name, + &bytes, bufsize, 0, UTF_DECOMPOSED); + if (error == 0) { + hint = hfs_pickencoding(unicode_name, bytes / 2); + error = sysctl_int(oldp, oldlenp, NULL, NULL, &hint); + } + } + FREE(unicode_name, M_TEMP); + FREE(filename, M_TEMP); + return (error); + + } else if (name[0] == HFS_ENABLE_JOURNALING) { // make the file system journaled... struct vnode *vp = p->p_fd->fd_cdir, *jvp; - struct hfsmount *hfsmp; ExtendedVCB *vcb; int retval; struct cat_attr jnl_attr, jinfo_attr; @@ -1851,11 +2141,12 @@ void *jnl = NULL; /* Only root can enable journaling */ - if (current_proc()->p_ucred->cr_uid != 0) { + if (current_proc()->p_ucred->cr_uid != 0) { return (EPERM); } + hfsmp = VTOHFS(vp); - if (hfsmp->hfs_fs_ronly) { + if (hfsmp->hfs_flags & HFS_READ_ONLY) { return EROFS; } if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) { @@ -1893,7 +2184,7 @@ jnl = journal_create(jvp, (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, - (off_t)name[3], + (off_t)((unsigned)name[3]), hfsmp->hfs_devvp, hfsmp->hfs_phys_block_size, 0, @@ -1903,7 +2194,7 @@ if (jnl == NULL) { printf("hfs: FAILED to create the journal!\n"); if (jvp && jvp != hfsmp->hfs_devvp) { - VOP_CLOSE(jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + VOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? 
FREAD : FREAD|FWRITE, FSCRED, p); } jvp = NULL; @@ -1919,6 +2210,7 @@ // save this off for the hack-y check in hfs_remove() hfsmp->jnl_start = (u_int32_t)name[2]; + hfsmp->jnl_size = (off_t)((unsigned)name[3]); hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid; hfsmp->hfs_jnlfileid = jnl_attr.ca_fileid; @@ -1928,21 +2220,18 @@ hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); return 0; - } else if (name[0] == 0x031272) { + } else if (name[0] == HFS_DISABLE_JOURNALING) { // clear the journaling bit struct vnode *vp = p->p_fd->fd_cdir; - struct hfsmount *hfsmp; void *jnl; int retval; /* Only root can disable journaling */ - if (current_proc()->p_ucred->cr_uid != 0) { + if (current_proc()->p_ucred->cr_uid != 0) { return (EPERM); } + hfsmp = VTOHFS(vp); - if (hfsmp->jnl == NULL) { - return EINVAL; - } printf("hfs: disabling journaling for mount @ 0x%x\n", vp->v_mount); @@ -1955,7 +2244,7 @@ journal_close(jnl); if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { - VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? 
FREAD : FREAD|FWRITE, FSCRED, p); } hfsmp->jnl = NULL; hfsmp->jvp = NULL; @@ -1970,7 +2259,45 @@ hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); return 0; - } + } else if (name[0] == HFS_GET_JOURNAL_INFO) { + struct vnode *vp = p->p_fd->fd_cdir; + off_t jnl_start, jnl_size; + + hfsmp = VTOHFS(vp); + if (hfsmp->jnl == NULL) { + jnl_start = 0; + jnl_size = 0; + } else { + jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset; + jnl_size = (off_t)hfsmp->jnl_size; + } + + if ((error = copyout((caddr_t)&jnl_start, (void *)name[1], sizeof(off_t))) != 0) { + return error; + } + if ((error = copyout((caddr_t)&jnl_size, (void *)name[2], sizeof(off_t))) != 0) { + return error; + } + + return 0; + } else if (name[0] == HFS_SET_PKG_EXTENSIONS) { + + return set_package_extensions_table((void *)name[1], name[2], name[3]); + + } else if (name[0] == VFS_CTL_QUERY) { + req = oldp; /* we're new style vfs sysctl. */ + + error = SYSCTL_IN(req, &vc, sizeof(vc)); + if (error) return (error); + + mp = vfs_getvfs(&vc.vc_fsid); + if (mp == NULL) return (ENOENT); + + hfsmp = VFSTOHFS(mp); + bzero(&vq, sizeof(vq)); + vq.vq_flags = hfsmp->hfs_notification_conditions; + return SYSCTL_OUT(req, &vq, sizeof(vq));; + }; return (EOPNOTSUPP); } @@ -1999,36 +2326,149 @@ } /* + * Check to see if a given vnode is only referenced for events: + * [ entered with vp->v_interlock locked ] + */ +static int +hfs_evtonly(struct vnode *vp) +{ + int ubc_refcount; + + ubc_refcount = UBCINFOEXISTS(vp) ? 
1 : 0; + return (vp->v_usecount == (ubc_refcount + EVTONLYREFS(vp))); +} + +/* + * Check to see if all non-system vnodes for a given mountpoint are events-only + */ +static int +hfs_flush_evtonly(struct mount *mp, int flags, int dispose, struct proc *p) +{ + struct vnode *vp, *nvp; + int busy = 0; + + simple_lock(&mntvnode_slock); +loop: + for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { + if (vp->v_mount != mp) goto loop; + nvp = vp->v_mntvnodes.le_next; + + simple_lock(&vp->v_interlock); + /* + * Skip over a vnodes marked VSYSTEM or VNOFLUSH. + */ + if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || (vp->v_flag & VNOFLUSH))) { + simple_unlock(&vp->v_interlock); + continue; + }; + /* + * Skip over a vnodes marked VSWAP. + */ + if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) { + simple_unlock(&vp->v_interlock); + continue; + } + if (hfs_evtonly(vp)) { + if (dispose) { + /* "dispose" implies "forcibly", a la "FORCECLOSE": */ + simple_unlock(&mntvnode_slock); + vgonel(vp, p); + simple_lock(&mntvnode_slock); + } else { + simple_unlock(&vp->v_interlock); + }; + continue; + }; + + simple_unlock(&vp->v_interlock); + ++busy; + /* If asked to dispose, keep trying. If only checking, the answer is now known. */ + if (dispose) { + continue; + } else { + break; + }; + } + simple_unlock(&mntvnode_slock); + + return (busy == 0); +} + +/* * Flush out all the files in a filesystem. */ -int +static int hfs_flushfiles(struct mount *mp, int flags, struct proc *p) { - register struct hfsmount *hfsmp; + struct hfsmount *hfsmp; + struct vnode *skipvp = NULLVP; + struct vnode *rsrcvp; + int quotafilecnt; int i; int error; -#if QUOTA hfsmp = VFSTOHFS(mp); +#if QUOTA + /* + * The open quota files have an indirect reference on + * the root directory vnode. We must account for this + * extra reference when doing the intial vflush. + */ + quotafilecnt = 0; + if (mp->mnt_flag & MNT_QUOTA) { + + /* Find out how many quota files we have open. 
*/ + for (i = 0; i < MAXQUOTAS; i++) { + if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP) + ++quotafilecnt; + } + + /* Obtain the root vnode so we can skip over it. */ + if (hfs_chashget(hfsmp->hfs_raw_dev, kRootDirID, 0, + &skipvp, &rsrcvp) == NULL) { + skipvp = NULLVP; + } + } +#endif /* QUOTA */ + + error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags); + /* + * If the vflush() call failed solely because there are + * some event-only vnodes in the list, then forcibly get + * rid of those vnodes before the final vflush() pass. + */ + if ((error == EBUSY) && hfs_flush_evtonly(mp, SKIPSYSTEM | SKIPSWAP, 0, p)) { + (void) hfs_flush_evtonly(mp, SKIPSYSTEM | SKIPSWAP, 1, p); + }; + error = vflush(mp, skipvp, SKIPSYSTEM | flags); + +#if QUOTA if (mp->mnt_flag & MNT_QUOTA) { - if (error = vflush(mp, NULLVP, SKIPSYSTEM|flags)) + if (skipvp) { + /* + * See if there are additional references on the + * root vp besides the ones obtained from the open + * quota files and the hfs_chashget call above. + */ + if ((error == 0) && + (skipvp->v_usecount > (1 + quotafilecnt))) { + error = EBUSY; /* root directory is still open */ + } + vput(skipvp); + } + if (error && (flags & FORCECLOSE) == 0) return (error); + for (i = 0; i < MAXQUOTAS; i++) { if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP) continue; hfs_quotaoff(p, mp, i); } - /* - * Here we fall through to vflush again to ensure - * that we have gotten rid of all the system vnodes. - */ + error = vflush(mp, NULLVP, SKIPSYSTEM | flags); } #endif /* QUOTA */ - error = vflush(mp, NULLVP, (SKIPSYSTEM | SKIPSWAP | flags)); - error = vflush(mp, NULLVP, (SKIPSYSTEM | flags)); - return (error); } @@ -2056,8 +2496,8 @@ break; } - if (index < 128) { - HFSTOVCB(hfsmp)->encodingsBitmap |= (1 << index); + if (index < 64) { + HFSTOVCB(hfsmp)->encodingsBitmap |= (u_int64_t)(1ULL << index); HFSTOVCB(hfsmp)->vcbFlags |= 0xFF00; } } @@ -2209,7 +2649,14 @@ return (retval); } - +/* + * Flush any dirty in-memory mount data to the on-disk + * volume header. 
+ * + * Note: the on-disk volume signature is intentionally + * not flushed since the on-disk "H+" and "HX" signatures + * are always stored in-memory as "H+". + */ __private_extern__ int hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) @@ -2223,7 +2670,12 @@ int sectorsize; int priIDSector; int critical = 0; + u_int16_t signature; + u_int16_t version; + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return(0); + } if (vcb->vcbSigWord == kHFSSigWord) return hfs_flushMDB(hfsmp, waitfor, altflush); @@ -2252,6 +2704,7 @@ } hfs_global_shared_lock_release(hfsmp); + printf("HFS: err %d reading VH blk (%s)\n", retval, vcb->vcbVN); return (retval); } @@ -2262,6 +2715,24 @@ volumeHeader = (HFSPlusVolumeHeader *)((char *)bp->b_data + HFS_PRI_OFFSET(sectorsize)); /* + * Sanity check what we just read. + */ + signature = SWAP_BE16 (volumeHeader->signature); + version = SWAP_BE16 (volumeHeader->version); + if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) || + (version < kHFSPlusVersion) || (version > 100) || + (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) { +#if 1 + panic("HFS: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d", + vcb->vcbVN, signature, version, + SWAP_BE32 (volumeHeader->blockSize)); +#endif + printf("HFS: corrupt VH blk (%s)\n", vcb->vcbVN); + brelse(bp); + return (EIO); + } + + /* * For embedded HFS+ volumes, update create date if it changed * (ie from a setattrlist call) */ @@ -2303,28 +2774,6 @@ } } -// XXXdbg - only monkey around with the volume signature on non-root volumes -// -#if 0 - if (hfsmp->jnl && - hfsmp->hfs_fs_ronly == 0 && - (HFSTOVFS(hfsmp)->mnt_flag & MNT_ROOTFS) == 0) { - - int old_sig = volumeHeader->signature; - - if (vcb->vcbAtrb & kHFSVolumeUnmountedMask) { - volumeHeader->signature = kHFSPlusSigWord; - } else { - volumeHeader->signature = kHFSJSigWord; - } - - if (old_sig != volumeHeader->signature) { - altflush = 1; - } - } -#endif -// XXXdbg - /* Note: only update the lower 16 bits worth of 
attributes */ volumeHeader->attributes = SWAP_BE32 ((SWAP_BE32 (volumeHeader->attributes) & 0xFFFF0000) + (UInt16) vcb->vcbAtrb); volumeHeader->journalInfoBlock = SWAP_BE32(vcb->vcbJinfoBlock); @@ -2433,6 +2882,251 @@ vcb->vcbFlags &= 0x00FF; return (retval); +} + + +/* + * Extend a file system. + */ +static int +hfs_extendfs(struct mount *mp, u_int64_t newsize, struct proc *p) +{ + struct vnode *vp; + struct vnode *devvp; + struct buf *bp; + struct hfsmount *hfsmp; + struct filefork *fp = NULL; + ExtendedVCB *vcb; + struct cat_fork forkdata; + u_int64_t oldsize; + u_int64_t newblkcnt; + u_int32_t addblks; + u_int64_t sectorcnt; + u_int32_t sectorsize; + daddr_t prev_alt_sector; + daddr_t bitmapblks; + int error; + + hfsmp = VFSTOHFS(mp); + devvp = hfsmp->hfs_devvp; + vcb = HFSTOVCB(hfsmp); + + /* + * - HFS Plus file systems only. + * - Journaling must be enabled. + * - No embedded volumes. + */ + if ((vcb->vcbSigWord == kHFSSigWord) || + (hfsmp->jnl == NULL) || + (vcb->hfsPlusIOPosOffset != 0)) { + return (EPERM); + } + /* + * If extending file system by non-root, then verify + * ownership and check permissions. 
+ */ + if (p->p_ucred->cr_uid != 0) { + error = hfs_root(mp, &vp); + if (error) + return (error); + error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, p->p_ucred, p, 0); + if (error == 0) { + error = hfs_write_access(vp, p->p_ucred, p, false); + } + vput(vp); + if (error) + return (error); + + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_ACCESS(devvp, VREAD | VWRITE, p->p_ucred, p); + VOP_UNLOCK(devvp, 0, p); + if (error) + return (error); + } + if (VOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sectorsize, 0, FSCRED, p)) { + return (ENXIO); + } + if (sectorsize != hfsmp->hfs_phys_block_size) { + return (ENXIO); + } + if (VOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sectorcnt, 0, FSCRED, p)) { + return (ENXIO); + } + if ((sectorsize * sectorcnt) < newsize) { + printf("hfs_extendfs: not enough space on device\n"); + return (ENOSPC); + } + oldsize = (u_int64_t)hfsmp->hfs_phys_block_count * + (u_int64_t)hfsmp->hfs_phys_block_size; + + /* + * Validate new size. + */ + if ((newsize <= oldsize) || (newsize % vcb->blockSize)) { + printf("hfs_extendfs: invalid size\n"); + return (EINVAL); + } + newblkcnt = newsize / vcb->blockSize; + if (newblkcnt > (u_int64_t)0xFFFFFFFF) + return (EOVERFLOW); + + addblks = newblkcnt - vcb->totalBlocks; + + printf("hfs_extendfs: growing %s by %d blocks\n", vcb->vcbVN, addblks); + /* + * Enclose changes inside a transaction. + */ + hfs_global_shared_lock_acquire(hfsmp); + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return (EINVAL); + } + + /* + * Remember the location of existing alternate VH. + */ + prev_alt_sector = (vcb->hfsPlusIOPosOffset / sectorsize) + + HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); + + vp = vcb->allocationsRefNum; + error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (error) { + goto out2; + } + fp = VTOF(vp); + bcopy(&fp->ff_data, &forkdata, sizeof(forkdata)); + + /* + * Calculate additional space required (if any) by allocation bitmap. 
+ */ + bitmapblks = roundup(newblkcnt / 8, vcb->vcbVBMIOSize) / vcb->blockSize; + if (bitmapblks > fp->ff_blocks) + bitmapblks -= fp->ff_blocks; + else + bitmapblks = 0; + + if (bitmapblks > 0) { + daddr_t blkno; + daddr_t blkcnt; + + /* + * Add a new extent to the allocation bitmap file. + */ + error = AddFileExtent(vcb, fp, vcb->totalBlocks, bitmapblks); + if (error) { + printf("hfs_extendfs: error %d adding extents\n", error); + goto out; + } + blkcnt = bitmapblks; + blkno = fp->ff_blocks; + fp->ff_blocks += bitmapblks; + fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; + VTOC(vp)->c_blocks = fp->ff_blocks; + /* + * Zero out the new bitmap blocks. + */ + { + + bp = NULL; + while (blkcnt > 0) { + error = meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp); + if (error) { + if (bp) { + brelse(bp); + } + break; + } + bzero((char *)bp->b_data, vcb->blockSize); + bp->b_flags |= B_AGE; + error = bwrite(bp); + if (error) + break; + --blkcnt; + ++blkno; + } + } + if (error) { + printf("hfs_extendfs: error %d clearing blocks\n", error); + goto out; + } + /* + * Mark the new bitmap space as allocated. + */ + error = BlockMarkAllocated(vcb, vcb->totalBlocks, bitmapblks); + if (error) { + printf("hfs_extendfs: error %d setting bitmap\n", error); + goto out; + } + } + /* + * Mark the new alternate VH as allocated. + */ + if (vcb->blockSize == 512) + error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 2, 2); + else + error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 1, 1); + if (error) { + printf("hfs_extendfs: error %d setting bitmap (VH)\n", error); + goto out; + } + /* + * Mark the old alternate VH as free. + */ + if (vcb->blockSize == 512) + (void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2); + else + (void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1); + + /* + * Adjust file system variables for new space. 
+ */ + vcb->totalBlocks += addblks; + vcb->freeBlocks += addblks - bitmapblks; + hfsmp->hfs_phys_block_count = newsize / sectorsize; + + MarkVCBDirty(vcb); + error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); + if (error) { + printf("hfs_extendfs: couldn't flush volume headers (%d)", error); + /* + * Restore to old state. + */ + fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; + vcb->totalBlocks -= addblks; + vcb->freeBlocks -= addblks - bitmapblks; + hfsmp->hfs_phys_block_count = oldsize / sectorsize; + MarkVCBDirty(vcb); + if (vcb->blockSize == 512) + (void) BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2); + else + (void) BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1); + goto out; + } + /* + * Invalidate the old alternate volume header. + */ + bp = NULL; + if (meta_bread(hfsmp->hfs_devvp, prev_alt_sector, sectorsize, + NOCRED, &bp) == 0) { + journal_modify_block_start(hfsmp->jnl, bp); + bzero(bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize); + journal_modify_block_end(hfsmp->jnl, bp); + } else if (bp) { + brelse(bp); + } +out: + if (error && fp) { + /* Restore allocation fork. */ + bcopy(&forkdata, &fp->ff_data, sizeof(forkdata)); + VTOC(vp)->c_blocks = fp->ff_blocks; + + } + VOP_UNLOCK(vp, 0, p); +out2: + journal_end_transaction(hfsmp->jnl); + hfs_global_shared_lock_release(hfsmp); + + return (error); } diff -urN xnu-344.49/bsd/hfs/hfs_vfsutils.c xnu-517/bsd/hfs/hfs_vfsutils.c --- xnu-344.49/bsd/hfs/hfs_vfsutils.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_vfsutils.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -54,12 +54,16 @@ extern int count_lock_queue __P((void)); -extern uid_t console_user; static void ReleaseMetaFileVNode(struct vnode *vp); static int hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args); +static void hfs_metadatazone_init(struct hfsmount *); +static u_int32_t hfs_hotfile_freeblocks(struct hfsmount *); + + + u_int32_t GetLogicalBlockSize(struct vnode *vp); /* BTree accessor routines */ @@ -86,6 +90,7 @@ char hfs_privdirname[] = "\xE2\x90\x80\xE2\x90\x80\xE2\x90\x80\xE2\x90\x80HFS+ Private Data"; +__private_extern__ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, struct proc *p) { @@ -102,9 +107,11 @@ return (EINVAL); /* don't mount a writeable volume if its dirty, it must be cleaned by fsck_hfs */ - if ((hfsmp->hfs_fs_ronly == 0) && ((SWAP_BE16(mdb->drAtrb) & kHFSVolumeUnmountedMask) == 0)) + if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && + ((SWAP_BE16(mdb->drAtrb) & kHFSVolumeUnmountedMask) == 0)) { return (EINVAL); - + } + hfsmp->hfs_flags |= HFS_STANDARD; /* * The MDB seems OK: transfer info from it into VCB * Note - the VCB starts out clear (all zeros) @@ -130,7 +137,7 @@ vcb->vcbFilCnt = SWAP_BE32 (mdb->drFilCnt); vcb->vcbDirCnt = SWAP_BE32 (mdb->drDirCnt); bcopy(mdb->drFndrInfo, vcb->vcbFndrInfo, sizeof(vcb->vcbFndrInfo)); - if (!hfsmp->hfs_fs_ronly) + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) vcb->vcbWrCnt++; /* Compensate for write of MDB on last flush */ /* convert hfs encoded name into UTF-8 string */ @@ -149,6 +156,7 @@ bzero(&cndesc, sizeof(cndesc)); cndesc.cd_parentcnid = kRootParID; + cndesc.cd_flags |= CD_ISMETA; bzero(&cnattr, sizeof(cnattr)); cnattr.ca_nlink = 1; cnattr.ca_mode = S_IFREG; @@ -163,6 +171,7 @@ fork.cf_size = SWAP_BE32(mdb->drXTFlSize); fork.cf_blocks = fork.cf_size / vcb->blockSize; fork.cf_clump = SWAP_BE32(mdb->drXTClpSiz); + fork.cf_vblocks = 0; fork.cf_extents[0].startBlock = 
SWAP_BE16(mdb->drXTExtRec[0].startBlock); fork.cf_extents[0].blockCount = SWAP_BE16(mdb->drXTExtRec[0].blockCount); fork.cf_extents[1].startBlock = SWAP_BE16(mdb->drXTExtRec[1].startBlock); @@ -175,9 +184,7 @@ &vcb->extentsRefNum); if (error) goto MtVolErr; error = MacToVFSError(BTOpenPath(VTOF(vcb->extentsRefNum), - (KeyCompareProcPtr)CompareExtentKeys, - GetBTreeBlock, ReleaseBTreeBlock, - ExtendBTreeFile, SetBTreeBlockSize)); + (KeyCompareProcPtr)CompareExtentKeys)); if (error) { VOP_UNLOCK(vcb->extentsRefNum, 0, p); goto MtVolErr; @@ -192,6 +199,7 @@ fork.cf_size = SWAP_BE32(mdb->drCTFlSize); fork.cf_blocks = fork.cf_size / vcb->blockSize; fork.cf_clump = SWAP_BE32(mdb->drCTClpSiz); + fork.cf_vblocks = 0; fork.cf_extents[0].startBlock = SWAP_BE16(mdb->drCTExtRec[0].startBlock); fork.cf_extents[0].blockCount = SWAP_BE16(mdb->drCTExtRec[0].blockCount); fork.cf_extents[1].startBlock = SWAP_BE16(mdb->drCTExtRec[1].startBlock); @@ -207,9 +215,7 @@ goto MtVolErr; } error = MacToVFSError(BTOpenPath(VTOF(vcb->catalogRefNum), - (KeyCompareProcPtr)CompareCatalogKeys, - GetBTreeBlock, ReleaseBTreeBlock, - ExtendBTreeFile, SetBTreeBlockSize)); + (KeyCompareProcPtr)CompareCatalogKeys)); if (error) { VOP_UNLOCK(vcb->catalogRefNum, 0, p); VOP_UNLOCK(vcb->extentsRefNum, 0, p); @@ -249,38 +255,57 @@ // //******************************************************************************* +__private_extern__ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args) { register ExtendedVCB *vcb; struct cat_desc cndesc; struct cat_attr cnattr; + struct cat_fork cfork; UInt32 blockSize; + u_int64_t volumesize; + struct BTreeInfoRec btinfo; + u_int16_t signature; + u_int16_t version; + int i; OSErr retval; - // XXXdbg - added the kHFSJSigWord case - if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord && - SWAP_BE16(vhp->signature) != kHFSJSigWord) || - SWAP_BE16(vhp->version) != kHFSPlusVersion) { - 
// XXXdbg - printf("hfs: mount: sig 0x%x and version 0x%x are not HFS or HFS+.\n", - vhp->signature, vhp->version); + signature = SWAP_BE16(vhp->signature); + version = SWAP_BE16(vhp->version); + + if (signature == kHFSPlusSigWord) { + if (version != kHFSPlusVersion) { + printf("hfs_mount: invalid HFS+ version: %d\n", version); + return (EINVAL); + } + } else if (signature == kHFSXSigWord) { + if (version != kHFSXVersion) { + printf("hfs_mount: invalid HFSX version: %d\n", version); + return (EINVAL); + } + /* The in-memory signature is always 'H+'. */ + signature = kHFSPlusSigWord; + hfsmp->hfs_flags |= HFS_X; + } else { + printf("hfs_mount: invalid HFS+ sig 0x%04x\n", signature); return (EINVAL); } /* Block size must be at least 512 and a power of 2 */ blockSize = SWAP_BE32(vhp->blockSize); - if (blockSize < 512 || (blockSize & (blockSize-1)) != 0) + if (blockSize < 512 || !powerof2(blockSize)) return (EINVAL); /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */ - if (hfsmp->hfs_fs_ronly == 0 && hfsmp->jnl == NULL && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0 && hfsmp->jnl == NULL && + (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) return (EINVAL); /* Make sure we can live with the physical block size. 
*/ if ((disksize & (hfsmp->hfs_phys_block_size - 1)) || (embeddedOffset & (hfsmp->hfs_phys_block_size - 1)) || - (SWAP_BE32(vhp->blockSize) < hfsmp->hfs_phys_block_size)) { + (blockSize < hfsmp->hfs_phys_block_size)) { return (ENXIO); } /* @@ -289,13 +314,7 @@ */ vcb = HFSTOVCB(hfsmp); - vcb->vcbSigWord = SWAP_BE16(vhp->signature); - - // XXXdbg - remap this in case we've mounted a dirty journaled volume - if (vcb->vcbSigWord == kHFSJSigWord) { - vcb->vcbSigWord = kHFSPlusSigWord; - } - + vcb->vcbSigWord = signature; vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); vcb->vcbAtrb = (UInt16)SWAP_BE32(vhp->attributes); @@ -310,7 +329,7 @@ bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo)); vcb->vcbAlBlSt = 0; /* hfs+ allocation blocks start at first block of volume */ - if (!hfsmp->hfs_fs_ronly) + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) vcb->vcbWrCnt++; /* compensate for write of Volume Header on last flush */ VCB_LOCK_INIT(vcb); @@ -319,7 +338,7 @@ vcb->nextAllocation = SWAP_BE32(vhp->nextAllocation); vcb->totalBlocks = SWAP_BE32(vhp->totalBlocks); vcb->freeBlocks = SWAP_BE32(vhp->freeBlocks); - vcb->blockSize = SWAP_BE32(vhp->blockSize); + vcb->blockSize = blockSize; vcb->encodingsBitmap = SWAP_BE64(vhp->encodingsBitmap); vcb->localCreateDate = SWAP_BE32(vhp->createDate); @@ -338,6 +357,7 @@ bzero(&cndesc, sizeof(cndesc)); cndesc.cd_parentcnid = kRootParID; + cndesc.cd_flags |= CD_ISMETA; bzero(&cnattr, sizeof(cnattr)); cnattr.ca_nlink = 1; cnattr.ca_mode = S_IFREG; @@ -349,19 +369,23 @@ cndesc.cd_namelen = strlen(hfs_extname); cndesc.cd_cnid = cnattr.ca_fileid = kHFSExtentsFileID; - SWAP_HFS_PLUS_FORK_DATA (&vhp->extentsFile); - cnattr.ca_blocks = vhp->extentsFile.totalBlocks; - - retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, - (struct cat_fork *)&vhp->extentsFile, + cfork.cf_size = SWAP_BE64 (vhp->extentsFile.logicalSize); + cfork.cf_clump = SWAP_BE32 
(vhp->extentsFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->extentsFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->extentsFile.extents[i].startBlock); + cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->extentsFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, &cfork, &vcb->extentsRefNum); - SWAP_HFS_PLUS_FORK_DATA (&vhp->extentsFile); if (retval) goto ErrorExit; retval = MacToVFSError(BTOpenPath(VTOF(vcb->extentsRefNum), - (KeyCompareProcPtr) CompareExtentKeysPlus, - GetBTreeBlock, ReleaseBTreeBlock, - ExtendBTreeFile, SetBTreeBlockSize)); + (KeyCompareProcPtr) CompareExtentKeysPlus)); if (retval) { VOP_UNLOCK(vcb->extentsRefNum, 0, p); goto ErrorExit; @@ -374,26 +398,39 @@ cndesc.cd_namelen = strlen(hfs_catname); cndesc.cd_cnid = cnattr.ca_fileid = kHFSCatalogFileID; - SWAP_HFS_PLUS_FORK_DATA(&vhp->catalogFile); - cnattr.ca_blocks = vhp->catalogFile.totalBlocks; - - retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, - (struct cat_fork *)&vhp->catalogFile, + cfork.cf_size = SWAP_BE64 (vhp->catalogFile.logicalSize); + cfork.cf_clump = SWAP_BE32 (vhp->catalogFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->catalogFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->catalogFile.extents[i].startBlock); + cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->catalogFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, &cfork, &vcb->catalogRefNum); - SWAP_HFS_PLUS_FORK_DATA(&vhp->catalogFile); if (retval) { VOP_UNLOCK(vcb->extentsRefNum, 0, p); goto ErrorExit; } retval = MacToVFSError(BTOpenPath(VTOF(vcb->catalogRefNum), - (KeyCompareProcPtr) CompareExtendedCatalogKeys, - GetBTreeBlock, ReleaseBTreeBlock, - 
ExtendBTreeFile, SetBTreeBlockSize)); + (KeyCompareProcPtr) CompareExtendedCatalogKeys)); if (retval) { VOP_UNLOCK(vcb->catalogRefNum, 0, p); VOP_UNLOCK(vcb->extentsRefNum, 0, p); goto ErrorExit; } + if ((hfsmp->hfs_flags & HFS_X) && + BTGetInformation(VTOF(vcb->catalogRefNum), 0, &btinfo) == 0) { + if (btinfo.keyCompareType == kHFSBinaryCompare) { + hfsmp->hfs_flags |= HFS_CASE_SENSITIVE; + /* Install a case-sensitive key compare */ + (void) BTOpenPath(VTOF(vcb->catalogRefNum), + (KeyCompareProcPtr)cat_binarykeycompare); + } + } /* * Set up Allocation file vnode @@ -402,13 +439,19 @@ cndesc.cd_namelen = strlen(hfs_vbmname); cndesc.cd_cnid = cnattr.ca_fileid = kHFSAllocationFileID; - SWAP_HFS_PLUS_FORK_DATA(&vhp->allocationFile); - cnattr.ca_blocks = vhp->allocationFile.totalBlocks; - - retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, - (struct cat_fork *)&vhp->allocationFile, + cfork.cf_size = SWAP_BE64 (vhp->allocationFile.logicalSize); + cfork.cf_clump = SWAP_BE32 (vhp->allocationFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->allocationFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->allocationFile.extents[i].startBlock); + cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->allocationFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, &cndesc, 0, &cnattr, &cfork, &vcb->allocationsRefNum); - SWAP_HFS_PLUS_FORK_DATA(&vhp->allocationFile); if (retval) { VOP_UNLOCK(vcb->catalogRefNum, 0, p); VOP_UNLOCK(vcb->extentsRefNum, 0, p); @@ -430,7 +473,7 @@ /* mark the volume dirty (clear clean unmount bit) */ vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; - if (hfsmp->jnl && hfsmp->hfs_fs_ronly == 0) { + if (hfsmp->jnl && (hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { hfs_flushvolumeheader(hfsmp, TRUE, TRUE); } @@ -441,17 +484,6 @@ VOP_UNLOCK(vcb->catalogRefNum, 0, p); VOP_UNLOCK(vcb->extentsRefNum, 0, p); - /* 
setup private/hidden directory for unlinked files */ - hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(vcb); - if (hfsmp->jnl && (hfsmp->hfs_fs_ronly == 0)) - hfs_remove_orphans(hfsmp); - - if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected - { - MarkVCBDirty( vcb ); // mark VCB dirty so it will be written - } - - // // Check if we need to do late journal initialization. This only // happens if a previous version of MacOS X (or 9) touched the disk. @@ -482,6 +514,40 @@ } } + /* + * Establish a metadata allocation zone. + */ + hfs_metadatazone_init(hfsmp); + + /* + * Make any metadata zone adjustments. + */ + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { + /* Keep the roving allocator out of the metadata zone. */ + if (vcb->nextAllocation >= hfsmp->hfs_metazone_start && + vcb->nextAllocation <= hfsmp->hfs_metazone_end) { + vcb->nextAllocation = hfsmp->hfs_metazone_end + 1; + } + } + + /* setup private/hidden directory for unlinked files */ + FindMetaDataDirectory(vcb); + if (hfsmp->jnl && ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0)) + hfs_remove_orphans(hfsmp); + + if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected + { + MarkVCBDirty( vcb ); // mark VCB dirty so it will be written + } + + + /* + * Allow hot file clustering if conditions allow. 
+ */ + if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && + ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0)) { + (void) hfs_recording_init(hfsmp, p); + } return (0); @@ -527,13 +593,20 @@ * *************************************************************/ -short hfsUnmount( register struct hfsmount *hfsmp, struct proc *p) +__private_extern__ +int +hfsUnmount( register struct hfsmount *hfsmp, struct proc *p) { ExtendedVCB *vcb = HFSTOVCB(hfsmp); int retval = E_NONE; InvalidateCatalogCache( vcb ); + if (hfsmp->hfc_filevp) { + ReleaseMetaFileVNode(hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } + if (vcb->vcbSigWord == kHFSPlusSigWord) ReleaseMetaFileVNode(vcb->allocationsRefNum); @@ -545,16 +618,11 @@ /* - * Some 3rd party kexts link against hfs_getcatalog so keep a stub for now. + * Test is fork has overflow extents. */ -short -hfs_getcatalog(void *p1, u_long p2, void *p3, short p4, void *p5) -{ - return ENOENT; -} - - -int overflow_extents(struct filefork *fp) +__private_extern__ +int +overflow_extents(struct filefork *fp) { u_long blocks; @@ -583,7 +651,10 @@ } -/* __private_extern__ */ +/* + * Lock/Unlock a metadata file. 
+ */ +__private_extern__ int hfs_metafilelocking(struct hfsmount *hfsmp, u_long fileID, u_int flags, struct proc *p) { @@ -610,19 +681,19 @@ panic("hfs_lockmetafile: invalid fileID"); } - /* Release, if necesary any locked buffer caches */ - if ((flags & LK_TYPE_MASK) == LK_RELEASE) { + if ((flags & LK_TYPE_MASK) != LK_RELEASE) { + flags |= LK_RETRY; + } else if (hfsmp->jnl == NULL) { struct timeval tv = time; u_int32_t lastfsync = tv.tv_sec; (void) BTGetLastSync((FCB*)VTOF(vp), &lastfsync); numOfLockedBuffs = count_lock_queue(); - if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || ((numOfLockedBuffs>1) && ((tv.tv_sec - lastfsync) > kMaxSecsForFsync))) { + if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || + ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > kMaxSecsForFsync))) { hfs_btsync(vp, HFS_SYNCTRANS); } - } else { - flags |= LK_RETRY; } retval = lockmgr(&VTOC(vp)->c_lock, flags, &vp->v_interlock, p); @@ -645,7 +716,7 @@ void * self; pid = current_proc()->p_pid; - self = (void *) current_thread(); + self = (void *) current_act(); lkp = &VTOC(vp)->c_lock; simple_lock(&lkp->lk_interlock); @@ -680,13 +751,11 @@ * There are three ways to qualify for ownership rights on an object: * * 1. (a) Your UID matches the cnode's UID. - * (b) The object in question is owned by "unknown" and - * your UID matches the console user's UID. + * (b) The object in question is owned by "unknown" * 2. (a) Permissions on the filesystem are being ignored and * your UID matches the replacement UID. * (b) Permissions on the filesystem are being ignored and - * the replacement UID is "unknown" and - * your UID matches the console user UID. + * the replacement UID is "unknown". * 3. You are root. 
* */ @@ -695,11 +764,10 @@ struct proc *p, int invokesuperuserstatus) { if ((cred->cr_uid == cnode_uid) || /* [1a] */ - ((cnode_uid == UNKNOWNUID) && (cred->cr_uid == console_user)) || /* [1b] */ + (cnode_uid == UNKNOWNUID) || /* [1b] */ ((HFSTOVFS(hfsmp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) && /* [2] */ ((cred->cr_uid == hfsmp->hfs_uid) || /* [2a] */ - ((hfsmp->hfs_uid == UNKNOWNUID) && /* [2b] */ - (cred->cr_uid == console_user)))) || + (hfsmp->hfs_uid == UNKNOWNUID))) || /* [2b] */ (invokesuperuserstatus && (suser(cred, &p->p_acflag) == 0))) { /* [3] */ return (0); } else { @@ -755,8 +823,9 @@ * To make the HFS Plus filesystem follow UFS unlink semantics, a remove * of an active vnode is translated to a move/rename so the file appears * deleted. The destination folder for these move/renames is setup here - * and a reference to it is place in hfsmp->hfs_private_metadata_dir. + * and a reference to it is place in hfsmp->hfs_privdir_desc. */ +__private_extern__ u_long FindMetaDataDirectory(ExtendedVCB *vcb) { @@ -765,7 +834,9 @@ struct cnode * dcp = NULL; struct FndrDirInfo * fndrinfo; struct cat_desc out_desc = {0}; + struct proc *p = current_proc(); struct timeval tv; + cat_cookie_t cookie; int error; if (vcb->vcbSigWord != kHFSPlusSigWord) @@ -781,28 +852,52 @@ } /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc()); - if (error) + if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p) != 0) return (0); error = cat_lookup(hfsmp, &hfsmp->hfs_privdir_desc, 0, NULL, &hfsmp->hfs_privdir_attr, NULL); + /* Unlock catalog b-tree */ + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + if (error == 0) { - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); hfsmp->hfs_metadata_createdate = hfsmp->hfs_privdir_attr.ca_itime; + hfsmp->hfs_privdir_desc.cd_cnid = hfsmp->hfs_privdir_attr.ca_fileid; + /* + * Clear the system immutable 
flag if set... + */ + if ((hfsmp->hfs_privdir_attr.ca_flags & SF_IMMUTABLE) && + (hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { + hfsmp->hfs_privdir_attr.ca_flags &= ~SF_IMMUTABLE; + + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + hfs_global_shared_lock_release(hfsmp); + return (hfsmp->hfs_privdir_attr.ca_fileid); + } + } + if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p) == 0) { + (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc, + &hfsmp->hfs_privdir_attr, NULL, NULL); + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + } + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + } return (hfsmp->hfs_privdir_attr.ca_fileid); - } else if (hfsmp->hfs_fs_ronly) { - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); + + } else if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (0); } /* Setup the default attributes */ bzero(&hfsmp->hfs_privdir_attr, sizeof(struct cat_attr)); hfsmp->hfs_privdir_attr.ca_mode = S_IFDIR; - hfsmp->hfs_privdir_attr.ca_flags = SF_IMMUTABLE; hfsmp->hfs_privdir_attr.ca_nlink = 2; hfsmp->hfs_privdir_attr.ca_itime = vcb->vcbCrDate; hfsmp->hfs_privdir_attr.ca_mtime = time.tv_sec; @@ -821,12 +916,24 @@ return (0); } } + /* Reserve some space in the Catalog file. 
*/ + if (cat_preflight(hfsmp, CAT_CREATE, &cookie, p) != 0) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + return (0); + } - error = cat_create(hfsmp, &hfsmp->hfs_privdir_desc, - &hfsmp->hfs_privdir_attr, &out_desc); + if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p) == 0) { + error = cat_create(hfsmp, &hfsmp->hfs_privdir_desc, + &hfsmp->hfs_privdir_attr, &out_desc); - /* Unlock catalog b-tree */ - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + } + + cat_postflight(hfsmp, &cookie, p); + if (error) { if (hfsmp->jnl) { journal_end_transaction(hfsmp->jnl); @@ -896,7 +1003,7 @@ if (error == 0) { return (fattr->ca_fileid); - } else if (hfsmp->hfs_fs_ronly) { + } else if (hfsmp->hfs_flags & HFS_READ_ONLY) { return (0); } } @@ -916,15 +1023,20 @@ struct FSBufferDescriptor btdata; struct HFSPlusCatalogFile filerec; struct HFSPlusCatalogKey * keyp; + struct proc *p = current_proc(); FCB *fcb; ExtendedVCB *vcb; char filename[32]; char tempname[32]; size_t namelen; + cat_cookie_t cookie = {0}; int catlock = 0; - int result, started_tr = 0; + int catreserve = 0; + int started_tr = 0; + int shared_lock = 0; + int result; - if (hfsmp->hfs_orphans_cleaned) + if (hfsmp->hfs_flags & HFS_CLEANED_ORPHANS) return; vcb = HFSTOVCB(hfsmp); @@ -937,38 +1049,34 @@ MALLOC(iterator, struct BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); bzero(iterator, sizeof(*iterator)); keyp = (HFSPlusCatalogKey*)&iterator->key; - keyp->parentID = hfsmp->hfs_private_metadata_dir; - - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - return; - } - started_tr = 1; - } + keyp->parentID = hfsmp->hfs_privdir_desc.cd_cnid; - /* Lock catalog b-tree */ - result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, 
LK_EXCLUSIVE, current_proc()); + result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (result) goto exit; - catlock = 1; - /* * Position the iterator at the folder thread record. * (i.e. one record before first child) */ result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); + + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); if (result) goto exit; /* Visit all the children in the HFS+ private directory. */ for (;;) { + result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); + if (result) + goto exit; + result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); if (result) break; - if (keyp->parentID != hfsmp->hfs_private_metadata_dir) + + if (keyp->parentID != hfsmp->hfs_privdir_desc.cd_cnid) break; if (filerec.recordType != kHFSPlusFileRecord) continue; @@ -982,46 +1090,92 @@ * Delete all files named "tempxxx", where * xxx is the file's cnid in decimal. * - * Delete all files named "iNodexxx", that - * have a link count of zero. */ if (bcmp(tempname, filename, namelen) == 0) { - struct filefork fork = {0}; - struct cnode cnode = {0}; + struct filefork dfork = {0}; + struct filefork rfork = {0}; + struct cnode cnode = {0}; + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + shared_lock = 1; + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + goto exit; + } + started_tr = 1; + } + + /* + * Reserve some space in the Catalog file. 
+ */ + if (cat_preflight(hfsmp, CAT_DELETE, &cookie, p) != 0) { + goto exit; + } + catreserve = 1; - // XXXdebug - //printf("hfs_remove_orphans: removing %s\n", filename); + /* Lock catalog b-tree */ + if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, + LK_EXCLUSIVE, p) != 0) { + goto exit; + } + catlock = 1; /* Build a fake cnode */ - cnode.c_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + cat_convertattr(hfsmp, (CatalogRecord *)&filerec, &cnode.c_attr, + &dfork.ff_data, &rfork.ff_data); + cnode.c_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid; cnode.c_desc.cd_nameptr = filename; cnode.c_desc.cd_namelen = namelen; - cnode.c_desc.cd_cnid = filerec.fileID; - cnode.c_attr.ca_fileid = filerec.fileID; - cnode.c_blocks = filerec.dataFork.totalBlocks + - filerec.resourceFork.totalBlocks; + cnode.c_desc.cd_cnid = cnode.c_attr.ca_fileid; + cnode.c_blocks = dfork.ff_blocks + rfork.ff_blocks; /* Position iterator at previous entry */ if (BTIterateRecord(fcb, kBTreePrevRecord, iterator, - NULL, NULL) != 0) + NULL, NULL) != 0) { break; - + } + /* Truncate the file to zero (both forks) */ - if (filerec.dataFork.totalBlocks > 0) { - fork.ff_cp = &cnode; - cnode.c_datafork = ⋔ - bcopy(&filerec.dataFork, &fork.ff_data, sizeof(struct cat_fork)); - if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) { - printf("error truncting data fork!\n"); - break; + if (dfork.ff_blocks > 0) { + u_int64_t fsize; + + dfork.ff_cp = &cnode; + cnode.c_datafork = &dfork; + cnode.c_rsrcfork = NULL; + fsize = (u_int64_t)dfork.ff_blocks * (u_int64_t)HFSTOVCB(hfsmp)->blockSize; + while (fsize > 0) { + if (fsize > HFS_BIGFILE_SIZE) { + fsize -= HFS_BIGFILE_SIZE; + } else { + fsize = 0; + } + + if (TruncateFileC(vcb, (FCB*)&dfork, fsize, false) != 0) { + printf("error truncting data fork!\n"); + break; + } + + // + // if we're iteratively truncating this file down, + // then end the transaction and start a new one so + // that no one transaction gets too big. 
+ // + if (fsize > 0 && started_tr) { + journal_end_transaction(hfsmp->jnl); + if (journal_start_transaction(hfsmp->jnl) != 0) { + started_tr = 0; + break; + } + } } } - if (filerec.resourceFork.totalBlocks > 0) { - fork.ff_cp = &cnode; + + if (rfork.ff_blocks > 0) { + rfork.ff_cp = &cnode; cnode.c_datafork = NULL; - cnode.c_rsrcfork = ⋔ - bcopy(&filerec.resourceFork, &fork.ff_data, sizeof(struct cat_fork)); - if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) { + cnode.c_rsrcfork = &rfork; + if (TruncateFileC(vcb, (FCB*)&rfork, 0, false) != 0) { printf("error truncting rsrc fork!\n"); break; } @@ -1038,21 +1192,37 @@ (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc, &hfsmp->hfs_privdir_attr, NULL, NULL); hfs_volupdate(hfsmp, VOL_RMFILE, 0); - } - } + + /* Drop locks and end the transaction */ + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + cat_postflight(hfsmp, &cookie, p); + catlock = catreserve = 0; + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + hfs_global_shared_lock_release(hfsmp); + shared_lock = 0; + + } /* end if */ + } /* end for */ exit: - /* Unlock catalog b-tree */ - if (catlock) - (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); - + if (catlock) { + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); + } + if (catreserve) { + cat_postflight(hfsmp, &cookie, p); + } if (started_tr) { journal_end_transaction(hfsmp->jnl); } - hfs_global_shared_lock_release(hfsmp); + if (shared_lock) { + hfs_global_shared_lock_release(hfsmp); + } FREE(iterator, M_TEMP); - hfsmp->hfs_orphans_cleaned = 1; + hfsmp->hfs_flags |= HFS_CLEANED_ORPHANS; } @@ -1111,8 +1281,47 @@ else freeblks = 0; } + if (freeblks > vcb->loanedBlocks) + freeblks -= vcb->loanedBlocks; + else + freeblks = 0; + +#ifdef HFS_SPARSE_DEV + /* + * When the underlying device is sparse, check the + * available space on the backing store volume. 
+ */ + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) { + struct statfs statbuf; /* 272 bytes */ + u_int32_t vfreeblks; + u_int32_t loanedblks; + struct mount * backingfs_mp; + + backingfs_mp = hfsmp->hfs_backingfs_rootvp->v_mount; + + if (VFS_STATFS(backingfs_mp, &statbuf, current_proc()) == 0) { + vfreeblks = statbuf.f_bavail; + /* Normalize block count if needed. */ + if (statbuf.f_bsize != vcb->blockSize) { + vfreeblks = ((u_int64_t)vfreeblks * (u_int64_t)statbuf.f_bsize) / vcb->blockSize; + } + if (vfreeblks > hfsmp->hfs_sparsebandblks) + vfreeblks -= hfsmp->hfs_sparsebandblks; + else + vfreeblks = 0; + + /* Take into account any delayed allocations. */ + loanedblks = 2 * vcb->loanedBlocks; + if (vfreeblks > loanedblks) + vfreeblks -= loanedblks; + else + vfreeblks = 0; + + freeblks = MIN(vfreeblks, freeblks); + } + } +#endif /* HFS_SPARSE_DEV */ - freeblks -= vcb->loanedBlocks; return (freeblks); } @@ -1127,9 +1336,8 @@ switch (err) { case dskFulErr: /* -34 */ + case btNoSpaceAvail: /* -32733 */ return ENOSPC; - case btNoSpaceAvail: /* -32733 */ - return EFBIG; case fxOvFlErr: /* -32750 */ return EOVERFLOW; @@ -1184,7 +1392,7 @@ void *self; if (index > 0) { - self = current_thread(); + self = current_act(); SLIST_FOREACH(entry, &dcp->c_indexlist, hi_link) { if ((entry->hi_index == index) && (entry->hi_thread == self)) @@ -1211,7 +1419,7 @@ MALLOC(entry, struct hfs_index *, len + sizeof(struct hfs_index), M_TEMP, M_WAITOK); entry->hi_index = index; - entry->hi_thread = current_thread(); + entry->hi_thread = current_act(); bcopy(namehint, entry->hi_name, len + 1); SLIST_INSERT_HEAD(&dcp->c_indexlist, entry, hi_link); } @@ -1229,7 +1437,7 @@ void *self; if (index > 0) { - self = current_thread(); + self = current_act(); SLIST_FOREACH(entry, &dcp->c_indexlist, hi_link) { if ((entry->hi_index == index) && (entry->hi_thread == self)) { @@ -1341,6 +1549,7 @@ // save this off for the hack-y check in hfs_remove() hfsmp->jnl_start = 
jibp->offset / SWAP_BE32(vhp->blockSize); + hfsmp->jnl_size = jibp->size; if (jibp->flags & kJIJournalNeedInitMask) { printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", @@ -1358,6 +1567,8 @@ // we'd just re-init it on the next mount. jibp->flags &= ~kJIJournalNeedInitMask; jibp->flags = SWAP_BE32(jibp->flags); + jibp->offset = SWAP_BE64(jibp->offset); + jibp->size = SWAP_BE64(jibp->size); bwrite(jinfo_bp); jinfo_bp = NULL; jibp = NULL; @@ -1382,6 +1593,9 @@ if (hfsmp->jnl && mdbp) { // reload the mdb because it could have changed // if the journal had to be replayed. + if (mdb_offset == 0) { + mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize); + } retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp); if (retval) { brelse(bp); @@ -1401,9 +1615,7 @@ // if we expected the journal to be there and we couldn't // create it or open it then we have to bail out. if (hfsmp->jnl == NULL) { - hfsmp->jnl_start = 0; - - printf("hfs: failed to open/create the journal (retval %d).\n", retval); + printf("hfs: early jnl init: failed to open/create the journal (retval %d).\n", retval); return EINVAL; } @@ -1524,6 +1736,7 @@ // save this off for the hack-y check in hfs_remove() hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize); + hfsmp->jnl_size = jibp->size; if (jibp->flags & kJIJournalNeedInitMask) { printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", @@ -1585,11 +1798,226 @@ // if we expected the journal to be there and we couldn't // create it or open it then we have to bail out. if (hfsmp->jnl == NULL) { - hfsmp->jnl_start = 0; - - printf("hfs: failed to open/create the journal (retval %d).\n", retval); + printf("hfs: late jnl init: failed to open/create the journal (retval %d).\n", retval); return EINVAL; } return 0; } + +/* + * Calculate the allocation zone for metadata. 
+ * + * This zone includes the following: + * Allocation Bitmap file + * Overflow Extents file + * Journal file + * Quota files + * Clustered Hot files + * Catalog file + * + * METADATA ALLOCATION ZONE + * ____________________________________________________________________________ + * | | | | | | | + * | BM | JF | OEF | CATALOG |---> | HOT FILES | + * |____|____|_____|_______________|______________________________|___________| + * + * <------------------------------- N * 128 MB -------------------------------> + * + */ +#define GIGABYTE (u_int64_t)(1024*1024*1024) + +#define OVERFLOW_DEFAULT_SIZE (4*1024*1024) +#define OVERFLOW_MAXIMUM_SIZE (128*1024*1024) +#define JOURNAL_DEFAULT_SIZE (8*1024*1024) +#define JOURNAL_MAXIMUM_SIZE (512*1024*1024) +#define HOTBAND_MINIMUM_SIZE (10*1024*1024) +#define HOTBAND_MAXIMUM_SIZE (512*1024*1024) + +static void +hfs_metadatazone_init(struct hfsmount *hfsmp) +{ + ExtendedVCB *vcb; + struct BTreeInfoRec btinfo; + u_int64_t fs_size; + u_int64_t zonesize; + u_int64_t temp; + u_int64_t filesize; + u_int32_t blk; + int items; + + vcb = HFSTOVCB(hfsmp); + fs_size = (u_int64_t)vcb->blockSize * (u_int64_t)vcb->totalBlocks; + + /* + * For volumes less than 10 GB, don't bother. + */ + if (fs_size < ((u_int64_t)10 * GIGABYTE)) + return; + /* + * Skip non-journaled volumes as well. + */ + if (hfsmp->jnl == NULL) + return; + + /* + * Start with allocation bitmap (a fixed size). + */ + zonesize = roundup(vcb->totalBlocks / 8, vcb->vcbVBMIOSize); + + /* + * Overflow Extents file gets 4 MB per 100 GB. + */ + items = fs_size / ((u_int64_t)100 * GIGABYTE); + filesize = (u_int64_t)(items + 1) * OVERFLOW_DEFAULT_SIZE; + if (filesize > OVERFLOW_MAXIMUM_SIZE) + filesize = OVERFLOW_MAXIMUM_SIZE; + zonesize += filesize; + hfsmp->hfs_overflow_maxblks = filesize / vcb->blockSize; + + /* + * Plan for at least 8 MB of journal for each + * 100 GB of disk space (up to a 512 MB). 
+ */ + items = fs_size / ((u_int64_t)100 * GIGABYTE); + filesize = (u_int64_t)(items + 1) * JOURNAL_DEFAULT_SIZE; + if (filesize > JOURNAL_MAXIMUM_SIZE) + filesize = JOURNAL_MAXIMUM_SIZE; + zonesize += filesize; + + /* + * Catalog file gets 10 MB per 1 GB. + * + * How about considering the current catalog size (used nodes * node size) + * and the current file data size to help estimate the required + * catalog size. + */ + filesize = MIN((fs_size / 1024) * 10, GIGABYTE); + hfsmp->hfs_catalog_maxblks = filesize / vcb->blockSize; + zonesize += filesize; + + /* + * Add space for hot file region. + * + * ...for now, use 5 MB per 1 GB (0.5 %) + */ + filesize = (fs_size / 1024) * 5; + if (filesize > HOTBAND_MAXIMUM_SIZE) + filesize = HOTBAND_MAXIMUM_SIZE; + else if (filesize < HOTBAND_MINIMUM_SIZE) + filesize = HOTBAND_MINIMUM_SIZE; + /* + * Calculate user quota file requirements. + */ + items = QF_USERS_PER_GB * (fs_size / GIGABYTE); + if (items < QF_MIN_USERS) + items = QF_MIN_USERS; + else if (items > QF_MAX_USERS) + items = QF_MAX_USERS; + if (!powerof2(items)) { + int x = items; + items = 4; + while (x>>1 != 1) { + x = x >> 1; + items = items << 1; + } + } + filesize += (items + 1) * sizeof(struct dqblk); + /* + * Calculate group quota file requirements. + * + */ + items = QF_GROUPS_PER_GB * (fs_size / GIGABYTE); + if (items < QF_MIN_GROUPS) + items = QF_MIN_GROUPS; + else if (items > QF_MAX_GROUPS) + items = QF_MAX_GROUPS; + if (!powerof2(items)) { + int x = items; + items = 4; + while (x>>1 != 1) { + x = x >> 1; + items = items << 1; + } + } + filesize += (items + 1) * sizeof(struct dqblk); + hfsmp->hfs_hotfile_maxblks = filesize / vcb->blockSize; + zonesize += filesize; + + /* + * Round up entire zone to a bitmap block's worth. + * The extra space goes to the catalog file and hot file area. 
+ */ + temp = zonesize; + zonesize = roundup(zonesize, vcb->vcbVBMIOSize * 8 * vcb->blockSize); + temp = zonesize - temp; /* temp has extra space */ + filesize += temp / 3; + hfsmp->hfs_catalog_maxblks += (temp - (temp / 3)) / vcb->blockSize; + + /* Convert to allocation blocks. */ + blk = zonesize / vcb->blockSize; + + /* The default metadata zone location is at the start of volume. */ + hfsmp->hfs_metazone_start = 1; + hfsmp->hfs_metazone_end = blk - 1; + + /* The default hotfile area is at the end of the zone. */ + hfsmp->hfs_hotfile_start = blk - (filesize / vcb->blockSize); + hfsmp->hfs_hotfile_end = hfsmp->hfs_metazone_end; + hfsmp->hfs_hotfile_freeblks = hfs_hotfile_freeblocks(hfsmp); +#if 0 + printf("HFS: metadata zone is %d to %d\n", hfsmp->hfs_metazone_start, hfsmp->hfs_metazone_end); + printf("HFS: hot file band is %d to %d\n", hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end); + printf("HFS: hot file band free blocks = %d\n", hfsmp->hfs_hotfile_freeblks); +#endif + hfsmp->hfs_flags |= HFS_METADATA_ZONE; +} + + +static u_int32_t +hfs_hotfile_freeblocks(struct hfsmount *hfsmp) +{ + ExtendedVCB *vcb = HFSTOVCB(hfsmp); + int freeblocks; + + freeblocks = MetaZoneFreeBlocks(vcb); + /* Minus Extents overflow file reserve. */ + freeblocks -= + hfsmp->hfs_overflow_maxblks - VTOF(vcb->extentsRefNum)->ff_blocks; + /* Minus catalog file reserve. */ + freeblocks -= + hfsmp->hfs_catalog_maxblks - VTOF(vcb->catalogRefNum)->ff_blocks; + if (freeblocks < 0) + freeblocks = 0; + + return MIN(freeblocks, hfsmp->hfs_hotfile_maxblks); +} + +/* + * Determine if a file is a "virtual" metadata file. + * This includes journal and quota files. 
+ */ +__private_extern__ +int +hfs_virtualmetafile(struct cnode *cp) +{ + char * filename; + + + if (cp->c_parentcnid != kHFSRootFolderID) + return (0); + + filename = cp->c_desc.cd_nameptr; + if (filename == NULL) + return (0); + + if ((strcmp(filename, ".journal") == 0) || + (strcmp(filename, ".journal_info_block") == 0) || + (strcmp(filename, ".quota.user") == 0) || + (strcmp(filename, ".quota.group") == 0) || + (strcmp(filename, ".hotfiles.btree") == 0)) + return (1); + + return (0); +} + diff -urN xnu-344.49/bsd/hfs/hfs_vnops.c xnu-517/bsd/hfs/hfs_vnops.c --- xnu-344.49/bsd/hfs/hfs_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfs_vnops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -35,6 +35,8 @@ #include #include #include +#include +#include #include #include @@ -58,14 +60,16 @@ #define MAKE_DELETED_NAME(NAME,FID) \ (void) sprintf((NAME), "%s%d", HFS_DELETE_PREFIX, (FID)) +#define KNDETACH_VNLOCKED 0x00000001 -extern uid_t console_user; +#define CARBON_TEMP_DIR_NAME "Cleanup At Startup" -extern unsigned long strtoul(const char *, char **, int); /* Global vfs data structures for hfs */ +extern unsigned long strtoul(const char *, char **, int); + extern int groupmember(gid_t gid, struct ucred *cred); static int hfs_makenode(int mode, struct vnode *dvp, struct vnode **vpp, @@ -76,6 +80,19 @@ static int hfs_metasync(struct hfsmount *hfsmp, daddr_t node, struct proc *p); +static int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, + int); + +static int hfs_removefile(struct vnode *, struct vnode *, struct componentname *, + int); + +/* Options for hfs_removedir and hfs_removefile */ +#define HFSRM_PARENT_LOCKED 0x01 +#define HFSRM_SKIP_RESERVE 0x02 +#define HFSRM_SAVE_NAME 0x04 +#define HFSRM_RENAMEOPTS 0x07 + + int hfs_write_access(struct vnode *vp, struct 
ucred *cred, struct proc *p, Boolean considerFlags); int hfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, @@ -200,6 +217,8 @@ } */ *ap; { struct vnode *vp = ap->a_vp; + struct filefork *fp = VTOF(vp); + struct timeval tv; /* * Files marked append-only must be opened for appending. @@ -208,6 +227,36 @@ (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); + if (ap->a_mode & O_EVTONLY) { + if (vp->v_type == VREG) { + ++VTOF(vp)->ff_evtonly_refs; + } else { + ++VTOC(vp)->c_evtonly_refs; + }; + }; + + /* + * On the first (non-busy) open of a fragmented + * file attempt to de-frag it (if its less than 20MB). + */ + if ((VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) || + !UBCISVALID(vp) || ubc_isinuse(vp, 1)) { + return (0); + } + fp = VTOF(vp); + if (fp->ff_blocks && + fp->ff_extents[7].blockCount != 0 && + fp->ff_size <= (20 * 1024 * 1024)) { + /* + * Wait until system bootup is done (3 min). + */ + microuptime(&tv); + if (tv.tv_sec < (60 * 3)) { + return (0); + } + (void) hfs_relocate(vp, VTOVCB(vp)->nextAllocation + 4096, ap->a_cred, ap->a_p); + } + return (0); } @@ -252,6 +301,14 @@ } simple_unlock(&vp->v_interlock); + if (ap->a_fflag & O_EVTONLY) { + if (vp->v_type == VREG) { + --VTOF(vp)->ff_evtonly_refs; + } else { + --VTOC(vp)->c_evtonly_refs; + }; + }; + /* * VOP_CLOSE can be called with vp locked (from vclean). * We check for this case using VOP_ISLOCKED and bail. 
@@ -263,7 +320,9 @@ leof = fp->ff_size; - if ((fp->ff_blocks > 0) && !ISSET(cp->c_flag, C_DELETED)) { + if ((fp->ff_blocks > 0) && + !ISSET(cp->c_flag, C_DELETED) && + ((VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) == 0)) { enum vtype our_type = vp->v_type; u_long our_id = vp->v_id; int was_nocache = ISSET(vp->v_flag, VNOCACHE_DATA); @@ -336,6 +395,8 @@ } VOP_UNLOCK(vp, 0, p); } + if ((vp->v_flag & VSYSTEM) && (vp->v_usecount == 1)) + vgone(vp); return (0); } @@ -378,7 +439,7 @@ case VDIR: case VLNK: case VREG: - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) return (EROFS); #if QUOTA if ((error = hfs_getinoquota(cp))) @@ -386,20 +447,20 @@ #endif /* QUOTA */ break; } + /* If immutable bit set, nobody gets to write it. */ + if (cp->c_flags & IMMUTABLE) + return (EPERM); } - /* If immutable bit set, nobody gets to write it. */ - if ((mode & VWRITE) && (cp->c_flags & IMMUTABLE)) - return (EPERM); /* Otherwise, user id 0 always gets access. */ - if (ap->a_cred->cr_uid == 0) + if (cred->cr_uid == 0) return (0); mask = 0; /* Otherwise, check the owner. */ - if (hfs_owner_rights(VTOHFS(vp), cp->c_uid, cred, ap->a_p, false) == 0) { + if ( (cp->c_uid == cred->cr_uid) || (cp->c_uid == UNKNOWNUID) ) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) @@ -466,6 +527,8 @@ CTIMES(cp, &tv, &tv); vap->va_type = vp->v_type; + vap->va_mode = cp->c_mode; + vap->va_nlink = cp->c_nlink; /* * [2856576] Since we are dynamically changing the owner, also * effectively turn off the set-user-id and set-group-id bits, @@ -473,9 +536,12 @@ * a security hole where set-user-id programs run as whoever is * logged on (or root if nobody is logged in yet!) */ - vap->va_mode = (cp->c_uid == UNKNOWNUID) ? cp->c_mode & ~(S_ISUID | S_ISGID) : cp->c_mode; - vap->va_nlink = cp->c_nlink; - vap->va_uid = (cp->c_uid == UNKNOWNUID) ? 
console_user : cp->c_uid; + if (cp->c_uid == UNKNOWNUID) { + vap->va_mode &= ~(S_ISUID | S_ISGID); + vap->va_uid = ap->a_cred->cr_uid; + } else { + vap->va_uid = cp->c_uid; + } vap->va_gid = cp->c_gid; vap->va_fsid = cp->c_dev; /* @@ -502,7 +568,6 @@ vap->va_rdev = 0; vap->va_blocksize = VTOVFS(vp)->mnt_stat.f_iosize; vap->va_filerev = 0; - vap->va_spare = 0; if (vp->v_type == VDIR) { vap->va_size = cp->c_nlink * AVERAGE_HFSDIRENTRY_SIZE; vap->va_bytes = 0; @@ -555,8 +620,19 @@ return (EINVAL); } + // XXXdbg + // don't allow people to set the attributes of symlinks + // (nfs has a bad habit of doing ths and it can cause + // problems for journaling). + // + if (vp->v_type == VLNK) { + return 0; + } + + + if (vap->va_flags != VNOVAL) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) return (EROFS); if ((error = hfs_chflags(vp, vap->va_flags, cred, p))) return (error); @@ -571,7 +647,7 @@ if (VTOHFS(vp)->jnl && cp->c_datafork) { struct HFSPlusExtentDescriptor *extd; - extd = &cp->c_datafork->ff_data.cf_extents[0]; + extd = &cp->c_datafork->ff_extents[0]; if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { return EPERM; } @@ -581,7 +657,7 @@ * Go through the fields and update iff not VNOVAL. 
*/ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) return (EROFS); if ((error = hfs_chown(vp, vap->va_uid, vap->va_gid, cred, p))) return (error); @@ -597,7 +673,7 @@ return (EISDIR); case VLNK: case VREG: - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) return (EROFS); break; default: @@ -608,7 +684,7 @@ } cp = VTOC(vp); if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) return (EROFS); if (((error = hfs_owner_rights(VTOHFS(vp), cp->c_uid, cred, p, true)) != 0) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || @@ -640,10 +716,11 @@ } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) return (EROFS); error = hfs_chmod(vp, (int)vap->va_mode, cred, p); } + HFS_KNOTE(vp, NOTE_ATTRIB); return (error); } @@ -652,6 +729,7 @@ * Change the mode on a file. * cnode must be locked before calling. */ +__private_extern__ int hfs_chmod(vp, mode, cred, p) register struct vnode *vp; @@ -669,7 +747,7 @@ if (VTOHFS(vp)->jnl && cp && cp->c_datafork) { struct HFSPlusExtentDescriptor *extd; - extd = &cp->c_datafork->ff_data.cf_extents[0]; + extd = &cp->c_datafork->ff_extents[0]; if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { return EPERM; } @@ -695,6 +773,7 @@ } +__private_extern__ int hfs_write_access(struct vnode *vp, struct ucred *cred, struct proc *p, Boolean considerFlags) { @@ -712,9 +791,9 @@ case VDIR: case VLNK: case VREG: - if (VTOVFS(vp)->mnt_flag & MNT_RDONLY) + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) return (EROFS); - break; + break; default: break; } @@ -747,6 +826,7 @@ * Change the flags on a file or directory. * cnode must be locked before calling. 
*/ +__private_extern__ int hfs_chflags(vp, flags, cred, p) register struct vnode *vp; @@ -789,6 +869,7 @@ * Perform chown operation on cnode cp; * code must be locked prior to call. */ +__private_extern__ int hfs_chown(vp, uid, gid, cred, p) register struct vnode *vp; @@ -934,14 +1015,13 @@ { struct vnode *from_vp = ap->a_fvp; struct vnode *to_vp = ap->a_tvp; - struct vnode *from_rvp = NULL; - struct vnode *to_rvp = NULL; struct cnode *from_cp = VTOC(from_vp); struct cnode *to_cp = VTOC(to_vp); struct hfsmount *hfsmp = VTOHFS(from_vp); struct cat_desc tempdesc; struct cat_attr tempattr; int error = 0, started_tr = 0, grabbed_lock = 0; + cat_cookie_t cookie = {0}; /* The files must be on the same volume. */ if (from_vp->v_mount != to_vp->v_mount) @@ -958,45 +1038,20 @@ struct HFSPlusExtentDescriptor *extd; if (from_cp->c_datafork) { - extd = &from_cp->c_datafork->ff_data.cf_extents[0]; + extd = &from_cp->c_datafork->ff_extents[0]; if (extd->startBlock == VTOVCB(from_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { return EPERM; } } if (to_cp->c_datafork) { - extd = &to_cp->c_datafork->ff_data.cf_extents[0]; + extd = &to_cp->c_datafork->ff_extents[0]; if (extd->startBlock == VTOVCB(to_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { return EPERM; } } } - from_rvp = from_cp->c_rsrc_vp; - to_rvp = to_cp->c_rsrc_vp; - - /* If one of the resource forks is open then get the other one. 
*/ - if (from_rvp || to_rvp) { - error = hfs_vgetrsrc(hfsmp, from_vp, &from_rvp, ap->a_p); - if (error) - return (error); - error = hfs_vgetrsrc(hfsmp, to_vp, &to_rvp, ap->a_p); - if (error) { - vrele(from_rvp); - return (error); - } - } - - /* Ignore any errors, we are doing a 'best effort' on flushing */ - if (from_vp) - (void) vinvalbuf(from_vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); - if (to_vp) - (void) vinvalbuf(to_vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); - if (from_rvp) - (void) vinvalbuf(from_rvp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); - if (to_rvp) - (void) vinvalbuf(to_rvp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); - // XXXdbg hfs_global_shared_lock_acquire(hfsmp); grabbed_lock = 1; @@ -1007,6 +1062,13 @@ started_tr = 1; } + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_EXCHANGE, &cookie, ap->a_p))) { + goto Err_Exit; + } + /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, ap->a_p); if (error) goto Err_Exit; @@ -1091,18 +1153,17 @@ from_cp->c_flags &= ~UF_NODUMP; from_cp->c_flag |= C_CHANGE; } - if ((to_cp->c_flags & UF_NODUMP) && (to_cp->c_parentcnid != from_cp->c_parentcnid)) { to_cp->c_flags &= ~UF_NODUMP; to_cp->c_flag |= C_CHANGE; } + HFS_KNOTE(from_vp, NOTE_ATTRIB); + HFS_KNOTE(to_vp, NOTE_ATTRIB); + Err_Exit: - if (to_rvp) - vrele(to_rvp); - if (from_rvp) - vrele(from_rvp); + cat_postflight(hfsmp, &cookie, ap->a_p); // XXXdbg if (started_tr) { @@ -1161,12 +1222,7 @@ if (vp->v_flag & VSYSTEM) { if (VTOF(vp)->fcbBTCBPtr != NULL) { // XXXdbg - if (hfsmp->jnl) { - if (BTIsDirty(VTOF(vp))) { - panic("hfs: system file vp 0x%x has dirty blocks (jnl 0x%x)\n", - vp, hfsmp->jnl); - } - } else { + if (hfsmp->jnl == NULL) { BTFlushPath(VTOF(vp)); } } @@ -1311,6 +1367,18 @@ !ISSET(cp->c_flag, C_DELETED | C_NOEXISTS)) { hfs_metasync(VTOHFS(vp), cp->c_hint, ap->a_p); } + + // make sure that we've really been called from the user + // fsync() and if so push out any pending transactions + 
// that this file might is a part of (and get them on + // stable storage). + if (vp->v_flag & VFULLFSYNC) { + if (hfsmp->jnl) { + journal_flush(hfsmp->jnl); + } else { + VOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, ap->a_p); + } + } } return (retval); @@ -1443,13 +1511,25 @@ struct componentname *a_cnp; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct vnode *dvp = ap->a_dvp; - struct proc *p = ap->a_cnp->cn_proc; + return (hfs_removedir(ap->a_dvp, ap->a_vp, ap->a_cnp, 0)); +} + +/* + * hfs_removedir + */ +static int +hfs_removedir(dvp, vp, cnp, options) + struct vnode *dvp; + struct vnode *vp; + struct componentname *cnp; + int options; +{ + struct proc *p = cnp->cn_proc; struct cnode *cp; struct cnode *dcp; struct hfsmount * hfsmp; struct timeval tv; + cat_cookie_t cookie = {0}; int error = 0, started_tr = 0, grabbed_lock = 0; cp = VTOC(vp); @@ -1465,7 +1545,6 @@ #if QUOTA (void)hfs_getinoquota(cp); #endif - // XXXdbg hfs_global_shared_lock_acquire(hfsmp); grabbed_lock = 1; @@ -1476,6 +1555,15 @@ started_tr = 1; } + if (!(options & HFSRM_SKIP_RESERVE)) { + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + goto out; + } + } + /* * Verify the directory is empty (and valid). * (Rmdir ".." 
won't be valid since @@ -1520,16 +1608,22 @@ dcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; (void) VOP_UPDATE(dvp, &tv, &tv, 0); + HFS_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID)); cp->c_mode = 0; /* Makes the vnode go away...see inactive */ cp->c_flag |= C_NOEXISTS; out: - if (dvp) + if (!(options & HFSRM_PARENT_LOCKED)) { vput(dvp); + } + HFS_KNOTE(vp, NOTE_DELETE); vput(vp); + if (!(options & HFSRM_SKIP_RESERVE)) { + cat_postflight(hfsmp, &cookie, p); + } // XXXdbg if (started_tr) { journal_end_transaction(hfsmp->jnl); @@ -1561,23 +1655,42 @@ struct componentname *a_cnp; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct vnode *dvp = ap->a_dvp; + return (hfs_removefile(ap->a_dvp, ap->a_vp, ap->a_cnp, 0)); +} + + + +/* + * hfs_removefile + * + * Similar to hfs_remove except there are additional options. + */ +static int +hfs_removefile(dvp, vp, cnp, options) + struct vnode *dvp; + struct vnode *vp; + struct componentname *cnp; + int options; +{ struct vnode *rvp = NULL; struct cnode *cp; struct cnode *dcp; struct hfsmount *hfsmp; - struct proc *p = current_proc(); + struct proc *p = cnp->cn_proc; int dataforkbusy = 0; int rsrcforkbusy = 0; int truncated = 0; struct timeval tv; + cat_cookie_t cookie = {0}; int error = 0; int started_tr = 0, grabbed_lock = 0; + int refcount, isbigfile = 0; - /* Redirect directories to rmdir */ - if (vp->v_type == VDIR) - return (hfs_rmdir(ap)); + /* Directories should call hfs_rmdir! */ + if (vp->v_type == VDIR) { + error = EISDIR; + goto out; + } cp = VTOC(vp); dcp = VTOC(dvp); @@ -1610,7 +1723,7 @@ if (hfsmp->jnl && cp->c_datafork) { struct HFSPlusExtentDescriptor *extd; - extd = &cp->c_datafork->ff_data.cf_extents[0]; + extd = &cp->c_datafork->ff_extents[0]; if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { error = EPERM; goto out; @@ -1624,18 +1737,27 @@ * vnode (vp). And we took a ref on the resource vnode (rvp). 
* Hence set 1 in the tookref parameter of ubc_isinuse(). */ - if (UBCISVALID(vp) && ubc_isinuse(vp, 1)) + if (VTOC(vp)->c_flag & C_VPREFHELD) { + refcount = 2; + } else { + refcount = 1; + } + if (UBCISVALID(vp) && ubc_isinuse(vp, refcount)) dataforkbusy = 1; if (rvp && UBCISVALID(rvp) && ubc_isinuse(rvp, 1)) rsrcforkbusy = 1; + // need this to check if we have to break the deletion + // into multiple pieces + isbigfile = (VTOC(vp)->c_datafork->ff_size >= HFS_BIGFILE_SIZE); + /* * Carbon semantics prohibit deleting busy files. * (enforced when NODELETEBUSY is requested) */ if ((dataforkbusy || rsrcforkbusy) && - ((ap->a_cnp->cn_flags & NODELETEBUSY) || - (hfsmp->hfs_private_metadata_dir == 0))) { + ((cnp->cn_flags & NODELETEBUSY) || + (hfsmp->hfs_privdir_desc.cd_cnid == 0))) { error = EBUSY; goto out; } @@ -1654,6 +1776,15 @@ started_tr = 1; } + if (!(options & HFSRM_SKIP_RESERVE)) { + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + goto out; + } + } + /* Remove our entry from the namei cache. */ cache_purge(vp); @@ -1695,7 +1826,7 @@ if ((cp->c_flag & C_HARDLINK) == 0) { int mode = cp->c_mode; - if (!dataforkbusy && cp->c_datafork->ff_blocks != 0) { + if (!dataforkbusy && !isbigfile && cp->c_datafork->ff_blocks != 0) { cp->c_mode = 0; /* Suppress VOP_UPDATES */ error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p); cp->c_mode = mode; @@ -1722,16 +1853,16 @@ if (cp->c_flag & C_HARDLINK) { struct cat_desc desc; - if ((ap->a_cnp->cn_flags & HASBUF) == 0 || - ap->a_cnp->cn_nameptr[0] == '\0') { + if ((cnp->cn_flags & HASBUF) == 0 || + cnp->cn_nameptr[0] == '\0') { error = ENOENT; /* name missing! 
*/ goto out; } /* Setup a descriptor for the link */ bzero(&desc, sizeof(desc)); - desc.cd_nameptr = ap->a_cnp->cn_nameptr; - desc.cd_namelen = ap->a_cnp->cn_namelen; + desc.cd_nameptr = cnp->cn_nameptr; + desc.cd_namelen = cnp->cn_namelen; desc.cd_parentcnid = dcp->c_cnid; /* XXX - if cnid is out of sync then the wrong thread rec will get deleted. */ desc.cd_cnid = cp->c_cnid; @@ -1760,7 +1891,7 @@ bzero(&from_desc, sizeof(from_desc)); from_desc.cd_nameptr = inodename; from_desc.cd_namelen = strlen(inodename); - from_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + from_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid; from_desc.cd_flags = 0; from_desc.cd_cnid = cp->c_fileid; @@ -1768,7 +1899,7 @@ bzero(&to_desc, sizeof(to_desc)); to_desc.cd_nameptr = delname; to_desc.cd_namelen = strlen(delname); - to_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid; to_desc.cd_flags = 0; to_desc.cd_cnid = cp->c_fileid; @@ -1780,10 +1911,6 @@ /* Unlock the Catalog */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - /* All done with component name... 
*/ - if ((ap->a_cnp->cn_flags & (HASBUF | SAVENAME)) == (HASBUF | SAVENAME)) - FREE_ZONE(ap->a_cnp->cn_pnbuf, ap->a_cnp->cn_pnlen, M_NAMEI); - if (error != 0) goto out; @@ -1793,7 +1920,7 @@ hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); - } else if (dataforkbusy || rsrcforkbusy) { + } else if (dataforkbusy || rsrcforkbusy || isbigfile) { char delname[32]; struct cat_desc to_desc; struct cat_desc todir_desc; @@ -1808,7 +1935,7 @@ bzero(&to_desc, sizeof(to_desc)); to_desc.cd_nameptr = delname; to_desc.cd_namelen = strlen(delname); - to_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid; to_desc.cd_flags = 0; to_desc.cd_cnid = cp->c_cnid; @@ -1839,9 +1966,14 @@ } else /* Not busy */ { if (cp->c_blocks > 0) { - printf("hfs_remove: attempting to delete a non-empty file!"); +#if 0 + panic("hfs_remove: attempting to delete a non-empty file!"); +#else + printf("hfs_remove: attempting to delete a non-empty file %s\n", + cp->c_desc.cd_nameptr); error = EBUSY; goto out; +#endif } /* Lock catalog b-tree */ @@ -1852,10 +1984,10 @@ error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); if (error && error != ENXIO && error != ENOENT && truncated) { - if ((cp->c_datafork && cp->c_datafork->ff_data.cf_size != 0) || - (cp->c_rsrcfork && cp->c_rsrcfork->ff_data.cf_size != 0)) { + if ((cp->c_datafork && cp->c_datafork->ff_size != 0) || + (cp->c_rsrcfork && cp->c_rsrcfork->ff_size != 0)) { panic("hfs: remove: couldn't delete a truncated file! 
(%d, data sz %lld; rsrc sz %lld)", - error, cp->c_datafork->ff_data.cf_size, cp->c_rsrcfork->ff_data.cf_size); + error, cp->c_datafork->ff_size, cp->c_rsrcfork->ff_size); } else { printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n", cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error); @@ -1871,6 +2003,7 @@ #endif /* QUOTA */ cp->c_mode = 0; + truncated = 0; // because the catalog entry is gone cp->c_flag |= C_CHANGE | C_NOEXISTS; --cp->c_nlink; hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); @@ -1894,39 +2027,29 @@ dcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; (void) VOP_UPDATE(dvp, &tv, &tv, 0); + HFS_KNOTE(dvp, NOTE_WRITE); - // XXXdbg - if (started_tr) { - journal_end_transaction(hfsmp->jnl); - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); +out: + /* All done with component name... */ + if ((options & HFSRM_SAVE_NAME) == 0 && + (cnp != 0) && + (cnp->cn_flags & (HASBUF | SAVENAME)) == (HASBUF | SAVENAME)) { + char *tmp = cnp->cn_pnbuf; + cnp->cn_pnbuf = NULL; + cnp->cn_flags &= ~HASBUF; + FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); } - if (rvp) - vrele(rvp); - VOP_UNLOCK(vp, 0, p); - // XXXdbg - try to prevent the lost ubc_info panic - if ((cp->c_flag & C_HARDLINK) == 0 || cp->c_nlink == 0) { - (void) ubc_uncache(vp); + if (!(options & HFSRM_SKIP_RESERVE)) { + cat_postflight(hfsmp, &cookie, p); } - vrele(vp); - vput(dvp); - - return (0); -out: - if (rvp) - vrele(rvp); - /* Commit the truncation to the catalog record */ if (truncated) { - cp->c_flag |= C_CHANGE | C_UPDATE; - tv = time; - (void) VOP_UPDATE(vp, &tv, &tv, 0); + cp->c_flag |= C_CHANGE | C_UPDATE | C_FORCEUPDATE; + tv = time; + (void) VOP_UPDATE(vp, &tv, &tv, 0); } - vput(vp); - vput(dvp); // XXXdbg if (started_tr) { @@ -1936,6 +2059,26 @@ hfs_global_shared_lock_release(hfsmp); } + HFS_KNOTE(vp, NOTE_DELETE); + if (rvp) { + HFS_KNOTE(rvp, NOTE_DELETE); + vrele(rvp); + }; + + if (error) { + vput(vp); + } else { + VOP_UNLOCK(vp, 0, p); 
+ // XXXdbg - try to prevent the lost ubc_info panic + if ((cp->c_flag & C_HARDLINK) == 0 || cp->c_nlink == 0) { + (void) ubc_uncache(vp); + } + vrele(vp); + } + if (!(options & HFSRM_PARENT_LOCKED)) { + vput(dvp); + } + return (error); } @@ -1950,7 +2093,7 @@ cp->c_desc.cd_nameptr = 0; cp->c_desc.cd_namelen = 0; cp->c_desc.cd_flags &= ~CD_HASBUF; - FREE(name, M_TEMP); + remove_name(name); } bcopy(cdp, &cp->c_desc, sizeof(cp->c_desc)); @@ -1963,19 +2106,11 @@ /* # -#% rename fdvp U U U -#% rename fvp U U U -#% rename tdvp L U U -#% rename tvp X U U -# - vop_rename { - IN WILLRELE struct vnode *fdvp; - IN WILLRELE struct vnode *fvp; - IN struct componentname *fcnp; - IN WILLRELE struct vnode *tdvp; - IN WILLRELE struct vnode *tvp; - IN struct componentname *tcnp; - }; +#% rename fdvp U U U +#% rename fvp U U U +#% rename tdvp L U U +#% rename tvp X U U +# */ /* * Rename a cnode. @@ -2014,53 +2149,114 @@ struct cat_desc from_desc; struct cat_desc to_desc; struct cat_desc out_desc; - struct hfsmount *hfsmp; + struct hfsmount *hfsmp = NULL; struct timeval tv; - int fdvp_locked, fvp_locked, tdvp_locked; + cat_cookie_t cookie = {0}; + int fdvp_locked, fvp_locked, tdvp_locked, tvp_locked; int tvp_deleted; int started_tr = 0, grabbed_lock = 0; int error = 0; - hfsmp = VTOHFS(tdvp); /* Establish our vnode lock state. */ tdvp_locked = 1; + tvp_locked = (tvp != 0); fdvp_locked = 0; fvp_locked = 0; tvp_deleted = 0; /* + * Check for cross-device rename. + */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; + goto out; + } + + /* * When fvp matches tvp they must be case variants * or hard links. * - * For the hardlink case there can be an extra ref on fvp. + * In some cases tvp will be locked in other cases + * it be unlocked with no reference. Normalize the + * state here (unlocked with a reference) so that + * we can exit in a known state. 
*/ if (fvp == tvp) { - if (VOP_ISLOCKED(fvp) && - (VTOC(fvp)->c_lock.lk_lockholder == p->p_pid) && - (VTOC(fvp)->c_lock.lk_lockthread == current_thread())) { - fvp_locked = 1; - vrele(fvp); /* drop the extra ref */ + if (VOP_ISLOCKED(tvp) && + (VTOC(tvp)->c_lock.lk_lockholder == p->p_pid) && + (VTOC(tvp)->c_lock.lk_lockthread == current_thread())) { + vput(tvp); } tvp = NULL; + tvp_locked = 0; + /* - * If this a hard link and its not a case - * variant then keep tvp around for removal. + * If this a hard link with different parents + * and its not a case variant then keep tvp + * around for removal. */ if ((VTOC(fvp)->c_flag & C_HARDLINK) && ((fdvp != tdvp) || (hfs_namecmp(fcnp->cn_nameptr, fcnp->cn_namelen, tcnp->cn_nameptr, tcnp->cn_namelen) != 0))) { tvp = fvp; + vref(tvp); } } /* - * Check for cross-device rename. + * The following edge case is caught here: + * (to cannot be a descendent of from) + * + * o fdvp + * / + * / + * o fvp + * \ + * \ + * o tdvp + * / + * / + * o tvp */ - if ((fvp->v_mount != tdvp->v_mount) || - (tvp && (fvp->v_mount != tvp->v_mount))) { - error = EXDEV; + if (tdcp->c_parentcnid == VTOC(fvp)->c_cnid) { + error = EINVAL; + goto out; + } + + /* + * The following two edge cases are caught here: + * (note tvp is not empty) + * + * o tdvp o tdvp + * / / + * / / + * o tvp tvp o fdvp + * \ \ + * \ \ + * o fdvp o fvp + * / + * / + * o fvp + */ + if (tvp && (tvp->v_type == VDIR) && (VTOC(tvp)->c_entries != 0)) { + error = ENOTEMPTY; + goto out; + } + + /* + * The following edge case is caught here: + * (the from child and parent are the same) + * + * o tdvp + * / + * / + * fdvp o fvp + */ + if (fdvp == fvp) { + error = EINVAL; goto out; } @@ -2073,16 +2269,7 @@ goto out; } - /* - * Be sure we are not renaming ".", "..", or an alias of ".". 
- */ - if ((fvp->v_type == VDIR) && - (((fcnp->cn_namelen == 1) && (fcnp->cn_nameptr[0] == '.')) || - (fdvp == fvp) || - (fcnp->cn_flags&ISDOTDOT))) { - error = EINVAL; - goto out; - } + hfsmp = VTOHFS(tdvp); /* * If the destination parent directory is "sticky", then the @@ -2090,146 +2277,123 @@ * the rename, otherwise the destination may not be changed * (except by root). This implements append-only directories. * - * Note that checks for immutable, write access, and a non-empty - * target are done by the call to VOP_REMOVE. + * Note that checks for immutable and write access are done + * by the call to VOP_REMOVE. */ if (tvp && (tdcp->c_mode & S_ISTXT) && (tcnp->cn_cred->cr_uid != 0) && (tcnp->cn_cred->cr_uid != tdcp->c_uid) && (hfs_owner_rights(hfsmp, VTOC(tvp)->c_uid, tcnp->cn_cred, p, false)) ) { - error = EPERM; - goto out; + error = EPERM; + goto out; } +#if QUOTA + if (tvp) + (void)hfs_getinoquota(VTOC(tvp)); +#endif + /* - * All done with preflighting. - * - * We now break the call into two transactions: - * 1 - Remove the destionation (if any) using VOP_REMOVE, - * which in itself is a complete transaction. - * - * 2 - Rename source to destination. - * - * Since all the preflighting is done, we assume that a - * rename failure is unlikely once part 1 is complete. - * Breaking rename into two transactions buys us a much - * simpler implementation with respect to the locking - * protocol. There are only 3 vnodes to worry about - * locking in the correct order (instead of 4). + * Lock all the vnodes before starting a journal transaction. */ /* - * Part 1 - If the destination exists then it needs to be removed. + * Simple case (same parent) - just lock child (fvp). */ - if (tvp) { - /* - * VOP_REMOVE will vput tdvp so we better bump its - * ref count and relockit, always set tvp to NULL - * afterwards to indicate that we're done with it. 
- */ - VREF(tdvp); - - if (tvp == fvp) { - if (fvp_locked) { - VREF(fvp); - } else { - error = vget(fvp, LK_EXCLUSIVE | LK_RETRY, p); - if (error) - goto out; - fvp_locked = 1; - } - } else { - cache_purge(tvp); - } - - /* Clear SAVENAME to keep VOP_REMOVE from smashing tcnp. */ - tcnp->cn_flags &= ~SAVENAME; - - if (tvp->v_type == VDIR) - error = VOP_RMDIR(tdvp, tvp, tcnp); - else - error = VOP_REMOVE(tdvp, tvp, tcnp); - - /* Get lock states back in sync. */ - tdvp_locked = 0; - if (tvp == fvp) - fvp_locked = 0; - tvp = NULL; /* all done with tvp */ - tvp_deleted = 1; - - if (error) - goto out; /* couldn't remove destination! */ + if (fdvp == tdvp) { + if (error = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)) + goto out; + fvp_locked = 1; + goto vnlocked; } - /* - * All done with tvp. - * - * For POSIX compliance, if tvp was removed the only - * error we can return from this point on is EIO. - */ /* - * Part 2 - rename source to destination + * If fdvp is the parent of tdvp then we'll need to + * drop tdvp's lock before acquiring a lock on fdvp. + * + * fdvp + * o + * / \ + * / \ + * tdvp o o fvp + * \ + * \ + * o tvp + * + * + * If the parent directories are unrelated then we'll + * need to aquire their vnode locks in vnode address + * order. Otherwise we can race with another rename + * call that involves the same vnodes except that to + * and from are switched and potentially deadlock. + * [ie rename("a/b", "c/d") vs rename("c/d", "a/b")] + * + * If its not either of the two above cases then we + * can safely lock fdvp and fvp. */ + if ((VTOC(fdvp)->c_cnid == VTOC(tdvp)->c_parentcnid) || + ((VTOC(tdvp)->c_cnid != VTOC(fdvp)->c_parentcnid) && + (fdvp < tdvp))) { - /* - * Lock the vnodes before starting a journal transaction. - */ - if (fdvp != tdvp) { - /* - * fvp is a child and must be locked last. 
- */ - if (fvp_locked) { - VOP_UNLOCK(fvp, 0, p); - fvp_locked = 0; + /* Drop locks on tvp and tdvp */ + if (tvp_locked) { + VOP_UNLOCK(tvp, 0, p); + tvp_locked = 0; } + VOP_UNLOCK(tdvp, 0, p); + tdvp_locked = 0; + + /* Aquire locks in correct order */ + if ((error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p))) + goto out; + fdvp_locked = 1; + if ((error = vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY, p))) + goto out; + tdvp_locked = 1; + /* - * If fdvp is the parent of tdvp then it needs to be locked first. + * Now that the parents are locked only one thread + * can continue. So the lock order of the children + * doesn't really matter */ - if ((VTOC(fdvp)->c_cnid == VTOC(tdvp)->c_parentcnid)) { - if (tdvp_locked) { - VOP_UNLOCK(tdvp, 0, p); - tdvp_locked = 0; - } - if ((error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto out; - fdvp_locked = 1; - if ((error = vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY, p))) + if (tvp == fvp) { + if ((error = vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p))) goto out; - tdvp_locked = 1; - - } else /* Lock tdvp then fdvp */ { - if (!tdvp_locked) { - if ((error = vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY, p))) + tvp_locked = 1; + } else { + if (tvp) { + if ((error = vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p))) goto out; - tdvp_locked = 1; + tvp_locked = 1; } - if ((error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p))) + if ((error = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p))) goto out; - fdvp_locked = 1; + fvp_locked = 1; } - } else if (!tdvp_locked) { - /* - * fvp is a child and must be locked last. 
- */ - if (fvp_locked) { - VOP_UNLOCK(fvp, 0, p); - fvp_locked = 0; - } - if ((error = vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto out; - tdvp_locked = 1; - } - /* Now its safe to lock fvp */ - if (!fvp_locked) { + } else /* OK to lock fdvp and fvp */ { + if ((error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p))) + goto out; + fdvp_locked = 1; if (error = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)) goto out; - fvp_locked = 1; + if (tvp == fvp) + tvp_locked = 1; + else + fvp_locked = 1; } +vnlocked: fdcp = VTOC(fdvp); fcp = VTOC(fvp); + /* + * While fvp is still locked, purge it from the name cache and + * grab it's c_cnid value. Note that the removal of tvp (below) + * can drop fvp's lock when fvp == tvp. + */ + cache_purge(fvp); + /* * When a file moves out of "Cleanup At Startup" * we can drop its NODUMP status. @@ -2238,24 +2402,13 @@ (fvp->v_type == VREG) && (fdvp != tdvp) && (fdcp->c_desc.cd_nameptr != NULL) && - (strcmp(fdcp->c_desc.cd_nameptr, "Cleanup At Startup") == 0)) { + (strcmp(fdcp->c_desc.cd_nameptr, CARBON_TEMP_DIR_NAME) == 0)) { fcp->c_flags &= ~UF_NODUMP; fcp->c_flag |= C_CHANGE; tv = time; (void) VOP_UPDATE(fvp, &tv, &tv, 0); } - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - if (hfsmp->jnl) { - if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { - goto out; - } - started_tr = 1; - } - - cache_purge(fvp); - bzero(&from_desc, sizeof(from_desc)); from_desc.cd_nameptr = fcnp->cn_nameptr; from_desc.cd_namelen = fcnp->cn_namelen; @@ -2270,6 +2423,52 @@ to_desc.cd_flags = fcp->c_desc.cd_flags & ~(CD_HASBUF | CD_DECOMPOSED); to_desc.cd_cnid = fcp->c_cnid; + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + goto out; + } + started_tr = 1; + } + + /* + * Reserve some space in the Catalog file. 
+ */ + if ((error = cat_preflight(hfsmp, CAT_RENAME + CAT_DELETE, &cookie, p))) { + goto out; + } + + /* + * If the destination exists then it needs to be removed. + */ + + if (tvp) { + if (tvp != fvp) + cache_purge(tvp); + /* + * Note that hfs_removedir and hfs_removefile + * will keep tdvp locked with a reference. + * But tvp will lose its lock and reference. + */ + if (tvp->v_type == VDIR) + error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_RENAMEOPTS); + else + error = hfs_removefile(tdvp, tvp, tcnp, HFSRM_RENAMEOPTS); + + if (tvp == fvp) + fvp_locked = 0; + tvp = NULL; + tvp_locked = 0; + tvp_deleted = 1; + if (error) + goto out; + } + + /* + * All done with tvp and fvp + */ + /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (error) @@ -2279,22 +2478,23 @@ /* Unlock catalog b-tree */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (error) + + if (error) { goto out; + } /* Update cnode's catalog descriptor */ - replace_desc(fcp, &out_desc); + if (fvp_locked) { + replace_desc(fcp, &out_desc); + fcp->c_parentcnid = tdcp->c_cnid; + fcp->c_hint = 0; + } hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_RMDIR : VOL_RMFILE, (fdcp->c_cnid == kHFSRootFolderID)); hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_MKDIR : VOL_MKFILE, (tdcp->c_cnid == kHFSRootFolderID)); - VOP_UNLOCK(fvp, 0, p); - fcp = NULL; - fvp_locked = 0; - /* All done with fvp. */ - /* Update both parent directories. */ tv = time; if (fdvp != tdvp) { @@ -2312,6 +2512,9 @@ (void) VOP_UPDATE(tdvp, &tv, &tv, 0); out: + if (hfsmp) { + cat_postflight(hfsmp, &cookie, p); + } if (started_tr) { journal_end_transaction(hfsmp->jnl); } @@ -2319,6 +2522,14 @@ hfs_global_shared_lock_release(hfsmp); } + /* Note that if hfs_removedir or hfs_removefile was invoked above they will already have + generated a NOTE_WRITE for tdvp and a NOTE_DELETE for tvp. 
+ */ + if (error == 0) { + HFS_KNOTE(fvp, NOTE_RENAME); + HFS_KNOTE(fdvp, NOTE_WRITE); + if (tdvp != fdvp) HFS_KNOTE(tdvp, NOTE_WRITE); + }; if (fvp_locked) { VOP_UNLOCK(fvp, 0, p); } @@ -2328,18 +2539,18 @@ if (tdvp_locked) { VOP_UNLOCK(tdvp, 0, p); } - if (tvp && (tvp != fvp)) { - if (tvp != tdvp) - VOP_UNLOCK(tvp, 0, p); - vrele(tvp); + if (tvp_locked) { + VOP_UNLOCK(tvp, 0, p); } vrele(fvp); vrele(fdvp); + if (tvp) + vrele(tvp); vrele(tdvp); /* After tvp is removed the only acceptable error is EIO */ - if ((error == ENOSPC) && tvp_deleted) + if (error && tvp_deleted) error = EIO; return (error); @@ -2441,7 +2652,6 @@ vp = *vpp; len = strlen(ap->a_target); fp = VTOF(vp); - fp->ff_clumpsize = VTOVCB(vp)->blockSize; #if QUOTA (void)hfs_getinoquota(VTOC(vp)); @@ -2570,6 +2780,10 @@ int eofflag = 0; void *user_start = NULL; int user_len; + + int ncookies=0; + u_long *cookies=NULL; + u_long *cookiep=NULL; /* We assume it's all one big buffer... */ if (uio->uio_iovcnt > 1 || uio->uio_resid < AVERAGE_HFSDIRENTRY_SIZE) @@ -2602,7 +2816,6 @@ } } - /* Create the entries for . and .. */ if (uio->uio_offset < sizeof(rootdots)) { caddr_t dep; @@ -2627,10 +2840,58 @@ goto Exit; } + if (ap->a_ncookies != NULL) { + /* + * These cookies are handles that allow NFS to restart + * scanning through a directory. If a directory is large + * enough, NFS will issue a successive readdir() with a + * uio->uio_offset that is equal to one of these cookies. + * + * The cookies that we generate are synthesized byte-offsets. + * The offset is where the dirent the dirent would be if the + * directory were an array of packed dirent structs. It is + * synthetic because that's not how directories are stored in + * HFS but other code expects that the cookie is a byte offset. 
+ * + * We have to pre-allocate the cookies because cat_getdirentries() + * is the only one that can properly synthesize the offsets (since + * it may have to skip over entries and only it knows the true + * virtual offset of any particular directory entry). So we allocate + * a cookie table here and pass it in to cat_getdirentries(). + * + * Note that the handling of "." and ".." is mostly done here but + * cat_getdirentries() is aware of. + * + * Only the NFS server uses cookies so fortunately this code is + * not executed unless the NFS server is issuing the readdir + * request. + * + * Also note that the NFS server is the one responsible for + * free'ing the cookies even though we allocated them. Ick. + * + * We allocate a reasonable number of entries for the size of + * the buffer that we're going to fill in. cat_getdirentries() + * is smart enough to not overflow if there's more room in the + * buffer but not enough room in the cookie table. + */ + if (uio->uio_segflg != UIO_SYSSPACE) + panic("hfs_readdir: unexpected uio from NFS server"); + + ncookies = uio->uio_iov->iov_len / (AVERAGE_HFSDIRENTRY_SIZE/2); + MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); + + *ap->a_ncookies = ncookies; + *ap->a_cookies = cookies; + } + /* If there are no children then we're done */ if (cp->c_entries == 0) { eofflag = 1; retval = 0; + if (cookies) { + cookies[0] = 0; + cookies[1] = sizeof(struct hfsdotentry); + } goto Exit; } @@ -2638,7 +2899,7 @@ retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); if (retval) goto Exit; - retval = cat_getdirentries(hfsmp, &cp->c_desc, uio, &eofflag); + retval = cat_getdirentries(hfsmp, &cp->c_desc, cp->c_entries, uio, &eofflag, cookies, ncookies); /* Unlock catalog b-tree */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); @@ -2654,38 +2915,6 @@ } cp->c_flag |= C_ACCESS; - /* Bake any cookies */ - if (!retval && ap->a_ncookies != NULL) { - struct dirent* dpStart; - struct dirent* 
dpEnd; - struct dirent* dp; - int ncookies; - u_long *cookies; - u_long *cookiep; - - /* - * Only the NFS server uses cookies, and it loads the - * directory block into system space, so we can just look at - * it directly. - */ - if (uio->uio_segflg != UIO_SYSSPACE) - panic("hfs_readdir: unexpected uio from NFS server"); - dpStart = (struct dirent *)(uio->uio_iov->iov_base - (uio->uio_offset - off)); - dpEnd = (struct dirent *) uio->uio_iov->iov_base; - for (dp = dpStart, ncookies = 0; - dp < dpEnd && dp->d_reclen != 0; - dp = (struct dirent *)((caddr_t)dp + dp->d_reclen)) - ncookies++; - MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); - for (dp = dpStart, cookiep = cookies; - dp < dpEnd; - dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { - off += dp->d_reclen; - *cookiep++ = (u_long) off; - } - *ap->a_ncookies = ncookies; - *ap->a_cookies = cookies; - } Exit:; if (hfsmp->jnl && user_start) { @@ -2761,40 +2990,28 @@ } } retval = uiomove((caddr_t)fp->ff_symlinkptr, (int)fp->ff_size, ap->a_uio); - - return (retval); -} - - -/* - * hfs abort op, called after namei() when a CREATE/DELETE isn't actually - * done. If a buffer has been saved in anticipation of a CREATE, delete it. -#% abortop dvp = = = -# - vop_abortop { - IN struct vnode *dvp; - IN struct componentname *cnp; - - */ - -/* ARGSUSED */ - -static int -hfs_abortop(ap) - struct vop_abortop_args /* { - struct vnode *a_dvp; - struct componentname *a_cnp; - } */ *ap; -{ - if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) { - FREE_ZONE(ap->a_cnp->cn_pnbuf, ap->a_cnp->cn_pnlen, M_NAMEI); - ap->a_cnp->cn_flags &= ~HASBUF; +#if 1 + /* + * Keep track blocks read + */ + if ((VTOHFS(vp)->hfc_stage == HFC_RECORDING) && (retval == 0)) { + + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. 
+ */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) + VTOF(vp)->ff_bytesread = fp->ff_size; + else + VTOF(vp)->ff_bytesread += fp->ff_size; + + // if (VTOF(vp)->ff_bytesread > fp->ff_size) + // cp->c_flag |= C_ACCESS; } - - return (0); +#endif + return (retval); } - /* * Lock an cnode. If its already locked, set the WANT bit and sleep. #% lock vp U L U @@ -2816,9 +3033,6 @@ struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); - if (cp == NULL) - panic("hfs_lock: cnode in vnode is null\n"); - return (lockmgr(&cp->c_lock, ap->a_flags, &vp->v_interlock, ap->a_p)); } @@ -2842,10 +3056,12 @@ { struct vnode *vp = ap->a_vp; struct cnode *cp = VTOC(vp); - - if (cp == NULL) - panic("hfs_unlock: cnode in vnode is null\n"); - +#if 0 + if (!lockstatus(&cp->c_lock)) { + printf("hfs_unlock: vnode %s wasn't locked!\n", + cp->c_desc.cd_nameptr ? cp->c_desc.cd_nameptr : ""); + } +#endif return (lockmgr(&cp->c_lock, ap->a_flags | LK_RELEASE, &vp->v_interlock, ap->a_p)); } @@ -2929,6 +3145,9 @@ case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; /* 1024 */ break; + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; @@ -2939,7 +3158,10 @@ *ap->a_retval = kHFSPlusMaxFileNameChars; break; case _PC_CASE_SENSITIVE: - *ap->a_retval = 0; + if (VTOHFS(ap->a_vp)->hfs_flags & HFS_CASE_SENSITIVE) + *ap->a_retval = 1; + else + *ap->a_retval = 0; break; case _PC_CASE_PRESERVING: *ap->a_retval = 1; @@ -3015,12 +3237,16 @@ return (EINVAL); } - if (start < 0) - return (EINVAL); if (fl->l_len == 0) end = -1; - else + else if (fl->l_len > 0) end = start + fl->l_len - 1; + else { /* l_len is negative */ + end = start - 1; + start += fl->l_len; + } + if (start < 0) + return (EINVAL); /* * Create the hfslockf structure @@ -3098,14 +3324,14 @@ hfsmp = VTOHFS(vp); /* XXX do we really want to clear the sytem cnode flags here???? 
*/ - if ((vp->v_flag & VSYSTEM) || - (VTOVFS(vp)->mnt_flag & MNT_RDONLY) || + if (((vp->v_flag & VSYSTEM) && (cp->c_cnid < kHFSFirstUserCatalogNodeID))|| + (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) || (cp->c_mode == 0)) { cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE); return (0); } - updateflag = cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE); + updateflag = cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_FORCEUPDATE); /* Nothing to update. */ if (updateflag == 0) { @@ -3117,23 +3343,15 @@ } if (updateflag & C_ACCESS) { /* - * If only the access time is changing then defer - * updating it on-disk util later (in hfs_inactive). - * If it was recently updated then skip the update. + * When the access time is the only thing changing + * then make sure its sufficiently newer before + * committing it to disk. */ - if (updateflag == C_ACCESS) { - cp->c_flag &= ~C_ACCESS; - - /* Its going to disk or its sufficiently newer... */ - if ((cp->c_flag & C_ATIMEMOD) || - (ap->a_access->tv_sec > (cp->c_atime + ATIME_ACCURACY))) { - cp->c_atime = ap->a_access->tv_sec; - cp->c_flag |= C_ATIMEMOD; - } + if ((updateflag == C_ACCESS) && + (ap->a_access->tv_sec < (cp->c_atime + ATIME_ONDISK_ACCURACY))) { return (0); - } else { - cp->c_atime = ap->a_access->tv_sec; } + cp->c_atime = ap->a_access->tv_sec; } if (updateflag & C_UPDATE) { cp->c_mtime = ap->a_modify->tv_sec; @@ -3163,15 +3381,21 @@ * gets written to disk. * * Deleted files can defer meta data updates until inactive. + * + * If we're ever called with the C_FORCEUPDATE flag though + * we have to do the update. 
*/ - if (ISSET(cp->c_flag, C_DELETED) || + if (ISSET(cp->c_flag, C_FORCEUPDATE) == 0 && + (ISSET(cp->c_flag, C_DELETED) || (dataforkp && cp->c_datafork->ff_unallocblocks) || - (rsrcforkp && cp->c_rsrcfork->ff_unallocblocks)) { + (rsrcforkp && cp->c_rsrcfork->ff_unallocblocks))) { if (updateflag & (C_CHANGE | C_UPDATE)) hfs_volupdate(hfsmp, VOL_UPDATE, 0); cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_UPDATE); cp->c_flag |= C_MODIFIED; + HFS_KNOTE(vp, NOTE_ATTRIB); + return (0); } @@ -3195,6 +3419,19 @@ bcopy(dataforkp, &datafork, sizeof(datafork)); datafork.cf_size = CIRCLEQ_FIRST(&cp->c_datafork->ff_invalidranges)->rl_start; dataforkp = &datafork; + } else if (dataforkp && (cp->c_datafork->ff_unallocblocks != 0)) { + // always make sure the block count and the size + // of the file match the number of blocks actually + // allocated to the file on disk + bcopy(dataforkp, &datafork, sizeof(datafork)); + // make sure that we don't assign a negative block count + if (cp->c_datafork->ff_blocks < cp->c_datafork->ff_unallocblocks) { + panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", + cp->c_datafork->ff_blocks, cp->c_datafork->ff_unallocblocks); + } + datafork.cf_blocks = (cp->c_datafork->ff_blocks - cp->c_datafork->ff_unallocblocks); + datafork.cf_size = datafork.cf_blocks * HFSTOVCB(hfsmp)->blockSize; + dataforkp = &datafork; } /* @@ -3217,18 +3454,20 @@ /* Unlock the Catalog b-tree file. 
*/ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (updateflag & (C_CHANGE | C_UPDATE)) + if (updateflag & (C_CHANGE | C_UPDATE | C_FORCEUPDATE)) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + /* After the updates are finished, clear the flags */ + cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_FORCEUPDATE); + // XXXdbg if (hfsmp->jnl) { journal_end_transaction(hfsmp->jnl); } hfs_global_shared_lock_release(hfsmp); - /* After the updates are finished, clear the flags */ - cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_ATIMEMOD); - + HFS_KNOTE(vp, NOTE_ATTRIB); + return (error); } @@ -3253,6 +3492,7 @@ struct proc *p; struct cat_desc in_desc, out_desc; struct cat_attr attr; + cat_cookie_t cookie = {0}; int error, started_tr = 0, grabbed_lock = 0; enum vtype vnodetype; @@ -3339,6 +3579,17 @@ started_tr = 1; } + /* + * Reserve some space in the Catalog file. + * + * (we also add CAT_DELETE since our getnewvnode + * request can cause an hfs_inactive call to + * delete an unlinked file) + */ + if ((error = cat_preflight(hfsmp, CAT_CREATE | CAT_DELETE, &cookie, p))) { + goto exit; + } + /* Lock catalog b-tree */ error = hfs_metafilelocking(VTOHFS(dvp), kHFSCatalogFileID, LK_EXCLUSIVE, p); if (error) @@ -3358,6 +3609,11 @@ dcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; (void) VOP_UPDATE(dvp, &tv, &tv, 0); + if (vnodetype == VDIR) { + HFS_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + } else { + HFS_KNOTE(dvp, NOTE_WRITE); + }; hfs_volupdate(hfsmp, vnodetype == VDIR ? 
VOL_MKDIR : VOL_MKFILE, (dcp->c_cnid == kHFSRootFolderID)); @@ -3388,6 +3644,8 @@ if (error) goto exit; + // XXXdbg + cache_enter(dvp, tvp, cnp); #if QUOTA cp = VTOC(tvp); @@ -3398,16 +3656,15 @@ */ if ((error = hfs_getinoquota(cp)) || (error = hfs_chkiq(cp, 1, cnp->cn_cred, FORCE))) { - if ((cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) { - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - cnp->cn_flags &= ~HASBUF; - } if (tvp->v_type == VDIR) VOP_RMDIR(dvp,tvp, cnp); else VOP_REMOVE(dvp,tvp, cnp); - return (error); + // because VOP_RMDIR and VOP_REMOVE already + // have done the vput() + dvp = NULL; + goto exit; } #endif /* QUOTA */ @@ -3432,17 +3689,22 @@ exit: cat_releasedesc(&out_desc); - if ((cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); + cat_postflight(hfsmp, &cookie, p); + if ((cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) { + char *tmp = cnp->cn_pnbuf; + cnp->cn_pnbuf = NULL; + cnp->cn_flags &= ~HASBUF; + FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); + } /* * Check if a file is located in the "Cleanup At Startup" * directory. If it is then tag it as NODUMP so that we * can be lazy about zero filling data holes. */ - if ((error == 0) && (vnodetype == VREG) && + if ((error == 0) && dvp && (vnodetype == VREG) && (dcp->c_desc.cd_nameptr != NULL) && - (strcmp(dcp->c_desc.cd_nameptr, "Cleanup At Startup") == 0)) { + (strcmp(dcp->c_desc.cd_nameptr, CARBON_TEMP_DIR_NAME) == 0)) { struct vnode *ddvp; cnid_t parid; @@ -3463,11 +3725,9 @@ vput(ddvp); } } - if (dvp) vput(dvp); - // XXXdbg if (started_tr) { journal_end_transaction(hfsmp->jnl); started_tr = 0; @@ -3527,6 +3787,158 @@ } +static void +filt_hfsdetach(struct knote *kn) +{ + struct vnode *vp; + int result; + struct proc *p = current_proc(); + + vp = (struct vnode *)kn->kn_hook; + if (1) { /* ! 
KNDETACH_VNLOCKED */ + result = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (result) return; + }; + + result = KNOTE_DETACH(&VTOC(vp)->c_knotes, kn); + + if (1) { /* ! KNDETACH_VNLOCKED */ + VOP_UNLOCK(vp, 0, p); + }; +} + +/*ARGSUSED*/ +static int +filt_hfsread(struct knote *kn, long hint) +{ + struct vnode *vp = (struct vnode *)kn->kn_fp->f_data; + + if (hint == NOTE_REVOKE) { + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. + */ + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return (1); + } + + kn->kn_data = VTOF(vp)->ff_size - kn->kn_fp->f_offset; + return (kn->kn_data != 0); +} + +/*ARGSUSED*/ +static int +filt_hfswrite(struct knote *kn, long hint) +{ + if (hint == NOTE_REVOKE) { + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. + */ + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + } + + kn->kn_data = 0; + return (1); +} + +static int +filt_hfsvnode(struct knote *kn, long hint) +{ + + if (kn->kn_sfflags & hint) + kn->kn_fflags |= hint; + if (hint == NOTE_REVOKE) { + kn->kn_flags |= EV_EOF; + return (1); + } + return (kn->kn_fflags != 0); +} + +static struct filterops hfsread_filtops = + { 1, NULL, filt_hfsdetach, filt_hfsread }; +static struct filterops hfswrite_filtops = + { 1, NULL, filt_hfsdetach, filt_hfswrite }; +static struct filterops hfsvnode_filtops = + { 1, NULL, filt_hfsdetach, filt_hfsvnode }; + +/* + # + #% kqfilt_add vp L L L + # + vop_kqfilt_add + IN struct vnode *vp; + IN struct knote *kn; + IN struct proc *p; + */ +static int +hfs_kqfilt_add(ap) + struct vop_kqfilt_add_args /* { + struct vnode *a_vp; + struct knote *a_kn; + struct proc *p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct knote *kn = ap->a_kn; + + switch (kn->kn_filter) { + case EVFILT_READ: + if (vp->v_type == VREG) { + kn->kn_fop = &hfsread_filtops; + } else { + return EINVAL; + }; + break; + case EVFILT_WRITE: + if (vp->v_type == VREG) { + kn->kn_fop = &hfswrite_filtops; + } else { + return 
EINVAL; + }; + break; + case EVFILT_VNODE: + kn->kn_fop = &hfsvnode_filtops; + break; + default: + return (1); + } + + kn->kn_hook = (caddr_t)vp; + + /* simple_lock(&vp->v_pollinfo.vpi_lock); */ + KNOTE_ATTACH(&VTOC(vp)->c_knotes, kn); + /* simple_unlock(&vp->v_pollinfo.vpi_lock); */ + + return (0); +} + +/* + # + #% kqfilt_remove vp L L L + # + vop_kqfilt_remove + IN struct vnode *vp; + IN uintptr_t ident; + IN struct proc *p; + */ +static int +hfs_kqfilt_remove(ap) + struct vop_kqfilt_remove_args /* { + struct vnode *a_vp; + uintptr_t ident; + struct proc *p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + uintptr_t ident = ap->a_ident; + int result; + + result = ENOTSUP; /* XXX */ + + return (result); +} + /* * Wrapper for special device reads */ @@ -3656,6 +4068,43 @@ simple_unlock(&vp->v_interlock); return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); } + +/* + * kqfilt_add wrapper for fifos. + * + * Fall through to hfs kqfilt_add routines if needed + */ +int +hfsfifo_kqfilt_add(ap) + struct vop_kqfilt_add_args *ap; +{ + extern int (**fifo_vnodeop_p)(void *); + int error; + + error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_kqfilt_add), ap); + if (error) + error = hfs_kqfilt_add(ap); + return (error); +} + +/* + * kqfilt_remove wrapper for fifos. 
+ * + * Fall through to hfs kqfilt_remove routines if needed + */ +int +hfsfifo_kqfilt_remove(ap) + struct vop_kqfilt_remove_args *ap; +{ + extern int (**fifo_vnodeop_p)(void *); + int error; + + error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_kqfilt_remove), ap); + if (error) + error = hfs_kqfilt_remove(ap); + return (error); +} + #endif /* FIFO */ @@ -3706,6 +4155,7 @@ { &vop_write_desc, (VOPFUNC)hfs_write }, /* write */ { &vop_ioctl_desc, (VOPFUNC)hfs_ioctl }, /* ioctl */ { &vop_select_desc, (VOPFUNC)hfs_select }, /* select */ + { &vop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ { &vop_exchange_desc, (VOPFUNC)hfs_exchange }, /* exchange */ { &vop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ { &vop_fsync_desc, (VOPFUNC)hfs_fsync }, /* fsync */ @@ -3722,7 +4172,7 @@ { &vop_readdir_desc, (VOPFUNC)hfs_readdir }, /* readdir */ { &vop_readdirattr_desc, (VOPFUNC)hfs_readdirattr }, /* readdirattr */ { &vop_readlink_desc, (VOPFUNC)hfs_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)hfs_abortop }, /* abortop */ + { &vop_abortop_desc, (VOPFUNC)nop_abortop }, /* abortop */ { &vop_inactive_desc, (VOPFUNC)hfs_inactive }, /* inactive */ { &vop_reclaim_desc, (VOPFUNC)hfs_reclaim }, /* reclaim */ { &vop_lock_desc, (VOPFUNC)hfs_lock }, /* lock */ @@ -3745,6 +4195,8 @@ { &vop_blktooff_desc, (VOPFUNC)hfs_blktooff }, /* blktooff */ { &vop_offtoblk_desc, (VOPFUNC)hfs_offtoblk }, /* offtoblk */ { &vop_cmap_desc, (VOPFUNC)hfs_cmap }, /* cmap */ + { &vop_kqfilt_add_desc, (VOPFUNC)hfs_kqfilt_add }, /* kqfilt_add */ + { &vop_kqfilt_remove_desc, (VOPFUNC)hfs_kqfilt_remove }, /* kqfilt_remove */ { NULL, (VOPFUNC)NULL } }; @@ -3776,6 +4228,7 @@ { &vop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ { &vop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ { &vop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ + { &vop_getattrlist_desc, (VOPFUNC)hfs_getattrlist }, { &vop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ { &vop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ { 
&vop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ @@ -3834,6 +4287,7 @@ { &vop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ { &vop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ { &vop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ + { &vop_getattrlist_desc, (VOPFUNC)hfs_getattrlist }, { &vop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ { &vop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ { &vop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ @@ -3861,6 +4315,8 @@ { &vop_blktooff_desc, (VOPFUNC)hfs_blktooff }, /* blktooff */ { &vop_offtoblk_desc, (VOPFUNC)hfs_offtoblk }, /* offtoblk */ { &vop_cmap_desc, (VOPFUNC)hfs_cmap }, /* cmap */ + { &vop_kqfilt_add_desc, (VOPFUNC)hfsfifo_kqfilt_add }, /* kqfilt_add */ + { &vop_kqfilt_remove_desc, (VOPFUNC)hfsfifo_kqfilt_remove }, /* kqfilt_remove */ { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } }; struct vnodeopv_desc hfs_fifoop_opv_desc = diff -urN xnu-344.49/bsd/hfs/hfscommon/BTree/BTree.c xnu-517/bsd/hfs/hfscommon/BTree/BTree.c --- xnu-344.49/bsd/hfs/hfscommon/BTree/BTree.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/BTree/BTree.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -155,6 +155,12 @@ */ #define kNumLeafRecSlack 10 +/* BTree accessor routines */ +extern OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions options, BlockDescriptor *block); +extern OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, ItemCount minBlockCount); +extern OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF); +extern OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options); + //////////////////////////////////// Globals //////////////////////////////////// @@ -171,9 +177,6 @@ Input: filePtr - pointer to file to open as a B-tree keyCompareProc - pointer to client's KeyCompare function - getBlockProc - pointer to client's GetBlock function - releaseBlockProc - pointer to client's ReleaseBlock function - setEndOfForkProc - pointer to client's SetEOF function Result: noErr - success paramErr - required ptr was nil @@ -182,12 +185,7 @@ != noErr - failure -------------------------------------------------------------------------------*/ -OSStatus BTOpenPath (FCB *filePtr, - KeyCompareProcPtr keyCompareProc, - GetBlockProcPtr getBlockProc, - ReleaseBlockProcPtr releaseBlockProc, - SetEndOfForkProcPtr setEndOfForkProc, - SetBlockSizeProcPtr setBlockSizeProc ) +OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc) { OSStatus err; BTreeControlBlockPtr btreePtr; @@ -196,21 +194,22 @@ ////////////////////// Preliminary Error Checking /////////////////////////// - if ( filePtr == nil || - getBlockProc == nil || - releaseBlockProc == nil || - setEndOfForkProc == nil || - setBlockSizeProc == nil ) + if ( filePtr == nil ) { return paramErr; } - if ( filePtr->fcbBTCBPtr != nil ) // already has a BTreeCB + /* + * Subsequent opens allow key compare proc to be changed. 
+ */ + if ( filePtr->fcbBTCBPtr != nil && keyCompareProc != nil) { + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + btreePtr->keyCompareProc = keyCompareProc; return noErr; + } - // is file large enough to contain header node? if ( filePtr->fcbEOF < kMinNodeSize ) - return fsBTInvalidFileErr; //€€ or E_BadHeader? + return fsBTInvalidFileErr; //////////////////////// Allocate Control Block ///////////////////////////// @@ -222,9 +221,9 @@ return memFullErr; } - btreePtr->getBlockProc = getBlockProc; - btreePtr->releaseBlockProc = releaseBlockProc; - btreePtr->setEndOfForkProc = setEndOfForkProc; + btreePtr->getBlockProc = GetBTreeBlock; + btreePtr->releaseBlockProc = ReleaseBTreeBlock; + btreePtr->setEndOfForkProc = ExtendBTreeFile; btreePtr->keyCompareProc = keyCompareProc; /////////////////////////// Read Header Node //////////////////////////////// @@ -236,15 +235,20 @@ /* The minimum node size is the physical block size */ nodeRec.blockSize = VTOHFS(btreePtr->fileRefNum)->hfs_phys_block_size; + /* Start with the allocation block size for regular files. */ + if (FTOC(filePtr)->c_fileid >= kHFSFirstUserCatalogNodeID) + { + nodeRec.blockSize = FCBTOVCB(filePtr)->blockSize; + } REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); // it is now safe to call M_ExitOnError (err) - err = setBlockSizeProc (btreePtr->fileRefNum, nodeRec.blockSize, 1); + err = SetBTreeBlockSize (btreePtr->fileRefNum, nodeRec.blockSize, 1); M_ExitOnError (err); - err = getBlockProc (btreePtr->fileRefNum, + err = GetBTreeBlock(btreePtr->fileRefNum, kHeaderNodeNum, kGetBlock, &nodeRec ); @@ -278,9 +282,12 @@ btreePtr->maxKeyLength = header->maxKeyLength; btreePtr->totalNodes = header->totalNodes; btreePtr->freeNodes = header->freeNodes; - // ignore header->clumpSize; //€€ rename this field? 
+ if (FTOC(filePtr)->c_fileid >= kHFSFirstUserCatalogNodeID) + filePtr->ff_clumpsize = header->clumpSize; btreePtr->btreeType = header->btreeType; + btreePtr->keyCompareType = header->keyCompareType; + btreePtr->attributes = header->attributes; if ( btreePtr->maxKeyLength > 40 ) @@ -304,7 +311,7 @@ * we cannot mount using the current physical block size. */ if (btreePtr->leafRecords > 0 || - VTOHFS(btreePtr->fileRefNum)->hfs_media_writeable) + VTOHFS(btreePtr->fileRefNum)->hfs_flags & HFS_WRITEABLE_MEDIA) { err = fsBTBadNodeSize; goto ErrorExit; @@ -321,14 +328,14 @@ } else { - err = setBlockSizeProc (btreePtr->fileRefNum, btreePtr->nodeSize, 32); //€€ we should try and get this down to 8 + err = SetBTreeBlockSize (btreePtr->fileRefNum, btreePtr->nodeSize, 32); M_ExitOnError (err); /* * Need to use kTrashBlock option to force the * buffer cache to read the entire node */ - err = releaseBlockProc(btreePtr->fileRefNum, &nodeRec, kTrashBlock); + err = ReleaseBTreeBlock(btreePtr->fileRefNum, &nodeRec, kTrashBlock); ++btreePtr->numReleaseNodes; M_ExitOnError (err); @@ -1255,7 +1262,7 @@ case fsBTEmptyErr: // if tree empty add 1st leaf node - if (btreePtr->freeNodes == 0) + if (BTAvailableNodes(btreePtr) == 0) { err = ExtendBTree (btreePtr, btreePtr->totalNodes + 1); M_ExitOnError (err); @@ -1317,10 +1324,10 @@ /////////////////////// Extend File If Necessary //////////////////////////// - nodesNeeded = btreePtr->treeDepth + 1 - btreePtr->freeNodes; //€€ math limit + nodesNeeded = (SInt32)btreePtr->treeDepth + 1 - BTAvailableNodes(btreePtr); if (nodesNeeded > 0) { - nodesNeeded += btreePtr->totalNodes; + nodesNeeded += (SInt32)btreePtr->totalNodes; if (nodesNeeded > CalcMapBits (btreePtr)) // we'll need to add a map node too! 
++nodesNeeded; @@ -1469,10 +1476,10 @@ //////////////////////////// Make Some Room ///////////////////////////////// - nodesNeeded = btreePtr->treeDepth + 1 - btreePtr->freeNodes; //€€ math limit + nodesNeeded = (SInt32)btreePtr->treeDepth + 1 - BTAvailableNodes(btreePtr); if (nodesNeeded > 0) { - nodesNeeded += btreePtr->totalNodes; + nodesNeeded += (SInt32)btreePtr->totalNodes; if (nodesNeeded > CalcMapBits (btreePtr)) // we'll need to add a map node too! ++nodesNeeded; @@ -1480,7 +1487,6 @@ M_ExitOnError (err); } - // XXXdbg ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); @@ -1643,6 +1649,7 @@ BTreeControlBlockPtr btreePtr; TreePathTable treePathTable; BlockDescriptor nodeRec; + SInt32 nodesNeeded; UInt32 nodeNum; UInt16 index; @@ -1673,6 +1680,19 @@ M_ExitOnError (err); // record must exit for Delete + /////////////////////// Extend File If Necessary //////////////////////////// + + nodesNeeded = (SInt32)btreePtr->treeDepth + 1 - BTAvailableNodes(btreePtr); + if ((btreePtr->attributes & kBTVariableIndexKeysMask) && (nodesNeeded > 0)) + { + nodesNeeded += (SInt32)btreePtr->totalNodes; + if (nodesNeeded > CalcMapBits (btreePtr)) + ++nodesNeeded; + + err = ExtendBTree (btreePtr, nodesNeeded); + M_ExitOnError (err); + } + ///////////////////////////// Delete Record ///////////////////////////////// err = DeleteTree (btreePtr, treePathTable, &nodeRec, index, 1); @@ -1728,8 +1748,7 @@ info->numNodes = btreePtr->totalNodes; info->numFreeNodes = btreePtr->freeNodes; info->lastfsync = btreePtr->lastfsync; - info->reserved = 0; - + info->keyCompareType = btreePtr->keyCompareType; return noErr; } @@ -1940,25 +1959,10 @@ } -/*------------------------------------------------------------------------------- -Routine: BTCheckFreeSpace - -Function: Makes sure there is enough free space so that a tree operation - will succeed. 
- -Input: fcb - pointer file control block - -Output: none - -Result: noErr - success - --------------------------------------------------------------------------------*/ - __private_extern__ -OSStatus BTCheckFreeSpace (FCB *filePtr) +OSStatus BTHasContiguousNodes (FCB *filePtr) { BTreeControlBlockPtr btreePtr; - int nodesNeeded, err = noErr; M_ReturnErrorIf (filePtr == nil, paramErr); @@ -1969,33 +1973,85 @@ M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); - // XXXdbg this is highly conservative but so much better than - // winding up with turds on your disk. - // - nodesNeeded = (btreePtr->treeDepth + 1) * 10; + return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize); +} + + +/*------------------------------------------------------------------------------- +Routine: BTGetUserData + +Function: Read the user data area of the b-tree header node. + +-------------------------------------------------------------------------------*/ +__private_extern__ +OSStatus +BTGetUserData(FCB *filePtr, void * dataPtr, int dataSize) +{ + BTreeControlBlockPtr btreePtr; + BlockDescriptor node; + char * offset; + OSStatus err; + + if (dataSize > kBTreeHeaderUserBytes) + return (EINVAL); + node.buffer = nil; + node.blockHeader = nil; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) + return (fsBTInvalidFileErr); + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + err = GetNode(btreePtr, kHeaderNodeNum, &node); + if (err) + return (err); - if (btreePtr->freeNodes < nodesNeeded) { - err = ExtendBTree(btreePtr, nodesNeeded + btreePtr->totalNodes - btreePtr->freeNodes); - } + offset = (char *)node.buffer + sizeof(BTNodeDescriptor) + sizeof(BTHeaderRec); + bcopy(offset, dataPtr, dataSize); - return err; + (void) ReleaseNode(btreePtr, &node); + + return (0); } +/*------------------------------------------------------------------------------- +Routine: BTSetUserData + +Function: Write the user data area of the b-tree header node. 
+-------------------------------------------------------------------------------*/ __private_extern__ -OSStatus BTHasContiguousNodes (FCB *filePtr) +OSStatus +BTSetUserData(FCB *filePtr, void * dataPtr, int dataSize) { - BTreeControlBlockPtr btreePtr; - int nodesNeeded, err = noErr; - + BTreeControlBlockPtr btreePtr; + BlockDescriptor node; + char * offset; + OSStatus err; - M_ReturnErrorIf (filePtr == nil, paramErr); + if (dataSize > kBTreeHeaderUserBytes) + return (EINVAL); + node.buffer = nil; + node.blockHeader = nil; btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) + return (fsBTInvalidFileErr); + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + err = GetNode(btreePtr, kHeaderNodeNum, &node); + if (err) + return (err); - REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + ModifyBlockStart(btreePtr->fileRefNum, &node); - M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + offset = (char *)node.buffer + sizeof(BTNodeDescriptor) + sizeof(BTHeaderRec); + bcopy(dataPtr, offset, dataSize); - return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize); + err = UpdateNode (btreePtr, &node, 0, 0); + + return (err); } + diff -urN xnu-344.49/bsd/hfs/hfscommon/BTree/BTreeAllocate.c xnu-517/bsd/hfs/hfscommon/BTree/BTreeAllocate.c --- xnu-344.49/bsd/hfs/hfscommon/BTree/BTreeAllocate.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/BTree/BTreeAllocate.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -193,6 +193,10 @@ --btreePtr->freeNodes; btreePtr->flags |= kBTHeaderDirty; + + /* Account for allocations from node reserve */ + BTUpdateReserve(btreePtr, 1); + *nodeNum = nodeNumber; return noErr; diff -urN xnu-344.49/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c xnu-517/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c --- xnu-344.49/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -176,7 +176,7 @@ forkSize = (UInt64)totalNodes * (UInt64)header->nodeSize; - if ( forkSize != filePtr->fcbEOF ) + if ( forkSize > filePtr->fcbEOF ) return fsBTInvalidHeaderErr; if ( header->freeNodes >= totalNodes ) diff -urN xnu-344.49/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c xnu-517/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c --- xnu-344.49/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/hfs/hfscommon/BTree/BTreeNodeReserve.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include "../headers/BTreesPrivate.h" +#include "sys/malloc.h" + + +/* + * B-tree Node Reserve + * + * BTReserveSpace + * BTReleaseReserve + * BTUpdateReserve + * BTAvailableNodes + * + * Each kernel thread can have its own reserve of b-tree + * nodes. This reserve info is kept in a hash table. + * + * Don't forget to call BTReleaseReserve when you're finished + * or you will leave stale node reserves in the hash. + */ + + +/* + * BE CAREFUL WHEN INCREASING THE SIZE OF THIS STRUCT! + * + * It must remain equal in size to the opaque cat_cookie_t + * struct (in hfs_catalog.h). 
+ */ +struct nreserve { + LIST_ENTRY(nreserve) nr_hash; /* hash chain */ + int nr_nodecnt; /* count of nodes held in reserve */ + int nr_newnodes; /* nodes that were allocated */ + struct vnode *nr_btvp; /* b-tree file vnode */ + void *nr_tag; /* unique tag (per thread) */ +}; + +#define NR_GET_TAG() (current_act()) + +#define NR_CACHE 17 + +#define NR_HASH(btvp, tag) \ + (&nr_hashtbl[((((int)(btvp)) >> 8) ^ ((int)(tag) >> 4)) & nr_hashmask]) + +LIST_HEAD(nodereserve, nreserve) *nr_hashtbl; + +u_long nr_hashmask; + + +/* Internal Node Reserve Hash Routines (private) */ +static void nr_insert (struct vnode *, struct nreserve *nrp, int); +static void nr_delete (struct vnode *, struct nreserve *nrp, int *); +static int nr_lookup (struct vnode *); +static void nr_update (struct vnode *, int); + + +/* + * BTReserveSetup - initialize the node reserve hash table + */ +__private_extern__ +void +BTReserveSetup() +{ + if (sizeof(struct nreserve) != sizeof(cat_cookie_t)) + panic("BTReserveSetup: nreserve size != opaque struct size"); + + nr_hashtbl = hashinit(NR_CACHE, M_HFSMNT, &nr_hashmask); +} + + +/* + * BTAvailableNodes - obtain the actual available nodes (for current thread) + * + */ +__private_extern__ +SInt32 +BTAvailableNodes(BTreeControlBlock *btree) +{ + SInt32 availNodes; + + availNodes = (SInt32)btree->freeNodes - (SInt32)btree->reservedNodes; + + return (availNodes + nr_lookup(btree->fileRefNum)); +} + + +/* + * BTReserveSpace - obtain a node reserve (for current thread) + * + * Used by the Catalog Layer (hfs_catalog.c) to reserve space. + */ +__private_extern__ +int +BTReserveSpace(FCB *file, int operations, void* data) +{ + BTreeControlBlock *btree; + int rsrvNodes, availNodes, totalNodes; + int height; + int inserts, deletes; + int err = 0; + + btree = (BTreeControlBlockPtr)file->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btree->fileRefNum, true); + + /* + * The node reserve is based on the number of b-tree + * operations (insert/deletes) and the height of the + * tree. 
+ */ + height = btree->treeDepth; + inserts = operations & 0xffff; + deletes = operations >> 16; + + rsrvNodes = 1; /* allow for at least one root split */ + if (deletes) + rsrvNodes += (deletes * (height - 1)) - 1; + if (inserts) + rsrvNodes += (inserts * height) + 1; + + availNodes = btree->freeNodes - btree->reservedNodes; + + if (rsrvNodes > availNodes) { + totalNodes = rsrvNodes + btree->totalNodes - availNodes; + + /* See if we also need a map node */ + if (totalNodes > CalcMapBits(btree)) + ++totalNodes; + if ((err = ExtendBTree(btree, totalNodes))) + return (err); + } + + btree->reservedNodes += rsrvNodes; + nr_insert(btree->fileRefNum, (struct nreserve *)data, rsrvNodes); + return (0); +} + + +/* + * BTReleaseReserve - release the node reserve held by current thread + * + * Used by the Catalog Layer (hfs_catalog.c) to relinquish reserved space. + */ +__private_extern__ +int +BTReleaseReserve(FCB *file, void* data) +{ + BTreeControlBlock *btree; + int nodecnt; + + btree = (BTreeControlBlockPtr)file->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btree->fileRefNum, true); + + nr_delete(btree->fileRefNum, (struct nreserve *)data, &nodecnt); + + if (nodecnt) + btree->reservedNodes -= nodecnt; + + return (0); +} + +/* + * BTUpdateReserve - update a node reserve for allocations that occurred. + */ +__private_extern__ +void +BTUpdateReserve(BTreeControlBlockPtr btreePtr, int nodes) +{ + nr_update(btreePtr->fileRefNum, nodes); +} + + +/*----------------------------------------------------------------------------*/ +/* Node Reserve Hash Functions (private) */ + + +int nrinserts = 0; +int nrdeletes = 0; + +/* + * Insert a new node reserve. 
+ */ +static void +nr_insert(struct vnode * btvp, struct nreserve *nrp, int nodecnt) +{ + struct nodereserve *nrhead; + struct nreserve *tmp_nrp; + void * tag = NR_GET_TAG(); + + /* + * Check the cache - there may already be a reserve + */ + nrhead = NR_HASH(btvp, tag); + for (tmp_nrp = nrhead->lh_first; tmp_nrp; + tmp_nrp = tmp_nrp->nr_hash.le_next) { + if ((tmp_nrp->nr_tag == tag) && (tmp_nrp->nr_btvp == btvp)) { + nrp->nr_tag = 0; + return; + } + } + + nrp->nr_nodecnt = nodecnt; + nrp->nr_newnodes = 0; + nrp->nr_btvp = btvp; + nrp->nr_tag = tag; + LIST_INSERT_HEAD(nrhead, nrp, nr_hash); + ++nrinserts; +} + +/* + * Delete a node reserve. + */ +static void +nr_delete(struct vnode * btvp, struct nreserve *nrp, int *nodecnt) +{ + void * tag = NR_GET_TAG(); + + if (nrp->nr_tag) { + if ((nrp->nr_tag != tag) || (nrp->nr_btvp != btvp)) + panic("nr_delete: invalid NR (%08x)", nrp); + LIST_REMOVE(nrp, nr_hash); + *nodecnt = nrp->nr_nodecnt; + bzero(nrp, sizeof(struct nreserve)); + ++nrdeletes; + } else { + *nodecnt = 0; + } +} + +/* + * Lookup a node reserve. + */ +static int +nr_lookup(struct vnode * btvp) +{ + struct nodereserve *nrhead; + struct nreserve *nrp; + void* tag = NR_GET_TAG(); + + nrhead = NR_HASH(btvp, tag); + for (nrp = nrhead->lh_first; nrp; nrp = nrp->nr_hash.le_next) { + if ((nrp->nr_tag == tag) && (nrp->nr_btvp == btvp)) + return (nrp->nr_nodecnt - nrp->nr_newnodes); + } + return (0); +} + +/* + * Update a node reserve for any allocations that occurred. 
+ */ +static void +nr_update(struct vnode * btvp, int nodecnt) +{ + struct nodereserve *nrhead; + struct nreserve *nrp; + void* tag = NR_GET_TAG(); + + nrhead = NR_HASH(btvp, tag); + for (nrp = nrhead->lh_first; nrp; nrp = nrp->nr_hash.le_next) { + if ((nrp->nr_tag == tag) && (nrp->nr_btvp == btvp)) { + nrp->nr_newnodes += nodecnt; + break; + } + } +} diff -urN xnu-344.49/bsd/hfs/hfscommon/BTree/BTreeScanner.c xnu-517/bsd/hfs/hfscommon/BTree/BTreeScanner.c --- xnu-344.49/bsd/hfs/hfscommon/BTree/BTreeScanner.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/BTree/BTreeScanner.c Sat Oct 25 00:25:25 2003 @@ -25,6 +25,7 @@ * @(#)BTreeScanner.c */ #include +#include "../../hfs_endian.h" #include "../headers/BTreeScanner.h" @@ -182,6 +183,23 @@ (u_int8_t *) scanState->currentNodePtr += scanState->btcb->nodeSize; } +#if BYTE_ORDER == LITTLE_ENDIAN + { + BlockDescriptor block; + FileReference fref; + + /* Fake a BlockDescriptor */ + block.buffer = scanState->currentNodePtr; + block.blockSize = scanState->btcb->nodeSize; + block.blockReadFromDisk = 1; + block.isModified = 0; + + fref = scanState->btcb->fileRefNum; + + SWAP_BT_NODE(&block, ISHFSPLUS(VTOVCB(fref)), VTOC(fref)->c_fileid, 0); + } +#endif + // Make sure this is a valid node if ( CheckNode( scanState->btcb, scanState->currentNodePtr ) != noErr ) { diff -urN xnu-344.49/bsd/hfs/hfscommon/Catalog/CatalogIterators.c xnu-517/bsd/hfs/hfscommon/Catalog/CatalogIterators.c --- xnu-344.49/bsd/hfs/hfscommon/Catalog/CatalogIterators.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/Catalog/CatalogIterators.c Sat Oct 25 00:25:25 2003 @@ -369,7 +369,7 @@ bestIterator->volume = volume; // update the iterator's volume bestIterator->folderID = folderID; // ... and folderID - bestIterator->currentIndex = 0xFFFFFFFF; // ... and offspring index marker + bestIterator->currentIndex = 0xFFFF; // ... 
and offspring index marker bestIterator->currentOffset = 0xFFFFFFFF; bestIterator->nextOffset = 0xFFFFFFFF; diff -urN xnu-344.49/bsd/hfs/hfscommon/Catalog/FileIDsServices.c xnu-517/bsd/hfs/hfscommon/Catalog/FileIDsServices.c --- xnu-344.49/bsd/hfs/hfscommon/Catalog/FileIDsServices.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/Catalog/FileIDsServices.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -68,9 +68,6 @@ err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL); ReturnIfError(err); - err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); - ReturnIfError(err); - if ( isHFSPlus ) { //-- Step 1: Check the catalog nodes for extents diff -urN xnu-344.49/bsd/hfs/hfscommon/Misc/FileExtentMapping.c xnu-517/bsd/hfs/hfscommon/Misc/FileExtentMapping.c --- xnu-344.49/bsd/hfs/hfscommon/Misc/FileExtentMapping.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/Misc/FileExtentMapping.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -250,15 +250,12 @@ kPreviousRecord = -1 }; -void HFSToHFSPlusExtents( - const HFSExtentRecord oldExtents, - HFSPlusExtentRecord newExtents); -OSErr HFSPlusToHFSExtents( +static OSErr HFSPlusToHFSExtents( const HFSPlusExtentRecord oldExtents, HFSExtentRecord newExtents); -OSErr FindExtentRecord( +static OSErr FindExtentRecord( const ExtendedVCB *vcb, UInt8 forkType, UInt32 fileID, @@ -268,7 +265,7 @@ HFSPlusExtentRecord foundData, UInt32 *foundHint); -OSErr DeleteExtentRecord( +static OSErr DeleteExtentRecord( const ExtendedVCB *vcb, UInt8 forkType, UInt32 fileID, @@ -281,7 +278,7 @@ UInt32 *hint); -OSErr GetFCBExtentRecord( +static OSErr GetFCBExtentRecord( const FCB *fcb, HFSPlusExtentRecord extents); @@ -359,7 +356,7 @@ // fourth entry will be zeroes. // foundHint The BTree hint to find the node again //_________________________________________________________________________________ -OSErr FindExtentRecord( +static OSErr FindExtentRecord( const ExtendedVCB *vcb, UInt8 forkType, UInt32 fileID, @@ -376,7 +373,8 @@ UInt16 btRecordSize; err = noErr; - *foundHint = 0; + if (foundHint) + *foundHint = 0; fcb = GetFileControlBlock(vcb->extentsRefNum); MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); @@ -416,14 +414,15 @@ if (err == noErr) { UInt16 i; - // Copy the found key back for the caller - foundKey->keyLength = kHFSPlusExtentKeyMaximumLength; - foundKey->forkType = extentKeyPtr->forkType; - foundKey->pad = 0; - foundKey->fileID = extentKeyPtr->fileID; - foundKey->startBlock = extentKeyPtr->startBlock; - - // Copy the found data back for the caller + // Copy the found key back for the caller + if (foundKey) { + foundKey->keyLength = kHFSPlusExtentKeyMaximumLength; + foundKey->forkType = extentKeyPtr->forkType; + foundKey->pad = 0; + foundKey->fileID = extentKeyPtr->fileID; + foundKey->startBlock = extentKeyPtr->startBlock; + } + // Copy the found data back for the caller foundData[0].startBlock = 
extentData[0].startBlock; foundData[0].blockCount = extentData[0].blockCount; foundData[1].startBlock = extentData[1].startBlock; @@ -471,14 +470,16 @@ } if (err == noErr) { - // Copy the found key back for the caller - BlockMoveData(extentKeyPtr, foundKey, sizeof(HFSPlusExtentKey)); - // Copy the found data back for the caller + // Copy the found key back for the caller + if (foundKey) + BlockMoveData(extentKeyPtr, foundKey, sizeof(HFSPlusExtentKey)); + // Copy the found data back for the caller BlockMoveData(&extentData, foundData, sizeof(HFSPlusExtentRecord)); } } - - *foundHint = btIterator->hint.nodeNum; + + if (foundHint) + *foundHint = btIterator->hint.nodeNum; FREE(btIterator, M_TEMP); return err; } @@ -499,11 +500,6 @@ err = noErr; *hint = 0; - // XXXdbg - preflight that there's enough space - err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); - if (err) - return err; - MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -546,7 +542,7 @@ } -OSErr DeleteExtentRecord( +static OSErr DeleteExtentRecord( const ExtendedVCB *vcb, UInt8 forkType, UInt32 fileID, @@ -557,11 +553,6 @@ err = noErr; - // XXXdbg - preflight that there's enough space - err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); - if (err) - return err; - MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -616,6 +607,7 @@ // Called By: Log2Phys (read/write in place), Cache (map a file block). //_________________________________________________________________________________ +__private_extern__ OSErr MapFileBlockC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file @@ -685,12 +677,14 @@ // Determine the number of contiguous bytes until the end of the extent // (or the amount they asked for, whichever comes first). 
// - tmpOff = dataEnd - offset; - if (tmpOff > (off_t)(numberOfBytes)) - *availableBytes = numberOfBytes; // more there than they asked for, so pin the output - else - *availableBytes = tmpOff; - + if (availableBytes) + { + tmpOff = dataEnd - offset; + if (tmpOff > (off_t)(numberOfBytes)) + *availableBytes = numberOfBytes; // more there than they asked for, so pin the output + else + *availableBytes = tmpOff; + } return noErr; } @@ -827,6 +821,7 @@ // Function: Flushes the extent file for a specified volume //‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +__private_extern__ OSErr FlushExtentFile( ExtendedVCB *vcb ) { FCB * fcb; @@ -856,6 +851,7 @@ // an HFS volume. //‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +__private_extern__ SInt32 CompareExtentKeys( const HFSExtentKey *searchKey, const HFSExtentKey *trialKey ) { SInt32 result; // ± 1 @@ -919,6 +915,7 @@ // an HFS volume. //‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +__private_extern__ SInt32 CompareExtentKeysPlus( const HFSPlusExtentKey *searchKey, const HFSPlusExtentKey *trialKey ) { SInt32 result; // ± 1 @@ -973,6 +970,72 @@ return( result ); } +/* + * Add a file extent to a file. + * + * Used by hfs_extendfs to extend the volume allocation bitmap file. + * + */ +__private_extern__ +int +AddFileExtent(ExtendedVCB *vcb, FCB *fcb, UInt32 startBlock, UInt32 blockCount) +{ + HFSPlusExtentKey foundKey; + HFSPlusExtentRecord foundData; + UInt32 foundIndex; + UInt32 hint; + UInt32 nextBlock; + SInt64 peof; + int i; + int error; + + peof = (SInt64)(fcb->ff_blocks + blockCount) * (SInt64)vcb->blockSize; + + error = SearchExtentFile(vcb, fcb, peof-1, &foundKey, foundData, &foundIndex, &hint, &nextBlock); + if (error != fxRangeErr) + return (EBUSY); + + /* + * Add new extent. See if there is room in the current record. 
+ */ + if (foundData[foundIndex].blockCount != 0) + ++foundIndex; + if (foundIndex == kHFSPlusExtentDensity) { + /* + * Existing record is full so create a new one. + */ + foundKey.keyLength = kHFSPlusExtentKeyMaximumLength; + foundKey.forkType = kDataForkType; + foundKey.pad = 0; + foundKey.fileID = FTOC(fcb)->c_fileid; + foundKey.startBlock = nextBlock; + + foundData[0].startBlock = startBlock; + foundData[0].blockCount = blockCount; + + /* zero out remaining extents. */ + for (i = 1; i < kHFSPlusExtentDensity; ++i) { + foundData[i].startBlock = 0; + foundData[i].blockCount = 0; + } + + foundIndex = 0; + + error = CreateExtentRecord(vcb, &foundKey, foundData, &hint); + if (error == fxOvFlErr) + error = dskFulErr; + } else { + /* + * Add a new extent into existing record. + */ + foundData[foundIndex].startBlock = startBlock; + foundData[foundIndex].blockCount = blockCount; + error = UpdateExtentRecord(vcb, fcb, &foundKey, foundData, hint); + } + (void) FlushExtentFile(vcb); + + return (error); +} //_________________________________________________________________________________ @@ -1000,6 +1063,7 @@ // Note: ExtendFile updates the PEOF in the FCB. //_________________________________________________________________________________ +__private_extern__ OSErr ExtendFileC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file to truncate @@ -1021,6 +1085,7 @@ Boolean allOrNothing; Boolean forceContig; Boolean wantContig; + Boolean useMetaZone; Boolean needsFlush; UInt32 actualStartBlock; UInt32 actualNumBlocks; @@ -1055,7 +1120,7 @@ // Determine how many blocks need to be allocated. // Round up the number of desired bytes to add. 
// - blocksToAdd = FileBytesToBlocks(bytesToAdd, volumeBlockSize); + blocksToAdd = howmany(bytesToAdd, volumeBlockSize); bytesToAdd = (SInt64)((SInt64)blocksToAdd * (SInt64)volumeBlockSize); /* @@ -1070,7 +1135,7 @@ FTOC(fcb)->c_blocks += blocksToAdd; fcb->ff_blocks += blocksToAdd; - FTOC(fcb)->c_flag |= C_MODIFIED; + FTOC(fcb)->c_flag |= C_MODIFIED | C_FORCEUPDATE; *actualBytesAdded = bytesToAdd; return (0); } @@ -1092,11 +1157,11 @@ // then set the maximum number of bytes to the requested number of bytes // rounded up to a multiple of the clump size. // - if ((fcb->fcbClmpSize > volumeBlockSize) + if ((vcb->vcbClpSiz > volumeBlockSize) && (bytesToAdd < (SInt64)HFS_MAX_DEFERED_ALLOC) && (flags & kEFNoClumpMask) == 0) { - maximumBytes = (SInt64)FileBytesToBlocks(bytesToAdd, fcb->fcbClmpSize); - maximumBytes *= fcb->fcbClmpSize; + maximumBytes = (SInt64)howmany(bytesToAdd, vcb->vcbClpSiz); + maximumBytes *= vcb->vcbClpSiz; } else { maximumBytes = bytesToAdd; } @@ -1131,7 +1196,7 @@ // Enough blocks are already allocated. Just update the FCB to reflect the new length. fcb->ff_blocks = peof / volumeBlockSize; FTOC(fcb)->c_blocks += (bytesToAdd / volumeBlockSize); - FTOC(fcb)->c_flag |= C_MODIFIED; + FTOC(fcb)->c_flag |= C_MODIFIED | C_FORCEUPDATE; goto Exit; } if (err != fxRangeErr) // Any real error? 
@@ -1158,6 +1223,7 @@ // else, keep getting bits and pieces (non-contig) err = noErr; wantContig = true; + useMetaZone = flags & kEFMetadataMask; vcb->vcbFreeExtCnt = 0; /* For now, force rebuild of free extent list */ do { if (blockHint != 0) @@ -1183,16 +1249,18 @@ err = BlockAllocate( vcb, startBlock, - MIN(bytesToAdd, availbytes), - MIN(maximumBytes, availbytes), + howmany(MIN(bytesToAdd, availbytes), volumeBlockSize), + howmany(MIN(maximumBytes, availbytes), volumeBlockSize), wantContig, + useMetaZone, &actualStartBlock, &actualNumBlocks); } } } else { - err = BlockAllocate(vcb, startBlock, bytesToAdd, maximumBytes, - wantContig, &actualStartBlock, &actualNumBlocks); + err = BlockAllocate(vcb, startBlock, howmany(bytesToAdd, volumeBlockSize), + howmany(maximumBytes, volumeBlockSize), wantContig, useMetaZone, + &actualStartBlock, &actualNumBlocks); } if (err == dskFulErr) { if (forceContig) @@ -1205,8 +1273,20 @@ } if (actualNumBlocks != 0) err = noErr; + if (useMetaZone == 0) { + /* Couldn't get anything so dip into metadat zone */ + err = noErr; + useMetaZone = 1; + continue; + } } if (err == noErr) { + if (actualNumBlocks != 0) { + // this catalog entry *must* get forced to disk when + // hfs_update() is called + FTOC(fcb)->c_flag |= C_FORCEUPDATE; + } + // Add the new extent to the existing extent record, or create a new one. if ((actualStartBlock == startBlock) && (blockHint == 0)) { // We grew the file's last extent, so just adjust the number of blocks. @@ -1284,7 +1364,7 @@ } fcb->ff_blocks += (bytesThisExtent / volumeBlockSize); FTOC(fcb)->c_blocks += (bytesThisExtent / volumeBlockSize); - FTOC(fcb)->c_flag |= C_MODIFIED; + FTOC(fcb)->c_flag |= C_MODIFIED | C_FORCEUPDATE; // If contiguous allocation was requested, then we've already got one contiguous // chunk. If we didn't get all we wanted, then adjust the error to disk full. 
@@ -1298,6 +1378,13 @@ ErrorExit: Exit: + if (VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) { + /* Keep the roving allocator out of the metadata zone. */ + if (vcb->nextAllocation >= VCBTOHFS(vcb)->hfs_metazone_start && + vcb->nextAllocation <= VCBTOHFS(vcb)->hfs_metazone_end) { + vcb->nextAllocation = VCBTOHFS(vcb)->hfs_metazone_end + 1; + } + } *actualBytesAdded = (SInt64)(fcb->ff_blocks - prevblocks) * (SInt64)volumeBlockSize; if (needsFlush) @@ -1335,6 +1422,7 @@ // Note: TruncateFile updates the PEOF in the FCB. //_________________________________________________________________________________ +__private_extern__ OSErr TruncateFileC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file to truncate @@ -1378,7 +1466,7 @@ // two gigabytes or more, then round down by one allocation block (??? really? // shouldn't that be an error?). // - nextBlock = FileBytesToBlocks(peof, vcb->blockSize); // number of allocation blocks to remain in file + nextBlock = howmany(peof, vcb->blockSize); // number of allocation blocks to remain in file peof = (SInt64)((SInt64)nextBlock * (SInt64)vcb->blockSize); // number of bytes in those blocks if ((vcb->vcbSigWord == kHFSSigWord) && (peof >= kTwoGigabytes)) { #if DEBUG_BUILD @@ -1391,10 +1479,16 @@ // // Update FCB's length // + /* + * XXX Any errors could cause ff_blocks and c_blocks to get out of sync... 
+ */ numBlocks = peof / vcb->blockSize; FTOC(fcb)->c_blocks -= (fcb->ff_blocks - numBlocks); fcb->ff_blocks = numBlocks; - FTOC(fcb)->c_flag |= C_MODIFIED; + + // this catalog entry is modified and *must* get forced + // to disk when hfs_update() is called + FTOC(fcb)->c_flag |= C_MODIFIED | C_FORCEUPDATE; // // If the new PEOF is 0, then truncateToExtent has no meaning (we should always deallocate @@ -1502,6 +1596,147 @@ } +/* + * HFS Plus only + * + */ +__private_extern__ +OSErr HeadTruncateFile ( + ExtendedVCB *vcb, + FCB *fcb, + UInt32 headblks) +{ + HFSPlusExtentRecord extents; + HFSPlusExtentRecord tailExtents; + HFSCatalogNodeID fileID; + UInt8 forkType; + UInt32 blkcnt; + UInt32 startblk; + UInt32 blksfreed; + int i, j; + int error; + + + if (vcb->vcbSigWord != kHFSPlusSigWord) + return (-1); + + forkType = FORK_IS_RSRC(fcb) ? kResourceForkType : kDataForkType; + fileID = FTOC(fcb)->c_fileid; + bzero(tailExtents, sizeof(tailExtents)); + + blksfreed = 0; + startblk = 0; + + /* + * Process catalog resident extents + */ + for (i = 0, j = 0; i < kHFSPlusExtentDensity; ++i) { + blkcnt = fcb->fcbExtents[i].blockCount; + if (blkcnt == 0) + break; /* end of extents */ + + if (blksfreed < headblks) { + error = BlockDeallocate(vcb, fcb->fcbExtents[i].startBlock, blkcnt); + /* + * Any errors after the first BlockDeallocate + * must be ignored so we can put the file in + * a known state. + */ + if (error ) { + if (i == 0) + goto ErrorExit; /* uh oh */ + else { + error = 0; + printf("HeadTruncateFile: problems deallocating %s (%d)\n", + FTOC(fcb)->c_desc.cd_nameptr ? 
FTOC(fcb)->c_desc.cd_nameptr : "", error); + } + } + + blksfreed += blkcnt; + fcb->fcbExtents[i].startBlock = 0; + fcb->fcbExtents[i].blockCount = 0; + } else { + tailExtents[j].startBlock = fcb->fcbExtents[i].startBlock; + tailExtents[j].blockCount = blkcnt; + ++j; + } + startblk += blkcnt; + } + + if (blkcnt == 0) + goto CopyExtents; + + /* + * Process overflow extents + */ + for (;;) { + UInt32 extblks; + + error = FindExtentRecord(vcb, forkType, fileID, startblk, false, NULL, extents, NULL); + if (error) { + /* + * Any errors after the first BlockDeallocate + * must be ignored so we can put the file in + * a known state. + */ + if (error != btNotFound) + printf("HeadTruncateFile: problems finding extents %s (%d)\n", + FTOC(fcb)->c_desc.cd_nameptr ? FTOC(fcb)->c_desc.cd_nameptr : "", error); + error = 0; + break; + } + + for(i = 0, extblks = 0; i < kHFSPlusExtentDensity; ++i) { + blkcnt = extents[i].blockCount; + if (blkcnt == 0) + break; /* end of extents */ + + if (blksfreed < headblks) { + error = BlockDeallocate(vcb, extents[i].startBlock, blkcnt); + if (error) { + printf("HeadTruncateFile: problems deallocating %s (%d)\n", + FTOC(fcb)->c_desc.cd_nameptr ? FTOC(fcb)->c_desc.cd_nameptr : "", error); + error = 0; + } + blksfreed += blkcnt; + } else { + tailExtents[j].startBlock = extents[i].startBlock; + tailExtents[j].blockCount = blkcnt; + ++j; + } + extblks += blkcnt; + } + + error = DeleteExtentRecord(vcb, forkType, fileID, startblk); + if (error) { + printf("HeadTruncateFile: problems deallocating %s (%d)\n", + FTOC(fcb)->c_desc.cd_nameptr ? 
FTOC(fcb)->c_desc.cd_nameptr : "", error); + error = 0; + } + + if (blkcnt == 0) + break; /* all done */ + + startblk += extblks; + } + +CopyExtents: + if (blksfreed) { + bcopy(tailExtents, fcb->fcbExtents, sizeof(tailExtents)); + blkcnt = fcb->ff_blocks - headblks; + FTOC(fcb)->c_blocks -= blkcnt; + fcb->ff_blocks = blkcnt; + + FTOC(fcb)->c_flag |= C_CHANGE | C_FORCEUPDATE; + + (void) FlushExtentFile(vcb); + } + +ErrorExit: + return MacToVFSError(error); +} + + //‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ // Routine: SearchExtentRecord (was XRSearch) @@ -1749,11 +1984,6 @@ // btFCB = GetFileControlBlock(vcb->extentsRefNum); - // XXXdbg - preflight that there's enough space - err = BTCheckFreeSpace(btFCB); - if (err) - return err; - MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -1811,31 +2041,8 @@ -void HFSToHFSPlusExtents( - const HFSExtentRecord oldExtents, - HFSPlusExtentRecord newExtents) -{ - UInt32 i; - - // copy the first 3 extents - newExtents[0].startBlock = oldExtents[0].startBlock; - newExtents[0].blockCount = oldExtents[0].blockCount; - newExtents[1].startBlock = oldExtents[1].startBlock; - newExtents[1].blockCount = oldExtents[1].blockCount; - newExtents[2].startBlock = oldExtents[2].startBlock; - newExtents[2].blockCount = oldExtents[2].blockCount; - - // zero out the remaining ones - for (i = 3; i < kHFSPlusExtentDensity; ++i) - { - newExtents[i].startBlock = 0; - newExtents[i].blockCount = 0; - } -} - - -OSErr HFSPlusToHFSExtents( +static OSErr HFSPlusToHFSExtents( const HFSPlusExtentRecord oldExtents, HFSExtentRecord newExtents) { @@ -1864,7 +2071,7 @@ -OSErr GetFCBExtentRecord( +static OSErr GetFCBExtentRecord( const FCB *fcb, HFSPlusExtentRecord extents) { @@ -1923,6 +2130,7 @@ // Called by BTOpenPath during volume mount //_________________________________________________________________________________ +__private_extern__ Boolean 
NodesAreContiguous( ExtendedVCB *vcb, FCB *fcb, diff -urN xnu-344.49/bsd/hfs/hfscommon/Misc/VolumeAllocation.c xnu-517/bsd/hfs/hfscommon/Misc/VolumeAllocation.c --- xnu-344.49/bsd/hfs/hfscommon/Misc/VolumeAllocation.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/Misc/VolumeAllocation.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -121,6 +121,7 @@ UInt32 startingBlock, UInt32 endingBlock, UInt32 maxBlocks, + Boolean useMetaZone, UInt32 *actualStartBlock, UInt32 *actualNumBlocks); @@ -129,6 +130,7 @@ UInt32 startingBlock, UInt32 minBlocks, UInt32 maxBlocks, + Boolean useMetaZone, UInt32 *actualStartBlock, UInt32 *actualNumBlocks); @@ -138,19 +140,10 @@ UInt32 endingBlock, UInt32 minBlocks, UInt32 maxBlocks, + Boolean useMetaZone, UInt32 *actualStartBlock, UInt32 *actualNumBlocks); -static OSErr BlockMarkAllocated( - ExtendedVCB *vcb, - UInt32 startingBlock, - UInt32 numBlocks); - -static OSErr BlockMarkFree( - ExtendedVCB *vcb, - UInt32 startingBlock, - UInt32 numBlocks); - static OSErr BlockAllocateKnown( ExtendedVCB *vcb, UInt32 maxBlocks, @@ -175,19 +168,17 @@ ; the volume's allocation block pointer will be used as a starting ; point. ; -; All requests will be rounded up to the next highest clump size, as -; indicated in the file's FCB. -; ; Input Arguments: ; vcb - Pointer to ExtendedVCB for the volume to allocate space on ; fcb - Pointer to FCB for the file for which storage is being allocated ; startingBlock - Preferred starting allocation block, 0 = no preference ; forceContiguous - Force contiguous flag - if bit 0 set (NE), allocation is contiguous ; or an error is returned -; bytesRequested - Number of bytes requested. If the allocation is non-contiguous, +; useMetaZone - +; minBlocks - Number of blocks requested. 
If the allocation is non-contiguous, ; less than this may actually be allocated -; bytesMaximum - The maximum number of bytes to allocate. If there is additional free -; space after bytesRequested, then up to bytesMaximum bytes should really +; maxBlocks - The maximum number of blocks to allocate. If there is additional free +; space after bytesRequested, then up to maxBlocks bytes should really ; be allocated. (Used by ExtendFileC to round up allocations to a multiple ; of the file's clump size.) ; @@ -201,21 +192,22 @@ ;________________________________________________________________________________ */ +__private_extern__ OSErr BlockAllocate ( ExtendedVCB *vcb, /* which volume to allocate space on */ UInt32 startingBlock, /* preferred starting block, or 0 for no preference */ - SInt64 bytesRequested, /* desired number of BYTES to allocate */ - SInt64 bytesMaximum, /* maximum number of bytes to allocate */ + UInt32 minBlocks, /* desired number of blocks to allocate */ + UInt32 maxBlocks, /* maximum number of blocks to allocate */ Boolean forceContiguous, /* non-zero to force contiguous allocation and to force */ - /* bytesRequested bytes to actually be allocated */ + /* minBlocks bytes to actually be allocated */ + + Boolean useMetaZone, UInt32 *actualStartBlock, /* actual first block of allocation */ UInt32 *actualNumBlocks) /* number of blocks actually allocated; if forceContiguous */ - /* was zero, then this may represent fewer than bytesRequested */ - /* bytes */ + /* was zero, then this may represent fewer than minBlocks */ { + UInt32 freeBlocks; OSErr err; - UInt32 minBlocks; // minimum number of allocation blocks requested - UInt32 maxBlocks; // number of allocation blocks requested, rounded to clump size Boolean updateAllocPtr = false; // true if nextAllocation needs to be updated // @@ -223,25 +215,29 @@ // *actualStartBlock = 0; *actualNumBlocks = 0; - - // - // Compute the number of allocation blocks requested, and maximum - // - minBlocks = 
FileBytesToBlocks(bytesRequested, vcb->blockSize); - maxBlocks = FileBytesToBlocks(bytesMaximum, vcb->blockSize); + freeBlocks = hfs_freeblks(VCBTOHFS(vcb), 0); // // If the disk is already full, don't bother. // - if (hfs_freeblks(VCBTOHFS(vcb), 0) == 0) { + if (freeBlocks == 0) { err = dskFulErr; goto Exit; } - if (forceContiguous && hfs_freeblks(VCBTOHFS(vcb), 0) < minBlocks) { + if (forceContiguous && freeBlocks < minBlocks) { err = dskFulErr; goto Exit; } - + /* + * Clip if necessary so we don't over-subscribe the free blocks. + */ + if (minBlocks > freeBlocks) { + minBlocks = freeBlocks; + } + if (maxBlocks > freeBlocks) { + maxBlocks = freeBlocks; + } + // // If caller didn't specify a starting block number, then use the volume's // next block to allocate from. @@ -252,19 +248,27 @@ VCB_UNLOCK(vcb); updateAllocPtr = true; } + if (startingBlock >= vcb->totalBlocks) { + startingBlock = 0; /* overflow so start at beginning */ + } // // If the request must be contiguous, then find a sequence of free blocks // that is long enough. Otherwise, find the first free block. // if (forceContiguous) { - err = BlockAllocateContig(vcb, startingBlock, minBlocks, maxBlocks, actualStartBlock, actualNumBlocks); + err = BlockAllocateContig(vcb, startingBlock, minBlocks, maxBlocks, + useMetaZone, actualStartBlock, actualNumBlocks); /* * If we allocated from a new position then * also update the roving allocatior. 
*/ - if ((err == noErr) && (*actualStartBlock > startingBlock)) - vcb->nextAllocation = *actualStartBlock; + if ((err == noErr) && + (*actualStartBlock > startingBlock) && + ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || + (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { + vcb->nextAllocation = *actualStartBlock; /* XXX */ + } } else { /* * Scan the bitmap once, gather the N largest free extents, then @@ -275,9 +279,13 @@ */ err = BlockAllocateKnown(vcb, maxBlocks, actualStartBlock, actualNumBlocks); if (err == dskFulErr) - err = BlockAllocateAny(vcb, startingBlock, vcb->totalBlocks, maxBlocks, actualStartBlock, actualNumBlocks); + err = BlockAllocateAny(vcb, startingBlock, vcb->totalBlocks, + maxBlocks, useMetaZone, actualStartBlock, + actualNumBlocks); if (err == dskFulErr) - err = BlockAllocateAny(vcb, 0, startingBlock, maxBlocks, actualStartBlock, actualNumBlocks); + err = BlockAllocateAny(vcb, 1, startingBlock, maxBlocks, + useMetaZone, actualStartBlock, + actualNumBlocks); } if (err == noErr) { @@ -291,13 +299,16 @@ // VCB_LOCK(vcb); - if (updateAllocPtr) + if (updateAllocPtr && + ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || + (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { vcb->nextAllocation = *actualStartBlock; - + } // // Update the number of free blocks on the volume // vcb->freeBlocks -= *actualNumBlocks; + hfs_generate_volume_notifications(VCBTOHFS(vcb)); VCB_UNLOCK(vcb); MarkVCBDirty(vcb); @@ -329,6 +340,7 @@ ;________________________________________________________________________________ */ +__private_extern__ OSErr BlockDeallocate ( ExtendedVCB *vcb, // Which volume to deallocate space on UInt32 firstBlock, // First block in range to deallocate @@ -356,44 +368,98 @@ // VCB_LOCK(vcb); vcb->freeBlocks += numBlocks; + hfs_generate_volume_notifications(VCBTOHFS(vcb)); if (vcb->nextAllocation == (firstBlock + numBlocks)) vcb->nextAllocation -= numBlocks; VCB_UNLOCK(vcb); MarkVCBDirty(vcb); - + Exit: return 
err; } -/* -;_______________________________________________________________________ -; -; Routine: FileBytesToBlocks -; -; Function: Divide numerator by denominator, rounding up the result if there -; was a remainder. This is frequently used for computing the number -; of whole and/or partial blocks used by some count of bytes. -; Actuall divides a 64 bit by a 32 bit into a 32bit result -; -; CAREFULL!!! THIS CAN CAUSE OVERFLOW....USER BEWARE!!! -;_______________________________________________________________________ -*/ -UInt32 FileBytesToBlocks( - SInt64 numerator, - UInt32 denominator) +UInt8 freebitcount[16] = { + 4, 3, 3, 2, 3, 2, 2, 1, /* 0 1 2 3 4 5 6 7 */ + 3, 2, 2, 1, 2, 1, 1, 0, /* 8 9 A B C D E F */ +}; + +__private_extern__ +UInt32 +MetaZoneFreeBlocks(ExtendedVCB *vcb) { - UInt32 quotient; - - quotient = (UInt32)(numerator / denominator); - if (quotient * denominator != numerator) - quotient++; + UInt32 freeblocks; + UInt32 *currCache; + UInt32 blockRef; + UInt32 bit; + UInt32 lastbit; + int bytesleft; + int bytesperblock; + UInt8 byte; + UInt8 *buffer; + blockRef = 0; + bytesleft = freeblocks = 0; + bit = VCBTOHFS(vcb)->hfs_metazone_start; + if (bit == 1) + bit = 0; - return quotient; + lastbit = VCBTOHFS(vcb)->hfs_metazone_end; + bytesperblock = vcb->vcbVBMIOSize; + + /* + * Count all the bits from bit to lastbit. + */ + while (bit < lastbit) { + /* + * Get next bitmap block. 
+ */ + if (bytesleft == 0) { + if (blockRef) { + (void) ReleaseBitmapBlock(vcb, blockRef, false); + blockRef = 0; + } + if (ReadBitmapBlock(vcb, bit, &currCache, &blockRef) != 0) { + return (0); + } + buffer = (UInt8 *)currCache; + bytesleft = bytesperblock; + } + byte = *buffer++; + freeblocks += freebitcount[byte & 0x0F]; + freeblocks += freebitcount[(byte >> 4) & 0x0F]; + bit += kBitsPerByte; + --bytesleft; + } + if (blockRef) + (void) ReleaseBitmapBlock(vcb, blockRef, false); + + return (freeblocks); } +/* + * Obtain the next allocation block (bit) that's + * outside the metadata allocation zone. + */ +static UInt32 NextBitmapBlock( + ExtendedVCB *vcb, + UInt32 bit) +{ + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) == 0) + return (bit); + /* + * Skip over metadata allocation zone. + */ + if ((bit >= hfsmp->hfs_metazone_start) && + (bit <= hfsmp->hfs_metazone_end)) { + bit = hfsmp->hfs_metazone_end + 1; + } + return (bit); +} + /* ;_______________________________________________________________________ @@ -476,6 +542,12 @@ Boolean dirty) { struct buf *bp = (struct buf *)blockRef; + + if (blockRef == 0) { + if (dirty) + panic("ReleaseBitmapBlock: missing bp"); + return (0); + } if (bp) { if (dirty) { @@ -511,6 +583,7 @@ startingBlock Preferred first block for allocation minBlocks Minimum number of contiguous blocks to allocate maxBlocks Maximum number of contiguous blocks to allocate + useMetaZone Outputs: actualStartBlock First block of range allocated, or 0 if error @@ -522,6 +595,7 @@ UInt32 startingBlock, UInt32 minBlocks, UInt32 maxBlocks, + Boolean useMetaZone, UInt32 *actualStartBlock, UInt32 *actualNumBlocks) { @@ -541,18 +615,22 @@ * with the free extent cache, this can lead to duplicate entries * in the cache, causing the same blocks to be allocated twice. 
*/ - err = BlockFindContiguous(vcb, startingBlock, vcb->totalBlocks, minBlocks, maxBlocks, - actualStartBlock, actualNumBlocks); + err = BlockFindContiguous(vcb, startingBlock, vcb->totalBlocks, minBlocks, + maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); if (err == dskFulErr && startingBlock != 0) { /* * Constrain the endingBlock so we don't bother looking for ranges * that would overlap those found in the previous call. */ - err = BlockFindContiguous(vcb, 0, startingBlock, minBlocks, maxBlocks, - actualStartBlock, actualNumBlocks); + err = BlockFindContiguous(vcb, 1, startingBlock, minBlocks, maxBlocks, + useMetaZone, actualStartBlock, actualNumBlocks); } if (err != noErr) goto Exit; + // sanity check + if ((*actualStartBlock + *actualNumBlocks) > vcb->totalBlocks) + panic("BlockAllocateContig: allocation overflow on \"%s\"", vcb->vcbVN); + // // Now mark those blocks allocated. // @@ -582,6 +660,7 @@ startingBlock Preferred first block for allocation endingBlock Last block to check + 1 maxBlocks Maximum number of contiguous blocks to allocate + useMetaZone Outputs: actualStartBlock First block of range allocated, or 0 if error @@ -593,6 +672,7 @@ UInt32 startingBlock, register UInt32 endingBlock, UInt32 maxBlocks, + Boolean useMetaZone, UInt32 *actualStartBlock, UInt32 *actualNumBlocks) { @@ -601,8 +681,8 @@ register UInt32 currentWord; // Pointer to current word within bitmap block register UInt32 bitMask; // Word with given bits already set (ready to OR in) register UInt32 wordsLeft; // Number of words left in this bitmap block - UInt32 *buffer = NULL; - UInt32 *currCache = NULL; + UInt32 *buffer = NULL; + UInt32 *currCache = NULL; UInt32 blockRef; UInt32 bitsPerBlock; UInt32 wordsPerBlock; @@ -614,12 +694,18 @@ maxBlocks = endingBlock - startingBlock; } + /* + * Skip over metadata blocks. 
+ */ + if (!useMetaZone) + startingBlock = NextBitmapBlock(vcb, startingBlock); + // // Pre-read the first bitmap block // - err = ReadBitmapBlock(vcb, startingBlock, &currCache, &blockRef); + err = ReadBitmapBlock(vcb, startingBlock, &currCache, &blockRef); if (err != noErr) goto Exit; - buffer = currCache; + buffer = currCache; // // Set up the current position within the block @@ -644,7 +730,7 @@ while (block < endingBlock) { if ((currentWord & bitMask) == 0) break; - + // Next bit ++block; bitMask >>= 1; @@ -652,27 +738,36 @@ // Next word bitMask = kHighBitInWordMask; ++buffer; - + if (--wordsLeft == 0) { // Next block - buffer = currCache = NULL; + buffer = currCache = NULL; err = ReleaseBitmapBlock(vcb, blockRef, false); if (err != noErr) goto Exit; - err = ReadBitmapBlock(vcb, block, &currCache, &blockRef); + /* + * Skip over metadata blocks. + */ + if (!useMetaZone) { + block = NextBitmapBlock(vcb, block); + if (block >= endingBlock) { + err = dskFulErr; + goto Exit; + } + } + err = ReadBitmapBlock(vcb, block, &currCache, &blockRef); if (err != noErr) goto Exit; - buffer = currCache; + buffer = currCache; wordsLeft = wordsPerBlock; } - currentWord = SWAP_BE32 (*buffer); } } // Did we get to the end of the bitmap before finding a free block? // If so, then couldn't allocate anything. - if (block == endingBlock) { + if (block >= endingBlock) { err = dskFulErr; goto Exit; } @@ -717,13 +812,25 @@ if (--wordsLeft == 0) { // Next block - buffer = currCache = NULL; + buffer = currCache = NULL; err = ReleaseBitmapBlock(vcb, blockRef, true); if (err != noErr) goto Exit; - err = ReadBitmapBlock(vcb, block, &currCache, &blockRef); + /* + * Skip over metadata blocks. 
+ */ + if (!useMetaZone) { + UInt32 nextBlock; + + nextBlock = NextBitmapBlock(vcb, block); + if (nextBlock != block) { + goto Exit; /* allocation gap, so stop */ + } + } + + err = ReadBitmapBlock(vcb, block, &currCache, &blockRef); if (err != noErr) goto Exit; - buffer = currCache; + buffer = currCache; // XXXdbg if (hfsmp->jnl) { @@ -741,6 +848,10 @@ Exit: if (err == noErr) { *actualNumBlocks = block - *actualStartBlock; + + // sanity check + if ((*actualStartBlock + *actualNumBlocks) > vcb->totalBlocks) + panic("BlockAllocateAny: allocation overflow on \"%s\"", vcb->vcbVN); } else { *actualStartBlock = 0; @@ -828,6 +939,10 @@ vcb->vcbFreeExt[i-1].blockCount = newBlockCount; } + // sanity check + if ((*actualStartBlock + *actualNumBlocks) > vcb->totalBlocks) + panic("BlockAllocateKnown: allocation overflow on \"%s\"", vcb->vcbVN); + // // Now mark the found extent in the bitmap // @@ -851,7 +966,8 @@ numBlocks Number of blocks to mark as allocated _______________________________________________________________________ */ -static OSErr BlockMarkAllocated( +__private_extern__ +OSErr BlockMarkAllocated( ExtendedVCB *vcb, UInt32 startingBlock, register UInt32 numBlocks) @@ -869,6 +985,7 @@ // XXXdbg struct hfsmount *hfsmp = VCBTOHFS(vcb); + // // Pre-read the bitmap block containing the first word of allocation // @@ -1018,7 +1135,8 @@ numBlocks Number of blocks to mark as freed _______________________________________________________________________ */ -static OSErr BlockMarkFree( +__private_extern__ +OSErr BlockMarkFree( ExtendedVCB *vcb, UInt32 startingBlock, register UInt32 numBlocks) @@ -1036,6 +1154,12 @@ // XXXdbg struct hfsmount *hfsmp = VCBTOHFS(vcb); + if (startingBlock + numBlocks > vcb->totalBlocks) { + panic("hfs: block mark free: trying to free non-existent blocks (%d %d %d)\n", + startingBlock, numBlocks, vcb->totalBlocks); + } + + // // Pre-read the bitmap block containing the first word of allocation // @@ -1075,11 +1199,9 @@ numBits = numBlocks; // 
entire allocation is inside this one word bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); // turn off bits after last } -#if DEBUG_BUILD if ((*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { - panic("BlockMarkFree: blocks not allocated!"); + goto Corruption; } -#endif *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap numBlocks -= numBits; // adjust number of blocks left to free @@ -1112,12 +1234,9 @@ currentWord = buffer; wordsLeft = wordsPerBlock; } - -#if DEBUG_BUILD if (*currentWord != SWAP_BE32 (kAllBitsSetInWord)) { - panic("BlockMarkFree: blocks not allocated!"); + goto Corruption; } -#endif *currentWord = 0; // clear the entire word numBlocks -= kBitsPerWord; @@ -1151,11 +1270,9 @@ currentWord = buffer; wordsLeft = wordsPerBlock; } -#if DEBUG_BUILD if ((*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { - panic("BlockMarkFree: blocks not allocated!"); + goto Corruption; } -#endif *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap // No need to update currentWord or wordsLeft @@ -1167,6 +1284,17 @@ (void)ReleaseBitmapBlock(vcb, blockRef, true); return err; + +Corruption: +#if DEBUG_BUILD + panic("BlockMarkFree: blocks not allocated!"); +#else + printf("hfs: WARNING - blocks on volume %s not allocated!\n", vcb->vcbVN); + vcb->vcbAtrb |= kHFSVolumeInconsistentMask; + MarkVCBDirty(vcb); + err = EIO; + goto Exit; +#endif } @@ -1185,6 +1313,7 @@ endingBlock Last possible block in range + 1 minBlocks Minimum number of blocks needed. Must be > 0. 
maxBlocks Maximum (ideal) number of blocks desired + useMetaZone OK to dip into metadata allocation zone Outputs: actualStartBlock First block of range found, or 0 if error @@ -1202,6 +1331,7 @@ UInt32 endingBlock, UInt32 minBlocks, UInt32 maxBlocks, + Boolean useMetaZone, UInt32 *actualStartBlock, UInt32 *actualNumBlocks) { @@ -1228,7 +1358,13 @@ stopBlock = endingBlock - minBlocks + 1; currentBlock = startingBlock; firstBlock = 0; - + + /* + * Skip over metadata blocks. + */ + if (!useMetaZone) + currentBlock = NextBitmapBlock(vcb, currentBlock); + // // Pre-read the first bitmap block. // @@ -1240,7 +1376,7 @@ // wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; - wordsLeft = (startingBlock / kBitsPerWord) & (wordsPerBlock-1); // Current index into buffer + wordsLeft = (currentBlock / kBitsPerWord) & (wordsPerBlock-1); // Current index into buffer currentWord = buffer + wordsLeft; wordsLeft = wordsPerBlock - wordsLeft; @@ -1287,6 +1423,15 @@ err = ReleaseBitmapBlock(vcb, blockRef, false); if (err != noErr) goto ErrorExit; + /* + * Skip over metadata blocks. + */ + if (!useMetaZone) { + currentBlock = NextBitmapBlock(vcb, currentBlock); + if (currentBlock >= stopBlock) + break; + } + err = ReadBitmapBlock(vcb, currentBlock, &buffer, &blockRef); if ( err != noErr ) goto ErrorExit; @@ -1362,6 +1507,18 @@ buffer = NULL; err = ReleaseBitmapBlock(vcb, blockRef, false); if (err != noErr) goto ErrorExit; + + /* + * Skip over metadata blocks. 
+ */ + if (!useMetaZone) { + UInt32 nextBlock; + + nextBlock = NextBitmapBlock(vcb, currentBlock); + if (nextBlock != currentBlock) { + break; /* allocation gap, so stop */ + } + } err = ReadBitmapBlock(vcb, currentBlock, &buffer, &blockRef); if ( err != noErr ) goto ErrorExit; diff -urN xnu-344.49/bsd/hfs/hfscommon/headers/BTreesInternal.h xnu-517/bsd/hfs/hfscommon/headers/BTreesInternal.h --- xnu-344.49/bsd/hfs/hfscommon/headers/BTreesInternal.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/headers/BTreesInternal.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -220,6 +220,8 @@ kReservedBTreeType = 255 // }; +#define kBTreeHeaderUserBytes 128 + typedef BTreeKey *BTreeKeyPtr; @@ -236,7 +238,8 @@ ItemCount numRecords; ItemCount numNodes; ItemCount numFreeNodes; - UInt32 reserved; + UInt8 keyCompareType; + UInt8 reserved[3]; }; typedef struct BTreeInfoRec BTreeInfoRec; typedef BTreeInfoRec *BTreeInfoPtr; @@ -282,12 +285,8 @@ typedef SInt32 (* IterateCallBackProcPtr)(BTreeKeyPtr key, void * record, UInt16 recordLen, void * state); -extern OSStatus BTOpenPath (FCB *filePtr, - KeyCompareProcPtr keyCompareProc, - GetBlockProcPtr getBlockProc, - ReleaseBlockProcPtr releaseBlockProc, - SetEndOfForkProcPtr setEndOfForkProc, - SetBlockSizeProcPtr setBlockSizeProc ); + +extern OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc); extern OSStatus BTClosePath (FCB *filePtr ); @@ -342,9 +341,19 @@ extern OSStatus BTSetLastSync (FCB *filePtr, UInt32 lastfsync ); -extern OSStatus BTCheckFreeSpace (FCB *filePtr); - extern OSStatus BTHasContiguousNodes(FCB *filePtr); + +extern OSStatus BTGetUserData(FCB *filePtr, void * dataPtr, int dataSize); + +extern OSStatus BTSetUserData(FCB *filePtr, void * dataPtr, int dataSize); + +/* B-tree node reserve routines. 
*/ +extern void BTReserveSetup(void); + +extern int BTReserveSpace(FCB *file, int operations, void * data); + +extern int BTReleaseReserve(FCB *file, void * data); + #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff -urN xnu-344.49/bsd/hfs/hfscommon/headers/BTreesPrivate.h xnu-517/bsd/hfs/hfscommon/headers/BTreesPrivate.h --- xnu-344.49/bsd/hfs/hfscommon/headers/BTreesPrivate.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/headers/BTreesPrivate.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -188,7 +188,7 @@ typedef struct BTreeControlBlock { // fields specific to BTree CBs - UInt8 reserved1; // keep for alignment with old style fields + UInt8 keyCompareType; /* Key string Comparison Type */ UInt8 btreeType; UInt16 treeDepth; FileReference fileRefNum; // refNum of btree file @@ -224,7 +224,7 @@ UInt32 numHintChecks; UInt32 numPossibleHints; // Looks like a formated hint UInt32 numValidHints; // Hint used to find correct record. 
- + UInt32 reservedNodes; } BTreeControlBlock, *BTreeControlBlockPtr; @@ -317,6 +317,10 @@ UInt32 CalcMapBits (BTreeControlBlockPtr btreePtr); +SInt32 BTAvailableNodes (BTreeControlBlock *btree); + +void BTUpdateReserve (BTreeControlBlockPtr btreePtr, + int nodes); //////////////////////////////// Misc Operations //////////////////////////////// diff -urN xnu-344.49/bsd/hfs/hfscommon/headers/CatalogPrivate.h xnu-517/bsd/hfs/hfscommon/headers/CatalogPrivate.h --- xnu-344.49/bsd/hfs/hfscommon/headers/CatalogPrivate.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/headers/CatalogPrivate.h Sat Oct 25 00:25:25 2003 @@ -188,6 +188,8 @@ extern OSErr ExchangeFiles( FIDParam *filePB, WDCBRecPtr *wdcbPtr ); #endif +extern void UpdateCatalogName( ConstStr31Param srcName, Str31 destName ); + // Catalog Iterator Routines diff -urN xnu-344.49/bsd/hfs/hfscommon/headers/FileMgrInternal.h xnu-517/bsd/hfs/hfscommon/headers/FileMgrInternal.h --- xnu-344.49/bsd/hfs/hfscommon/headers/FileMgrInternal.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/hfs/hfscommon/headers/FileMgrInternal.h Sat Oct 25 00:25:25 2003 @@ -118,6 +118,7 @@ kEFReserveMask = 0x04, /* keep block reserve */ kEFDeferMask = 0x08, /* defer file block allocations */ kEFNoClumpMask = 0x10, /* don't round up to clump size */ + kEFMetadataMask = 0x20, /* metadata allocation */ kTFTrunExtBit = 0, /* truncate to the extent containing new PEOF*/ kTFTrunExtMask = 1 @@ -289,9 +290,10 @@ EXTERN_API_C( OSErr ) BlockAllocate (ExtendedVCB * vcb, UInt32 startingBlock, - SInt64 bytesRequested, - SInt64 bytesMaximum, + UInt32 minBlocks, + UInt32 maxBlocks, Boolean forceContiguous, + Boolean useMetaZone, UInt32 * startBlock, UInt32 * actualBlocks); @@ -300,10 +302,19 @@ UInt32 firstBlock, UInt32 numBlocks); +EXTERN_API_C( OSErr ) +BlockMarkAllocated(ExtendedVCB *vcb, UInt32 startingBlock, UInt32 numBlocks); + +EXTERN_API_C( OSErr ) +BlockMarkFree( ExtendedVCB *vcb, UInt32 startingBlock, UInt32 numBlocks); + EXTERN_API_C( UInt32 ) 
FileBytesToBlocks (SInt64 numerator, UInt32 denominator); +EXTERN_API_C( UInt32 ) +MetaZoneFreeBlocks(ExtendedVCB *vcb); + /* File Extent Mapping routines*/ EXTERN_API_C( OSErr ) FlushExtentFile (ExtendedVCB * vcb); @@ -337,6 +348,9 @@ off_t offset, daddr_t * startBlock, size_t * availableBytes); + +EXTERN_API_C( int ) +AddFileExtent (ExtendedVCB *vcb, FCB *fcb, UInt32 startBlock, UInt32 blockCount); #if TARGET_API_MACOS_X EXTERN_API_C( Boolean ) diff -urN xnu-344.49/bsd/i386/ucontext.h xnu-517/bsd/i386/ucontext.h --- xnu-344.49/bsd/i386/ucontext.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/i386/ucontext.h Sat Oct 25 00:25:25 2003 @@ -43,6 +43,13 @@ typedef struct mcontext * mcontext_t; +struct mcontext64 { + struct sigcontext sc; +}; +#define I386_MCONTEXT64_SIZE sizeof(struct mcontext64) + +typedef struct mcontext64 * mcontext64_t; + #endif /* __APPLE_API_UNSTABLE */ #endif /* _I386_UCONTEXT_H_ */ diff -urN xnu-344.49/bsd/i386/vmparam.h xnu-517/bsd/i386/vmparam.h --- xnu-344.49/bsd/i386/vmparam.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/i386/vmparam.h Sat Oct 25 00:25:25 2003 @@ -28,7 +28,7 @@ #include -#define USRSTACK 0xc0000000 +#define USRSTACK 0xbfff9000 /* * Virtual memory related constants, all in bytes @@ -40,10 +40,10 @@ #define MAXDSIZ (RLIM_INFINITY) /* max data size */ #endif #ifndef DFLSSIZ -#define DFLSSIZ (512*1024) /* initial stack size limit */ +#define DFLSSIZ (8*1024*1024 - 7*4*1024) /* initial stack size limit */ #endif #ifndef MAXSSIZ -#define MAXSSIZ (64*1024*1024) /* max stack size */ +#define MAXSSIZ (64*1024*1024 - 7*4*1024) /* max stack size */ #endif #ifndef DFLCSIZ #define DFLCSIZ (0) /* initial core size limit */ diff -urN xnu-344.49/bsd/if/ppc/if_en.c xnu-517/bsd/if/ppc/if_en.c --- xnu-344.49/bsd/if/ppc/if_en.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/if/ppc/if_en.c Thu Jan 1 01:00:00 1970 @@ -1,1132 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997 Apple Computer, Inc. 
- * - * ethernet driver for mace on-board ethernet - * - * HISTORY - * - * Dieter Siegmund (dieter@next.com) Thu Feb 27 18:25:33 PST 1997 - * - ripped off code from MK/LINUX, turned it into a polled-mode - * driver for the PCI (8500) class machines - * - * Dieter Siegmund (dieter@next.com) Fri Mar 21 12:41:29 PST 1997 - * - reworked to support a BSD-style interface, and to support kdb polled - * interface and interrupt-driven interface concurrently - * - * Justin Walker (justin@apple.com) Tue May 20 10:29:29 PDT 1997 - * - Added multicast support - * - * Dieter Siegmund (dieter@next.com) Thu May 29 15:02:29 PDT 1997 - * - fixed problem with sending arp packets for ip address 0.0.0.0 - * - use kdp_register_send_receive() instead of defining - * en_send_pkt/en_recv_pkt routines to avoid name space - * collisions with IOEthernetDebugger and allow these routines to be - * overridden by a driverkit-style driver - * - * Dieter Siegmund (dieter@apple.com) Tue Jun 24 18:29:15 PDT 1997 - * - don't let the adapter auto-strip 802.3 receive frames, it messes - * up the frame size logic - * - * Dieter Siegmund (dieter@apple.com) Tue Aug 5 16:24:52 PDT 1997 - * - handle multicast address deletion correctly - */ -#ifdef MACE_DEBUG -/* - * Caveat: MACE_DEBUG delimits some code that is getting kind of - * stale. Before blindly turning on MACE_DEBUG for your - * testing, take a look at the code enabled by it to check - * that it is reasonably sane. 
- */ -#endif - -#include -#include - -#define RECEIVE_INT DBDMA_INT_ALWAYS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "if_en.h" -#include "mace.h" - -extern int kdp_flag; - -#if NBPFILTER > 0 -#include -#endif - -static void polled_send_pkt(char * data, int len); -static void polled_receive_pkt(char *data, int *len, int timeout_ms); -void mace_dbdma_rx_intr(int unit, void *, void *); -void mace_dbdma_tx_intr(int, void *, void *); -void mace_pci_intr(int, void *); -void mace_service_queue(struct ifnet * ifp); - -#ifdef MACE_DEBUG -static int mace_watchdog(); -#endif - -static __inline__ vm_offset_t -KVTOPHYS(vm_offset_t v) -{ - return (v); -} - -typedef int (*funcptr)(char *, int, void *); - -#ifdef MACE_DEBUG -static int -macAddrsEqual(unsigned char * one, unsigned char * two) -{ - int i; - - for (i = 0; i < NUM_EN_ADDR_BYTES; i++) - if (*one++ != *two++) - return 0; - return 1; -} -#endif - -static __inline__ int -isprint(unsigned char c) -{ - return (c >= 0x20 && c <= 0x7e); -} - -static void -printEtherHeader(enet_addr_t * dh, enet_addr_t * sh, u_short etype) -{ - u_char * dhost = dh->ether_addr_octet; - u_char * shost = sh->ether_addr_octet; - - printf("Dst: %x:%x:%x:%x:%x:%x Src: %x:%x:%x:%x:%x:%x Type: 0x%x\n", - dhost[0], dhost[1], dhost[2], dhost[3], dhost[4], dhost[5], - shost[0], shost[1], shost[2], shost[3], shost[4], shost[5], - etype); -} - -static void -printData(u_char * data_p, int n_bytes) -{ -#define CHARS_PER_LINE 16 - char line_buf[CHARS_PER_LINE + 1]; - int line_pos; - int offset; - - for (line_pos = 0, offset = 0; offset < n_bytes; offset++, data_p++) { - if (line_pos == 0) { - printf("%04d ", offset); - } - - line_buf[line_pos] = isprint(*data_p) ? 
*data_p : '.'; - printf(" %02x", *data_p); - line_pos++; - if (line_pos == CHARS_PER_LINE) { - line_buf[CHARS_PER_LINE] = '\0'; - printf(" %s\n", line_buf); - line_pos = 0; - } - } - if (line_pos) { /* need to finish up the line */ - for (; line_pos < CHARS_PER_LINE; line_pos++) { - printf(" "); - line_buf[line_pos] = ' '; - } - line_buf[CHARS_PER_LINE] = '\0'; - printf(" %s\n", line_buf); - } -} - -static void -printEtherPacket(enet_addr_t * dhost, enet_addr_t * shost, u_short type, - u_char * data_p, int n_bytes) -{ - printEtherHeader(dhost, shost, type); - printData(data_p, n_bytes); -} - -void -printContiguousEtherPacket(u_char * data_p, int n_bytes) -{ - printEtherPacket((enet_addr_t *)data_p, - (enet_addr_t *)(data_p + NUM_EN_ADDR_BYTES), - *((u_short *)(data_p + (NUM_EN_ADDR_BYTES * 2))), - data_p, n_bytes); -} - -mace_t mace; - -#define MACE_DMA_AREA_SIZE (ETHER_RX_NUM_DBDMA_BUFS * ETHERNET_BUF_SIZE + PG_SIZE) -static unsigned long mace_rx_dma_area[(MACE_DMA_AREA_SIZE + sizeof(long))/sizeof(long)]; - -static unsigned long mace_tx_dma_area[(ETHERNET_BUF_SIZE + PG_SIZE + sizeof(long))/sizeof(long)]; - -/* - * mace_get_hwid - * - * This function computes the Ethernet Hardware address - * from PROM. (Its best not to ask how this is done.) - */ - -unsigned char -mace_swapbits(unsigned char bits) -{ - unsigned char mask = 0x1, i, newbits = 0; - - for (i = 0x80; i; mask <<= 1, i >>=1) { - if (bits & mask) - newbits |= i; - } - - return newbits; -} - -void -mace_get_hwid(unsigned char *hwid_addr, mace_t * m) -{ - int i; - - for (i = 0; i < NUM_EN_ADDR_BYTES; i++, hwid_addr += 16) { - m->macaddr[i] = mace_swapbits(*hwid_addr); - } -} - -/* - * mace_reset - * - * Reset the board.. - */ - -void -mace_reset() -{ - dbdma_reset(DBDMA_ETHERNET_RV); - dbdma_reset(DBDMA_ETHERNET_TX); -} - - -/* - * mace_geteh: - * - * This function gets the ethernet address (array of 6 unsigned - * bytes) from the MACE board registers. 
- * - */ - -void -mace_geteh(char *ep) -{ - int i; - unsigned char ep_temp; - - mace.ereg->iac = IAC_PHYADDR; eieio(); - - for (i = 0; i < ETHER_ADD_SIZE; i++) { - ep_temp = mace.ereg->padr; eieio(); - *ep++ = ep_temp; - } -} - -/* - * mace_seteh: - * - * This function sets the ethernet address (array of 6 unsigned - * bytes) on the MACE board. - */ - -static void -mace_seteh(char *ep) -{ - int i; - unsigned char status; - - if (mace.chip_id != MACE_REVISION_A2) { - mace.ereg->iac = IAC_ADDRCHG|IAC_PHYADDR; eieio(); - - while ((status = mace.ereg->iac)) { - if ((status & IAC_ADDRCHG) == 0) { - eieio(); - break; - } - eieio(); - } - } - else { - /* start to load the address.. */ - mace.ereg->iac = IAC_PHYADDR; eieio(); - } - - for (i = 0; i < NUM_EN_ADDR_BYTES; i++) { - mace.ereg->padr = *(ep+i); eieio(); - } - return; -} - -/* - * mace_setup_dbdma - * - * Setup various dbdma pointers. - */ - -void -mace_setup_dbdma() -{ - mace_t * m = &mace; - int i; - dbdma_command_t * d; - vm_offset_t address; - dbdma_regmap_t * regmap; - -#define ALIGN_MASK 0xfffffffcUL - if (m->rv_dma_area == 0) { - m->rv_dma_area = (unsigned char *) - ((((unsigned long)mace_rx_dma_area) + 3) & ALIGN_MASK); - m->rv_dma = dbdma_alloc(ETHER_RX_NUM_DBDMA_BUFS + 2); - m->tx_dma = dbdma_alloc(TX_NUM_DBDMA); - m->tx_dma_area = (unsigned char *) - ((((unsigned long)mace_tx_dma_area) + 3) & ALIGN_MASK); - } - - /* set up a ring of buffers */ - d = m->rv_dma; - for (i = 0; i < ETHER_RX_NUM_DBDMA_BUFS; i++, d++) { - address = (vm_offset_t) KVTOPHYS((vm_offset_t)&m->rv_dma_area[i*ETHERNET_BUF_SIZE]); - DBDMA_BUILD(d, DBDMA_CMD_IN_LAST, 0, ETHERNET_BUF_SIZE, - address, RECEIVE_INT, - DBDMA_WAIT_NEVER, - DBDMA_BRANCH_NEVER); - } - - /* stop when we hit the end of the list */ - DBDMA_BUILD(d, DBDMA_CMD_STOP, 0, 0, 0, RECEIVE_INT, - DBDMA_WAIT_NEVER, DBDMA_BRANCH_NEVER); - d++; - - /* branch to command at "address" ie. 
element 0 of the "array" */ - DBDMA_BUILD(d, DBDMA_CMD_NOP, 0, 0, 0, DBDMA_INT_NEVER, - DBDMA_WAIT_NEVER, DBDMA_BRANCH_ALWAYS); - address = (vm_offset_t) KVTOPHYS((vm_offset_t)m->rv_dma); - dbdma_st4_endian(&d->d_cmddep, address); - - m->rv_head = 0; - m->rv_tail = ETHER_RX_NUM_DBDMA_BUFS; /* always contains DBDMA_CMD_STOP */ - - /* stop/init/restart dma channel */ - dbdma_reset(DBDMA_ETHERNET_RV); - dbdma_reset(DBDMA_ETHERNET_TX); - - /* Set the wait value.. */ - regmap = DBDMA_REGMAP(DBDMA_ETHERNET_RV); - dbdma_st4_endian(®map->d_wait, DBDMA_SET_CNTRL(0x00)); - - /* Set the tx wait value */ - regmap = DBDMA_REGMAP(DBDMA_ETHERNET_TX); - dbdma_st4_endian(®map->d_wait, DBDMA_SET_CNTRL(0x20)); - - flush_cache_v((vm_offset_t)m->rv_dma, - sizeof(dbdma_command_t) * (ETHER_RX_NUM_DBDMA_BUFS + 2)); - /* start receiving */ - dbdma_start(DBDMA_ETHERNET_RV, m->rv_dma); -} - -#ifdef MACE_DEBUG -static unsigned char testBuffer[PG_SIZE * 4]; -static unsigned char testMsg[] = "mace ethernet interface test"; - -static void -send_test_packet() -{ - unsigned char * tp; - - bzero(testBuffer, sizeof(testBuffer)); - - tp = testBuffer; - - /* send self-addressed packet */ - bcopy(&mace.macaddr[0], tp, NUM_EN_ADDR_BYTES); - tp += NUM_EN_ADDR_BYTES; - bcopy(&mace.macaddr[0], tp, NUM_EN_ADDR_BYTES); - tp += NUM_EN_ADDR_BYTES; - *tp++ = 0; - *tp++ = 0; - bcopy(testMsg, tp, sizeof(testMsg)); - polled_send_pkt(testBuffer, 80); - return; -} -#endif - -/* - * Function: init_mace - * - * Purpose: - * Called early on, initializes the adapter and readies it for - * kdb kernel debugging. 
- */ -void -init_mace() -{ - unsigned char status; - mace_t * m = &mace; - struct mace_board * ereg; - int mpc = 0; - - /* - * Only use in-kernel driver for early debugging (bootargs: kdp=1 or kdp=3) - */ - if ( (kdp_flag & 1) == 0 ) - { - return; - } - - bzero(&mace, sizeof(mace)); - - /* get the ethernet registers' mapped address */ - ereg = m->ereg - = (struct mace_board *) POWERMAC_IO(PCI_ETHERNET_BASE_PHYS); - mace_get_hwid((unsigned char *)POWERMAC_IO(PCI_ETHERNET_ADDR_PHYS), m); - - /* Reset the board & AMIC.. */ - mace_reset(); - - /* grab the MACE chip rev */ - m->chip_id = (ereg->chipid2 << 8 | ereg->chipid1); - - /* don't auto-strip for 802.3 */ - m->ereg->rcvfc &= ~(RCVFC_ASTRPRCV); - - /* set the ethernet address */ - mace_seteh(mace.macaddr); - { - unsigned char macaddr[NUM_EN_ADDR_BYTES]; - mace_geteh(macaddr); - printf("mace ethernet [%02x:%02x:%02x:%02x:%02x:%02x]\n", - macaddr[0], macaddr[1], macaddr[2], - macaddr[3], macaddr[4], macaddr[5]); - } - - /* Now clear the Multicast filter */ - if (m->chip_id != MACE_REVISION_A2) { - ereg->iac = IAC_ADDRCHG|IAC_LOGADDR; eieio(); - - while ((status = ereg->iac)) { - if ((status & IAC_ADDRCHG) == 0) - break; - eieio(); - } - eieio(); - } - else { - ereg->iac = IAC_LOGADDR; eieio(); - } - { - int i; - - for (i=0; i < 8; i++) - { ereg->ladrf = 0; - eieio(); - } - } - - /* register interrupt routines */ - mace_setup_dbdma(); - - /* Start the chip... 
*/ - m->ereg->maccc = MACCC_ENXMT|MACCC_ENRCV; eieio(); - { - volatile char ch = mace.ereg->ir; eieio(); - } - - delay(500); /* paranoia */ - mace.ereg->imr = 0xfe; eieio(); - - /* register our debugger routines */ - kdp_register_send_receive((kdp_send_t)polled_send_pkt, - (kdp_receive_t)polled_receive_pkt); - -#if 0 - printf("Testing 1 2 3\n"); - send_test_packet(); - printf("Testing 1 2 3\n"); - send_test_packet(); - printf("Testing 1 2 3\n"); - send_test_packet(); - do { - static unsigned char buf[ETHERNET_BUF_SIZE]; - int len; - int nmpc = mace.ereg->mpc; eieio(); - - if (nmpc > mpc) { - mpc = nmpc; - printf("mpc %d\n", mpc); - } - polled_receive_pkt(buf, &len, 100); - if (len > 0) { - printf("rx %d\n", len); - printContiguousEtherPacket(buf, len); - } - } while(1); -#endif - - return; -} - -#ifdef MACE_DEBUG -static void -txstatus(char * msg) -{ - volatile dbdma_regmap_t * dmap = DBDMA_REGMAP(DBDMA_ETHERNET_TX); - volatile unsigned long status; - volatile unsigned long intr; - volatile unsigned long branch; - volatile unsigned long wait; - - status = dbdma_ld4_endian(&dmap->d_status); eieio(); - intr = dbdma_ld4_endian(&dmap->d_intselect); eieio(); - branch = dbdma_ld4_endian(&dmap->d_branch); eieio(); - wait = dbdma_ld4_endian(&dmap->d_wait); eieio(); - printf("(%s s=0x%x i=0x%x b=0x%x w=0x%x)", msg, status, intr, branch, - wait); - return; -} -#endif - -static void -tx_dbdma(char * data, int len) -{ - unsigned long count; - dbdma_command_t * d; - unsigned long page; - - d = mace.tx_dma; - page = ((unsigned long) data) & PG_MASK; - if ((page + len) <= PG_SIZE) { /* one piece dma */ - DBDMA_BUILD(d, DBDMA_CMD_OUT_LAST, DBDMA_KEY_STREAM0, - len, - (vm_offset_t) KVTOPHYS((vm_offset_t) data), - DBDMA_INT_NEVER, - DBDMA_WAIT_IF_FALSE, DBDMA_BRANCH_NEVER); - } - else { /* two piece dma */ - count = PG_SIZE - page; - DBDMA_BUILD(d, DBDMA_CMD_OUT_MORE, DBDMA_KEY_STREAM0, - count, - (vm_offset_t)KVTOPHYS((vm_offset_t) data), - DBDMA_INT_NEVER, - DBDMA_WAIT_NEVER, 
DBDMA_BRANCH_NEVER); - d++; - DBDMA_BUILD(d, DBDMA_CMD_OUT_LAST, DBDMA_KEY_STREAM0, - len - count, (vm_offset_t) - KVTOPHYS((vm_offset_t)((unsigned char *)data + count)), - DBDMA_INT_NEVER, - DBDMA_WAIT_IF_FALSE, DBDMA_BRANCH_NEVER); - } - d++; - DBDMA_BUILD(d, DBDMA_CMD_LOAD_QUAD, DBDMA_KEY_SYSTEM, - 1, KVTOPHYS((vm_offset_t) &mace.ereg->xmtfs),DBDMA_INT_NEVER, - DBDMA_WAIT_NEVER, DBDMA_BRANCH_NEVER); - d++; - DBDMA_BUILD(d, DBDMA_CMD_LOAD_QUAD, DBDMA_KEY_SYSTEM, - 1, KVTOPHYS((vm_offset_t) &mace.ereg->ir), DBDMA_INT_ALWAYS, - DBDMA_WAIT_NEVER, DBDMA_BRANCH_NEVER); - d++; - DBDMA_BUILD(d, DBDMA_CMD_STOP, 0, 0, 0, 0, 0, 0); - flush_cache_v((vm_offset_t)mace.tx_dma, sizeof(dbdma_command_t) * TX_NUM_DBDMA); - dbdma_start(DBDMA_ETHERNET_TX, mace.tx_dma); - return; - -} - -static void -waitForDBDMADone(char * msg) -{ - { - /* wait for tx dma completion */ - volatile dbdma_regmap_t * dmap = DBDMA_REGMAP(DBDMA_ETHERNET_TX); - int i; - volatile unsigned long val; - - i = 0; - do { - val = dbdma_ld4_endian(&dmap->d_status); eieio(); - delay(50); - i++; - } while ((i < 100000) && (val & DBDMA_CNTRL_ACTIVE)); - if (i == 100000) - printf("mace(%s): tx_dbdma poll timed out 0x%x", msg, val); - } -} - -void -mace_service_queue(struct ifnet * ifp) -{ - unsigned char * buf_p; - struct mbuf * m; - struct mbuf * mp; - int len; - - if (mace.tx_busy) { /* transmit in progress? 
*/ - return; - } - - IF_DEQUEUE(&(ifp->if_snd), m); - if (m == 0) { - return; - } - - len = m->m_pkthdr.len; - - if (len > ETHERMAXPACKET) { - printf("mace_start: packet too big (%d), dropping\n", len); - m_freem(m); - return; - - } - buf_p = mace.tx_dma_area; - if (m->m_nextpkt) { - printf("mace: sending more than one mbuf\n"); - } - for (mp = m; mp; mp = mp->m_next) { - if (mp->m_len == 0) - continue; - bcopy(mtod(mp, caddr_t), buf_p, min(mp->m_len, len)); - len -= mp->m_len; - buf_p += mp->m_len; - } - m_freem(m); - -#if NBPFILTER > 0 - if (ifp->if_bpf) - BPF_TAP(ifp->if_bpf, mace.tx_dma_area, m->m_pkthdr.len); -#endif - -#if 0 - printf("tx packet %d\n", m->m_pkthdr.len); - printContiguousEtherPacket(mace.tx_dma_area, m->m_pkthdr.len); -#endif - - /* fill in the dbdma records and kick off the dma */ - tx_dbdma(mace.tx_dma_area, m->m_pkthdr.len); - mace.tx_busy = 1; - return; -} - -#ifdef MACE_DEBUG -static int -mace_watchdog() -{ - struct ifnet * ifp = &mace.en_arpcom.ac_if; - int s; - - mace.txwatchdog++; - s = splnet(); - if (mace.rxintr == 0) { - printf("rx is hung up\n"); - rx_intr(); - } - mace.rxintr = 0; -#if 0 - if (mace.txintr == 0 && ifp->if_snd.ifq_head) { - if (mace.tx_busy) - dbdma_stop(DBDMA_ETHERNET_TX); - mace.tx_busy = 0; - mace_service_queue(ifp); - } - mace.txintr = 0; -#endif - timeout(mace_watchdog, 0, 10*hz); /* just in case we drop an interrupt */ - return (0); -} -#endif /* MACE_DEBUG */ - -static int -mace_start(struct ifnet * ifp) -{ -// int i = mace.tx_busy; - -// printf("mace_start %s\n", mace.tx_busy ? 
"(txBusy)" : ""); - mace_service_queue(ifp); - -// if (mace.tx_busy && !i) -// printf("(txStarted)\n"); - return 0; -} - -int -mace_recv_pkt(funcptr pktfunc, void * p) -{ - vm_offset_t address; - struct mace_board * board; - long bytes; - int done = 0; - int doContinue = 0; - mace_t * m; - unsigned long resid; - unsigned short status; - int tail; - - m = &mace; - board = m->ereg; - - /* remember where the tail was */ - tail = m->rv_tail; - for (done = 0; (done == 0) && (m->rv_head != tail);) { - dbdma_command_t * dmaHead; - - dmaHead = &m->rv_dma[m->rv_head]; - resid = dbdma_ld4_endian(&dmaHead->d_status_resid); - status = (resid >> 16); - bytes = resid & 0xffff; - bytes = ETHERNET_BUF_SIZE - bytes - 8; /* strip off FCS/CRC */ - - if ((status & DBDMA_ETHERNET_EOP) == 0) { - /* no packets are ready yet */ - break; - } - doContinue = 1; - /* if the packet is good, pass it up */ - if (bytes >= (ETHER_MIN_PACKET - 4)) { - char * dmaPacket; - dmaPacket = &m->rv_dma_area[m->rv_head * ETHERNET_BUF_SIZE]; - done = (*pktfunc)(dmaPacket, bytes, p); - } - /* mark the head as the new tail in the dma channel command list */ - DBDMA_BUILD(dmaHead, DBDMA_CMD_STOP, 0, 0, 0, RECEIVE_INT, - DBDMA_WAIT_NEVER, DBDMA_BRANCH_NEVER); - flush_cache_v((vm_offset_t)dmaHead, sizeof(*dmaHead)); - eieio(); - - /* make the tail an available dma'able entry */ - { - dbdma_command_t * dmaTail; - dmaTail = &m->rv_dma[m->rv_tail]; - address = KVTOPHYS((vm_offset_t) - &m->rv_dma_area[m->rv_tail*ETHERNET_BUF_SIZE]); - // this command is live so write it carefully - DBDMA_ST4_ENDIAN(&dmaTail->d_address, address); - dmaTail->d_status_resid = 0; - dmaTail->d_cmddep = 0; - eieio(); - DBDMA_ST4_ENDIAN(&dmaTail->d_cmd_count, - ((DBDMA_CMD_IN_LAST) << 28) | ((0) << 24) | - ((RECEIVE_INT) << 20) | - ((DBDMA_BRANCH_NEVER) << 18) | ((DBDMA_WAIT_NEVER) << 16) | - (ETHERNET_BUF_SIZE)); - eieio(); - flush_cache_v((vm_offset_t)dmaTail, sizeof(*dmaTail)); - } - /* head becomes the tail */ - m->rv_tail = m->rv_head; 
- - /* advance the head */ - m->rv_head++; - if (m->rv_head == (ETHER_RX_NUM_DBDMA_BUFS + 1)) - m->rv_head = 0; - } - if (doContinue) { - sync(); - dbdma_continue(DBDMA_ETHERNET_RV); - } - return (done); -} - -/* kdb handle buffer routines */ -struct kdbCopy { - int * len; - char * data; -}; - -static int -kdb_copy(char * pktBuf, int len, void * p) -{ - struct kdbCopy * cp = (struct kdbCopy *)p; - - bcopy(pktBuf, cp->data, len); - *cp->len = len; - return (1); /* signal that we're done */ -} - -/* kdb debugger routines */ -static void -polled_send_pkt(char * data, int len) -{ - waitForDBDMADone("mace: polled_send_pkt start"); - tx_dbdma(data, len); - waitForDBDMADone("mace: polled_send_pkt end"); - return; -} - -static void -polled_receive_pkt(char *data, int *len, int timeout_ms) -{ - struct kdbCopy cp; - - cp.len = len; - cp.data = data; - - timeout_ms *= 1000; - *len = 0; - while (mace_recv_pkt(kdb_copy, (void *)&cp) == 0) { - if (timeout_ms <= 0) - break; - delay(50); - timeout_ms -= 50; - } - return; -} - -/* Bump to force ethernet data to be 4-byte aligned - * (since the ethernet header is 14 bytes, and the 802.3 header is - * 22 = 14+8 bytes). This assumes that m_data is word-aligned - * (which it is). - */ -#define ETHER_DATA_ALIGN 2 - -/* - * Function: rxpkt - * - * Purpose: - * Called from within mace_recv_pkt to deal with a packet of data. - * rxpkt() allocates an mbuf(+cluser) and passes it up to the stacks. - * Returns: - * 0 if the packet was copied to an mbuf, 1 otherwise - */ -static int -rxpkt(char * data, int len, void * p) -{ - struct ether_header * eh_p = (struct ether_header *)data; - struct ifnet * ifp = &mace.en_arpcom.ac_if; - struct mbuf * m; - - int interesting; - - mace.rxintr++; - - /* mcast, bcast -- we're interested in either */ - interesting = eh_p->ether_dhost[0] & 1; - -#if NBPFILTER > 0 - /* - * Check if there's a bpf filter listening on this interface. - * If so, hand off the raw packet to bpf_tap(). 
- */ - if (ifp->if_bpf) { - BPF_TAP(ifp->if_bpf, data, len); - - /* - * Keep the packet if it's a broadcast or has our - * physical ethernet address (or if we support - * multicast and it's one). - */ - if ((interesting == 0) && bcmp(eh_p->ether_dhost, mace.macaddr, - sizeof(eh_p->ether_dhost)) != 0) { - return (1); - } - } -#endif - - /* - * We "know" a full-sized packet fits in one cluster. Set up the - * packet header, and if the length is sufficient, attempt to allocate - * a cluster. If that fails, fall back to the old way (m_devget()). - * Here, we take the simple approach of cluster vs. single mbuf. - */ - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == 0) { -#ifdef MACE_DEBUG - printf("mget failed\n"); -#endif - return (1); - } - - if (len > (MHLEN - ETHER_DATA_ALIGN)) - { MCLGET(m, M_DONTWAIT); - if (m->m_flags&M_EXT) /* MCLGET succeeded */ - { m->m_data += ETHER_DATA_ALIGN; - bcopy(data, mtod(m, caddr_t), (unsigned)len); - } else - { -#ifdef MACE_DEBUG - printf("no clusters\n"); -#endif - m_free(m); - m = (struct mbuf *)m_devget(data, len, 0, ifp, 0); - if (m == 0) - return (1); - } - } else - { m->m_data += ETHER_DATA_ALIGN; - bcopy(data, mtod(m, caddr_t), (unsigned)len); - } - - /* - * Current code up the line assumes that the media header's been - * stripped, but we'd like to preserve it, just in case someone - * wants to peek. 
- */ - m->m_pkthdr.len = len; - m->m_len = len; - m->m_pkthdr.rcvif = ifp; - m->m_data += sizeof(*eh_p); - m->m_len -= sizeof (*eh_p); - m->m_pkthdr.len -= sizeof(*eh_p); - ether_input(ifp, eh_p, m); - - return (0); -} - - -static void -rx_intr() -{ - mace_recv_pkt(rxpkt, 0); -} - -void -mace_dbdma_rx_intr(int unit, void *ignored, void * arp) -{ - if (!mace.ready) - return; - - thread_call_func((thread_call_func_t)rx_intr, 0, TRUE); -} - - -int -mace_ioctl(struct ifnet * ifp,u_long cmd, caddr_t data) -{ - struct arpcom * ar; - unsigned error = 0; - struct ifaddr * ifa = (struct ifaddr *)data; - struct ifreq * ifr = (struct ifreq *)data; - struct sockaddr_in * sin; - - sin = (struct sockaddr_in *)(&((struct ifreq *)data)->ifr_addr); - ar = (struct arpcom *)ifp; - - switch (cmd) { - case SIOCAUTOADDR: - error = in_bootp(ifp, sin, &mace.en_arpcom.ac_enaddr); - break; - - case SIOCSIFADDR: -#if NeXT - ifp->if_flags |= (IFF_UP | IFF_RUNNING); -#else - ifp->if_flags |= IFF_UP; -#endif - switch (ifa->ifa_addr->sa_family) { - case AF_INET: - /* - * See if another station has *our* IP address. - * i.e.: There is an address conflict! If a - * conflict exists, a message is sent to the - * console. - */ - if (IA_SIN(ifa)->sin_addr.s_addr != 0) { /* don't bother for 0.0.0.0 */ - ar->ac_ipaddr = IA_SIN(ifa)->sin_addr; - arpwhohas(ar, &IA_SIN(ifa)->sin_addr); - } - break; - default: - break; - } - break; - - case SIOCSIFFLAGS: - /* - * If interface is marked down and it is running, then stop it - */ - if ((ifp->if_flags & IFF_UP) == 0 && - (ifp->if_flags & IFF_RUNNING) != 0) { - /* - * If interface is marked down and it is running, then - * stop it. - */ - ifp->if_flags &= ~IFF_RUNNING; - } else if ((ifp->if_flags & IFF_UP) != 0 && - (ifp->if_flags & IFF_RUNNING) == 0) { - /* - * If interface is marked up and it is stopped, then - * start it. 
- */ - ifp->if_flags |= IFF_RUNNING; - } - - /* - * If the state of the promiscuous bit changes, the - * interface must be reset to effect the change. - */ - if (((ifp->if_flags ^ mace.promisc) & IFF_PROMISC) && - (ifp->if_flags & IFF_RUNNING)) { - mace.promisc = ifp->if_flags & IFF_PROMISC; - mace_sync_promisc(ifp); - } - - break; - - case SIOCADDMULTI: - if ((error = ether_addmulti(ifr, ar)) == ENETRESET) - { if ((error = mace_addmulti(ifr, ar)) != 0) - { error = 0; - mace_sync_mcast(ifp); - } - } - break; - - case SIOCDELMULTI: - { - struct ether_addr enaddr[2]; /* [0] - addrlo, [1] - addrhi */ - - if ((error = ether_delmulti(ifr, ar, enaddr)) == ENETRESET) { - if ((error = mace_delmulti(ifr, ar, enaddr)) != 0) { - error = 0; - mace_sync_mcast(ifp); - } - } - } - break; - - default: - error = EINVAL; - break; - } - return (error); -} - -void -mace_init() -{ - struct ifnet * ifp = &mace.en_arpcom.ac_if; - - /* - * Only use in-kernel driver for early debugging (bootargs: kdp=1|3) - */ - if ( (kdp_flag & 1) == 0 ) - { - return; - } - - mace.tx_busy = 0; - mace.txintr = 0; - mace.promisc = 0; - - bzero((caddr_t)ifp, sizeof(struct ifnet)); - bcopy(&mace.macaddr, &mace.en_arpcom.ac_enaddr, NUM_EN_ADDR_BYTES); - - ifp->if_name = "en"; - ifp->if_unit = 0; - ifp->if_private = 0; - ifp->if_ioctl = mace_ioctl; - ifp->if_start = mace_start; - ifp->if_flags = - IFF_BROADCAST | IFF_SIMPLEX | IFF_NOTRAILERS | IFF_MULTICAST; -#if NBPFILTER > 0 - bpfattach(&ifp->if_bpf, ifp, DLT_EN10MB, sizeof(struct ether_header)); -#endif - if_attach(ifp); - ether_ifattach(ifp); - - mace.rxintr = 0; - - /* wire in the interrupt routines */ - pmac_register_int(PMAC_DMA_ETHERNET_RX, SPLNET, - mace_dbdma_rx_intr, 0); - pmac_register_int(PMAC_DMA_ETHERNET_TX, SPLNET, - mace_dbdma_tx_intr, 0); - -// pmac_register_int(PMAC_DEV_ETHERNET, SPLNET, mace_pci_intr); - mace.ready = 1; -#ifdef MACE_DEBUG - timeout(mace_watchdog, 0, 10*hz); /* just in case we drop an interrupt */ -#endif - return; -} - -/* - 
* mace_pci_intr - * - * Service MACE interrupt - */ - -void -mace_pci_intr(int device, void *ssp) -{ - unsigned char ir, retry, frame, packet, length; - - ir = mace.ereg->ir; eieio(); /* Clear Interrupt */ - packet = mace.ereg->mpc; eieio(); - length = mace.ereg->rntpc; eieio(); - - printf("(txI)"); - - if (ir & IR_XMTINT) { - retry = mace.ereg->xmtrc; eieio(); /* Grab transmit retry count */ - frame = mace.ereg->xmtfs; eieio(); -// if (mace.ready) -// mace_dbdma_tx_intr(device, ssp); - } - return; -} - -static void -tx_intr() -{ - mace.txintr++; - mace.tx_busy = 0; - mace_service_queue(&mace.en_arpcom.ac_if); -} - -/* - * mace_dbdma_tx_intr - * - * DBDMA interrupt routine - */ -void -mace_dbdma_tx_intr(int unit, void *ignored, void * arg) -{ - if (!mace.ready) - return; - - thread_call_func((thread_call_func_t)tx_intr, 0, TRUE); - return; -} diff -urN xnu-344.49/bsd/if/ppc/if_en.h xnu-517/bsd/if/ppc/if_en.h --- xnu-344.49/bsd/if/ppc/if_en.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/if/ppc/if_en.h Thu Jan 1 01:00:00 1970 @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * MacOSX Mace driver - * Defines and device state - * Dieter Siegmund (dieter@next.com) Thu Feb 27 18:25:33 PST 1997 - * - ripped off code from MK/LINUX - */ - -#define PG_SIZE 0x1000UL -#define PG_MASK (PG_SIZE - 1UL) - -#define ETHERMTU 1500 -#define ETHER_RX_NUM_DBDMA_BUFS 32 -#define ETHERNET_BUF_SIZE (ETHERMTU + 36) -#define ETHER_MIN_PACKET 64 -#define TX_NUM_DBDMA 6 - -#define DBDMA_ETHERNET_EOP 0x40 - -typedef struct mace_s { - struct arpcom en_arpcom; - struct mace_board * ereg; /* ethernet register set address */ - unsigned char macaddr[NUM_EN_ADDR_BYTES]; /* mac address */ - int chip_id; - dbdma_command_t *rv_dma; - dbdma_command_t *tx_dma; - unsigned char *rv_dma_area; - unsigned char *tx_dma_area; - unsigned char multi_mask[8]; /* Multicast mask */ - unsigned char multi_use[64]; /* Per-mask-bit use count */ - int rv_tail; - int rv_head; - int tx_busy; - int txintr; - int rxintr; - int txwatchdog; - int ready; - int promisc; /* IFF_PROMISC state */ -} mace_t; - diff -urN xnu-344.49/bsd/if/ppc/mace.c xnu-517/bsd/if/ppc/mace.c --- xnu-344.49/bsd/if/ppc/mace.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/if/ppc/mace.c Thu Jan 1 01:00:00 1970 @@ -1,261 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * MACE Device-dependent code (some still lives in if_en.c): - * - * MACE Multicast Address scheme - - * Compute Enet CRC for each Mcast address; take high 6 bits of 32-bit - * crc, giving a "bit index" into a 64-bit register. On packet receipt, - * if corresponding bit is set, accept packet. - * We keep track of requests in a per-hash-value table (16-bit counters - * should be sufficient). Since we're hashing, we only care about the - * hash value of each address. - * - * Apple Confidential - * - * (C) COPYRIGHT Apple Computer, Inc., 1994-1997 - * All Rights Reserved - * - * Justin C. 
Walker - */ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "if_en.h" -#include "mace.h" - -extern mace_t mace; - -#define ENET_CRCPOLY 0x04c11db7 - -/* Real fast bit-reversal algorithm, 6-bit values */ -int reverse6[] = -{ 0x0,0x20,0x10,0x30,0x8,0x28,0x18,0x38, - 0x4,0x24,0x14,0x34,0xc,0x2c,0x1c,0x3c, - 0x2,0x22,0x12,0x32,0xa,0x2a,0x1a,0x3a, - 0x6,0x26,0x16,0x36,0xe,0x2e,0x1e,0x3e, - 0x1,0x21,0x11,0x31,0x9,0x29,0x19,0x39, - 0x5,0x25,0x15,0x35,0xd,0x2d,0x1d,0x3d, - 0x3,0x23,0x13,0x33,0xb,0x2b,0x1b,0x3b, - 0x7,0x27,0x17,0x37,0xf,0x2f,0x1f,0x3f -}; - -unsigned int crc416(current, nxtval) -register unsigned int current; -register unsigned short nxtval; -{ register unsigned int counter; - register int highCRCBitSet, lowDataBitSet; - - /* Swap bytes */ - nxtval = ((nxtval & 0x00FF) << 8) | (nxtval >> 8); - - /* Compute bit-by-bit */ - for (counter = 0; counter != 16; ++counter) - { /* is high CRC bit set? */ - if ((current & 0x80000000) == NULL) - highCRCBitSet = 0; - else - highCRCBitSet = 1; - - current = current << 1; - - if ((nxtval & 0x0001) == NULL) - lowDataBitSet = 0; - else - lowDataBitSet = 1; - - nxtval = nxtval >> 1; - - /* do the XOR */ - if (highCRCBitSet ^ lowDataBitSet) - current = current ^ ENET_CRCPOLY; - } - return current; -} - -unsigned int mace_crc(unsigned short *address) -{ register unsigned int newcrc; - - newcrc = crc416(0xffffffff, *address); /* address bits 47 - 32 */ - newcrc = crc416(newcrc, address[1]); /* address bits 31 - 16 */ - newcrc = crc416(newcrc, address[2]); /* address bits 15 - 0 */ - - return(newcrc); -} - -/* - * Add requested mcast addr to Mace's filter. Assume that the first - * address in the arpcom ac_multiaddrs list is the one we're interested in. 
- */ -int -mace_addmulti(register struct ifreq *ifr, register struct arpcom *ar) -{ register unsigned char *addr; - unsigned int crc; - unsigned char mask; - - addr = ar->ac_multiaddrs->enm_addrlo; - - crc = mace_crc((unsigned short *)addr)&0x3f; /* Big-endian alert! */ - crc = reverse6[crc]; /* Hyperfast bit-reversing algorithm */ - if (mace.multi_use[crc]++) - return(0); /* This bit is already set */ - mask = crc % 8; - mask = (unsigned char)1 << mask; - mace.multi_mask[crc/8] |= mask; - return(1); -} - -int -mace_delmulti(register struct ifreq *ifr, register struct arpcom *ar, - struct ether_addr * enaddr) -{ register unsigned char *addr; - unsigned int crc; - unsigned char mask; - - addr = (char *)enaddr; /* XXX assumes addrlo == addrhi */ - - /* Now, delete the address from the filter copy, as indicated */ - crc = mace_crc((unsigned short *)addr)&0x3f; /* Big-endian alert! */ - crc = reverse6[crc]; /* Hyperfast bit-reversing algorithm */ - if (mace.multi_use[crc] == 0) - return(EINVAL); /* That bit wasn't in use! */ - - if (--mace.multi_use[crc]) - return(0); /* That bit is still in use */ - - mask = crc % 8; - mask = ((unsigned char)1 << mask) ^ 0xff; /* To turn off bit */ - mace.multi_mask[crc/8] &= mask; - return(1); -} - -/* - * Sync the adapter with the software copy of the multicast mask - * (logical address filter). - * If we want all m-cast addresses, we just blast 1's into the filter. - * When we reverse this, we can use the current state of the (software) - * filter, which should have been kept up to date. - */ -void -mace_sync_mcast(register struct ifnet * ifp) -{ register unsigned long temp, temp1; - register int i; - register char *p; - register struct mace_board *ereg = mace.ereg; - - temp = ereg->maccc; - - /* - * Have to deal with early rev of chip for updating LAF - * Don't know if any MacOSX systems still run this rev. 
- */ - if (mace.chip_id == MACERevA2) - { /* First, turn off receiver */ - temp1 = temp&~MACCC_ENRCV; - ereg->maccc = temp1; - eieio(); - - /* Then, check FIFO - frame being received will complete */ - temp1 = ereg->fifofc; - - mace.ereg->iac = IAC_LOGADDR; - eieio(); - } else - { ereg->iac = IAC_ADDRCHG|IAC_LOGADDR; - eieio(); - - while (temp1 = ereg->iac) - { eieio(); - if ((temp1&IAC_ADDRCHG) == 0) - break; - } - } - - if (ifp->if_flags & IFF_ALLMULTI) /* Then want ALL m-cast pkts */ - { /* set mask to all 1's */ - for (i=0;i<8;i++) - { ereg->ladrf = 0xff; - eieio(); - } - } else - { - /* Assuming everything is big-endian */ - for (i=0, p = &mace.multi_mask[0];i<8;i++) - { ereg->ladrf = *p++; - eieio(); - } - } - - ereg->maccc = temp; /* Reset config ctrlr */ - eieio(); - -} - -void -mace_sync_promisc(register struct ifnet *ifp) -{ - register u_long o_maccc, n_maccc; - register struct mace_board *ereg = mace.ereg; - - /* - * Save current state and disable receive. - */ - o_maccc = ereg->maccc; - n_maccc = o_maccc & ~MACCC_ENRCV; - ereg->maccc = n_maccc; - eieio(); - - /* - * Calculate new desired state - */ - if (ifp->if_flags & IFF_PROMISC) { - /* set PROMISC bit */ - o_maccc |= MACCC_PROM; - } else { - /* clear PROMISC bit */ - o_maccc &= ~MACCC_PROM; - } - - /* - * Note that the "old" mode includes the new promiscuous state now. - */ - ereg->maccc = o_maccc; - eieio(); -} diff -urN xnu-344.49/bsd/if/ppc/mace.h xnu-517/bsd/if/ppc/mace.h --- xnu-344.49/bsd/if/ppc/mace.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/if/ppc/mace.h Thu Jan 1 01:00:00 1970 @@ -1,371 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* - * Copyright 1996 1995 by Open Software Foundation, Inc. 1997 1996 1995 1994 1993 1992 1991 - * All Rights Reserved - * - * Permission to use, copy, modify, and distribute this software and - * its documentation for any purpose and without fee is hereby granted, - * provided that the above copyright notice appears in all copies and - * that both the copyright notice and this permission notice appear in - * supporting documentation. - * - * OSF DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE. - * - * IN NO EVENT SHALL OSF BE LIABLE FOR ANY SPECIAL, INDIRECT, OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM - * LOSS OF USE, DATA OR PROFITS, WHETHER IN ACTION OF CONTRACT, - * NEGLIGENCE, OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION - * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - */ -/* - * Copyright 1996 1995 by Apple Computer, Inc. 
1997 1996 1995 1994 1993 1992 1991 - * All Rights Reserved - * - * Permission to use, copy, modify, and distribute this software and - * its documentation for any purpose and without fee is hereby granted, - * provided that the above copyright notice appears in all copies and - * that both the copyright notice and this permission notice appear in - * supporting documentation. - * - * APPLE COMPUTER DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE. - * - * IN NO EVENT SHALL APPLE COMPUTER BE LIABLE FOR ANY SPECIAL, INDIRECT, OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM - * LOSS OF USE, DATA OR PROFITS, WHETHER IN ACTION OF CONTRACT, - * NEGLIGENCE, OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION - * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ -/* - * MKLINUX-1.0DR2 - */ -/* - * PMach Operating System - * Copyright (c) 1995 Santa Clara University - * All Rights Reserved. - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - * File: if_3c501.h - * Author: Philippe Bernadat - * Date: 1989 - * Copyright (c) 1989 OSF Research Institute - * - * 3COM Etherlink 3C501 Mach Ethernet drvier - */ -/* - Copyright 1990 by Open Software Foundation, -Cambridge, MA. - - All Rights Reserved - - Permission to use, copy, modify, and distribute this software and -its documentation for any purpose and without fee is hereby granted, -provided that the above copyright notice appears in all copies and -that both the copyright notice and this permission notice appear in -supporting documentation, and that the name of OSF or Open Software -Foundation not be used in advertising or publicity pertaining to -distribution of the software without specific, written prior -permission. - - OSF DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE -INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, -IN NO EVENT SHALL OSF BE LIABLE FOR ANY SPECIAL, INDIRECT, OR -CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN ACTION OF CONTRACT, -NEGLIGENCE, OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION -WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
-*/ - -#ifdef KERNEL -#include -#endif - - -#define ENETPAD(n) char n[15] - -/* 0x50f0a000 */ -struct mace_board { - volatile unsigned char rcvfifo; /* 00 receive fifo */ - ENETPAD(epad0); - volatile unsigned char xmtfifo; /* 01 transmit fifo */ - ENETPAD(epad1); - volatile unsigned char xmtfc; /* 02 transmit frame control */ - ENETPAD(epad2); - volatile unsigned char xmtfs; /* 03 transmit frame status */ - ENETPAD(epad3); - volatile unsigned char xmtrc; /* 04 transmit retry count */ - ENETPAD(epad4); - volatile unsigned char rcvfc; /* 05 receive frame control -- 4 bytes */ - ENETPAD(epad5); - volatile unsigned char rcvfs; /* 06 receive frame status */ - ENETPAD(epad6); - volatile unsigned char fifofc; /* 07 fifo frame count */ - ENETPAD(epad7); - volatile unsigned char ir; /* 08 interrupt */ - ENETPAD(epad8); - volatile unsigned char imr; /* 09 interrupt mask */ - ENETPAD(epad9); - volatile unsigned char pr; /* 10 poll */ - ENETPAD(epad10); - volatile unsigned char biucc; /* 11 bus interface unit configuration control */ - ENETPAD(epad11); - volatile unsigned char fifocc; /* 12 fifo configuration control */ - ENETPAD(epad12); - volatile unsigned char maccc; /* 13 media access control configuration control */ - ENETPAD(epad13); - volatile unsigned char plscc; /* 14 physical layer signalling configuration control */ - ENETPAD(epad14); - volatile unsigned char phycc; /* 15 physical layer configuration control */ - ENETPAD(epad15); - volatile unsigned char chipid1; /* 16 chip identification LSB */ - ENETPAD(epad16); - volatile unsigned char chipid2; /* 17 chip identification MSB */ - ENETPAD(epad17); - volatile unsigned char iac; /* 18 internal address configuration */ - ENETPAD(epad18); - volatile unsigned char res1; /* 19 */ - ENETPAD(epad19); - volatile unsigned char ladrf; /* 20 logical address filter -- 8 bytes */ - ENETPAD(epad20); - volatile unsigned char padr; /* 21 physical address -- 6 bytes */ - ENETPAD(epad21); - volatile unsigned char res2; /* 22 */ - 
ENETPAD(epad22); - volatile unsigned char res3; /* 23 */ - ENETPAD(epad23); - volatile unsigned char mpc; /* 24 missed packet count */ - ENETPAD(epad24); - volatile unsigned char res4; /* 25 */ - ENETPAD(epad25); - volatile unsigned char rntpc; /* 26 runt packet count */ - ENETPAD(epad26); - volatile unsigned char rcvcc; /* 27 receive collision count */ - ENETPAD(epad27); - volatile unsigned char res5; /* 28 */ - ENETPAD(epad28); - volatile unsigned char utr; /* 29 user test */ - ENETPAD(epad29); - volatile unsigned char res6; /* 30 */ - ENETPAD(epad30); - volatile unsigned char res7; /* 31 */ - }; - -/* - * Chip Revisions.. - */ - -#define MACE_REVISION_B0 0x0940 -#define MACE_REVISION_A2 0x0941 - -/* xmtfc */ -#define XMTFC_DRTRY 0X80 -#define XMTFC_DXMTFCS 0x08 -#define XMTFC_APADXNT 0x01 - -/* xmtfs */ -#define XMTFS_XNTSV 0x80 -#define XMTFS_XMTFS 0x40 -#define XMTFS_LCOL 0x20 -#define XMTFS_MORE 0x10 -#define XMTFS_ONE 0x08 -#define XMTFS_DEFER 0x04 -#define XMTFS_LCAR 0x02 -#define XMTFS_RTRY 0x01 - -/* xmtrc */ -#define XMTRC_EXDEF 0x80 - -/* rcvfc */ -#define RCVFC_LLRCV 0x08 -#define RCVFC_M_R 0x04 -#define RCVFC_ASTRPRCV 0x01 - -/* rcvfs */ -#define RCVFS_OFLO 0x80 -#define RCVFS_CLSN 0x40 -#define RCVFS_FRAM 0x20 -#define RCVFS_FCS 0x10 -#define RCVFS_REVCNT 0x0f - -/* fifofc */ -#define FIFOCC_XFW_8 0x00 -#define FIFOCC_XFW_16 0x40 -#define FIFOCC_XFW_32 0x80 -#define FIFOCC_XFW_XX 0xc0 -#define FIFOCC_RFW_16 0x00 -#define FIFOCC_RFW_32 0x10 -#define FIFOCC_RFW_64 0x20 -#define FIFOCC_RFW_XX 0x30 -#define FIFOCC_XFWU 0x08 -#define FIFOCC_RFWU 0x04 -#define FIFOCC_XBRST 0x02 -#define FIFOCC_RBRST 0x01 - - -/* ir */ -#define IR_JAB 0x80 -#define IR_BABL 0x40 -#define IR_CERR 0x20 -#define IR_RCVCCO 0x10 -#define IR_RNTPCO 0x08 -#define IR_MPCO 0x04 -#define IR_RCVINT 0x02 -#define IR_XMTINT 0x01 - -/* imr */ -#define IMR_MJAB 0x80 -#define IMR_MBABL 0x40 -#define IMR_MCERR 0x20 -#define IMR_MRCVCCO 0x10 -#define IMR_MRNTPCO 0x08 -#define IMR_MMPCO 0x04 
-#define IMR_MRCVINT 0x02 -#define IMR_MXMTINT 0x01 - -/* pr */ -#define PR_XMTSV 0x80 -#define PR_TDTREQ 0x40 -#define PR_RDTREQ 0x20 - -/* biucc */ -#define BIUCC_BSWP 0x40 -#define BIUCC_XMTSP04 0x00 -#define BIUCC_XMTSP16 0x10 -#define BIUCC_XMTSP64 0x20 -#define BIUCC_XMTSP112 0x30 -#define BIUCC_SWRST 0x01 - -/* fifocc */ -#define FIFOCC_XMTFW08W 0x00 -#define FIFOCC_XMTFW16W 0x40 -#define FIFOCC_XMTFW32W 0x80 - -#define FIFOCC_RCVFW16 0x00 -#define FIFOCC_RCVFW32 0x10 -#define FIFOCC_RCVFW64 0x20 - -#define FIFOCC_XMTFWU 0x08 -#define FIFOCC_RCVFWU 0x04 -#define FIFOCC_XMTBRST 0x02 -#define FIFOCC_RCVBRST 0x01 - -/* maccc */ -#define MACCC_PROM 0x80 -#define MACCC_DXMT2PD 0x40 -#define MACCC_EMBA 0x20 -#define MACCC_DRCVPA 0x08 -#define MACCC_DRCVBC 0x04 -#define MACCC_ENXMT 0x02 -#define MACCC_ENRCV 0x01 - -/* plscc */ -#define PLSCC_XMTSEL 0x08 -#define PLSCC_AUI 0x00 -#define PLSCC_TENBASE 0x02 -#define PLSCC_DAI 0x04 -#define PLSCC_GPSI 0x06 -#define PLSCC_ENPLSIO 0x01 - -/* phycc */ -#define PHYCC_LNKFL 0x80 -#define PHYCC_DLNKTST 0x40 -#define PHYCC_REVPOL 0x20 -#define PHYCC_DAPC 0x10 -#define PHYCC_LRT 0x08 -#define PHYCC_ASEL 0x04 -#define PHYCC_RWAKE 0x02 -#define PHYCC_AWAKE 0x01 - -/* iac */ -#define IAC_ADDRCHG 0x80 -#define IAC_PHYADDR 0x04 -#define IAC_LOGADDR 0x02 - -/* utr */ -#define UTR_RTRE 0x80 -#define UTR_RTRD 0x40 -#define UTR_RPA 0x20 -#define UTR_FCOLL 0x10 -#define UTR_RCVFCSE 0x08 - -#define UTR_NOLOOP 0x00 -#define UTR_EXTLOOP 0x02 -#define UTR_INLOOP 0x04 -#define UTR_INLOOP_M 0x06 - -#define ENET_PHYADDR_LEN 6 -#define ENET_HEADER 14 - -#define BFRSIZ 2048 -#define ETHER_ADD_SIZE 6 /* size of a MAC address */ -#define DSF_LOCK 1 -#define DSF_RUNNING 2 -#define MOD_ENAL 1 -#define MOD_PROM 2 - -/* - * MACE Chip revision codes - */ -#define MACERevA2 0x0941 -#define MACERevB0 0x0940 - -#ifdef KERNEL -int mace_delmulti __P((register struct ifreq *, register struct arpcom *, - struct ether_addr *)); -int mace_addmulti __P((register 
struct ifreq *, register struct arpcom *)); -void mace_sync_mcast __P((register struct ifnet *)); -void mace_sync_promisc __P((register struct ifnet *)); -#endif /* KERNEL */ - diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_bmap.c xnu-517/bsd/isofs/cd9660/cd9660_bmap.c --- xnu-344.49/bsd/isofs/cd9660/cd9660_bmap.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_bmap.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -103,6 +103,17 @@ return (0); /* + * Associated files have an Apple Double header + */ + if ((ip->i_flag & ISO_ASSOCIATED) && (lblkno > (ADH_BLKS - 1))) { + lblkno -= ADH_BLKS; + *ap->a_bnp = (ip->iso_start + lblkno); + if (ap->a_runp) + *ap->a_runp = 0; + return (0); + } + + /* * Compute the requested block number */ bshift = ip->i_mnt->im_bshift; @@ -137,7 +148,7 @@ } */ *ap; { register struct iso_node *ip; - register struct iso_mnt *imp; + register struct iso_mnt *imp; if (ap->a_vp == NULL) return (EINVAL); @@ -185,6 +196,7 @@ struct iso_node *ip = VTOI(ap->a_vp); size_t cbytes; int devBlockSize = 0; + off_t offset = ap->a_foffset; /* * Check for underlying vnode requests and ensure that logical @@ -195,15 +207,29 @@ VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - *ap->a_bpn = (daddr_t)(ip->iso_start + lblkno(ip->i_mnt, ap->a_foffset)); + /* + * Associated files have an Apple Double header + */ + if (ip->i_flag & ISO_ASSOCIATED) { + if (offset < ADH_SIZE) { + if (ap->a_run) + *ap->a_run = 0; + *ap->a_bpn = -1; + goto out; + } else { + offset -= ADH_SIZE; + } + } + + *ap->a_bpn = (daddr_t)(ip->iso_start + lblkno(ip->i_mnt, offset)); /* * Determine maximum number of contiguous bytes following the * requested offset. 
*/ if (ap->a_run) { - if (ip->i_size > ap->a_foffset) - cbytes = ip->i_size - ap->a_foffset; + if (ip->i_size > offset) + cbytes = ip->i_size - offset; else cbytes = 0; @@ -211,9 +237,9 @@ *ap->a_run = MIN(cbytes, ap->a_size); }; - +out: if (ap->a_poff) - *(int *)ap->a_poff = (long)ap->a_foffset & (devBlockSize - 1); + *(int *)ap->a_poff = (long)offset & (devBlockSize - 1); return (0); } diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_lookup.c xnu-517/bsd/isofs/cd9660/cd9660_lookup.c --- xnu-344.49/bsd/isofs/cd9660/cd9660_lookup.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_lookup.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -64,12 +64,7 @@ * from: @(#)ufs_lookup.c 7.33 (Berkeley) 5/19/91 * * @(#)cd9660_lookup.c 8.5 (Berkeley) 12/5/94 - - - - * HISTORY - * 22-Jan-98 radar 1669467 - ISO 9660 CD support - jwc - + * */ #include @@ -79,7 +74,6 @@ #include #include #include -#include #include #include @@ -137,7 +131,7 @@ struct buf *bp; /* a buffer of directory entries */ struct iso_directory_record *ep = NULL;/* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ - int saveoffset = 0; /* offset of last directory entry in dir */ + int saveoffset = 0; /* offset of last directory entry in dir */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ struct vnode *pdp; /* saved dp during symlink work */ @@ -145,23 +139,22 @@ u_long bmask; /* block offset mask */ int lockparent; /* 1 => lockparent flag is set */ int wantparent; /* 1 => wantparent or lockparent flag */ - int wantrsrc; /* 1 => looking for resource fork */ + int wantassoc; int error; ino_t ino = 0; int reclen; u_short namelen; + int isoflags; char altname[ISO_RRIP_NAMEMAX]; int res; int len; char *name; struct vnode **vpp = 
ap->a_vpp; struct componentname *cnp = ap->a_cnp; - struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; struct proc *p = cnp->cn_proc; int devBlockSize=0; - long rsrcsize; size_t altlen; bp = NULL; @@ -171,45 +164,29 @@ imp = dp->i_mnt; lockparent = flags & LOCKPARENT; wantparent = flags & (LOCKPARENT|WANTPARENT); - wantrsrc = 0; + wantassoc = 0; + /* * Check accessiblity of directory. */ if (vdp->v_type != VDIR) return (ENOTDIR); - if ( (error = VOP_ACCESS(vdp, VEXEC, cred, p)) ) + if ( (error = VOP_ACCESS(vdp, VEXEC, cnp->cn_cred, p)) ) return (error); /* - * Determine if we're looking for a resource fork - * note: this could cause a read off the end of the - * component name buffer in some rare cases. - */ - if ((flags & ISLASTCN) == 0 && - bcmp(&cnp->cn_nameptr[cnp->cn_namelen], - _PATH_RSRCFORKSPEC, sizeof(_PATH_RSRCFORKSPEC) - 1) == 0) { - flags |= ISLASTCN; - cnp->cn_consume = sizeof(_PATH_RSRCFORKSPEC) - 1; - wantrsrc = 1; - } - /* * We now have a segment name to search for, and a directory to search. * * Before tediously performing a linear scan of the directory, * check the name cache to see if the directory/name pair * we are looking for is known already. - * Note: resource forks are never in the name cache */ - if ((error = cache_lookup(vdp, vpp, cnp)) && !wantrsrc) { + if ((error = cache_lookup(vdp, vpp, cnp))) { int vpid; /* capability number of vnode */ if (error == ENOENT) return (error); -#ifdef PARANOID - if ((vdp->v_flag & VROOT) && (flags & ISDOTDOT)) - panic("cd9660_lookup: .. through root"); -#endif /* * Get the next vnode in the path. 
* See comment below starting `Step through' for @@ -253,8 +230,15 @@ len = cnp->cn_namelen; name = cnp->cn_nameptr; altname[0] = '\0'; - rsrcsize = 0; - + /* + * A "._" prefix means, we are looking for an associated file + */ + if (imp->iso_ftype != ISO_FTYPE_RRIP && + *name == ASSOCCHAR1 && *(name+1) == ASSOCCHAR2) { + wantassoc = 1; + len -= 2; + name += 2; + } /* * Decode search name into UCS-2 (Unicode) */ @@ -281,7 +265,7 @@ * profiling time and hence has been removed in the interest * of simplicity. */ - bmask = imp->im_bmask; + bmask = imp->im_sector_size - 1; if (nameiop != LOOKUP || dp->i_diroff == 0 || dp->i_diroff > dp->i_size) { entryoffsetinblock = 0; @@ -291,7 +275,7 @@ dp->i_offset = dp->i_diroff; if ((entryoffsetinblock = dp->i_offset & bmask) && - (error = VOP_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp))) + (error = VOP_BLKATOFF(vdp, SECTOFF(imp, dp->i_offset), NULL, &bp))) return (error); numdirpasses = 2; iso_nchstats.ncs_2passes++; @@ -308,7 +292,7 @@ if ((dp->i_offset & bmask) == 0) { if (bp != NULL) brelse(bp); - if ( (error = VOP_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp)) ) + if ( (error = VOP_BLKATOFF(vdp, SECTOFF(imp,dp->i_offset), NULL, &bp)) ) return (error); entryoffsetinblock = 0; } @@ -322,38 +306,29 @@ if (reclen == 0) { /* skip to next block, if any */ dp->i_offset = - (dp->i_offset & ~bmask) + imp->logical_block_size; + (dp->i_offset & ~bmask) + imp->im_sector_size; continue; } - if (reclen < ISO_DIRECTORY_RECORD_SIZE) + if (reclen < ISO_DIRECTORY_RECORD_SIZE) { /* illegal entry, stop */ break; - - if (entryoffsetinblock + reclen > imp->logical_block_size) - /* entries are not allowed to cross boundaries */ + } + if (entryoffsetinblock + reclen > imp->im_sector_size) { + /* entries are not allowed to cross sector boundaries */ break; - + } namelen = isonum_711(ep->name_len); + isoflags = isonum_711(ep->flags); if (reclen < ISO_DIRECTORY_RECORD_SIZE + namelen) /* illegal entry, stop */ break; - - /* remember the size of 
resource forks (associated files) */ - if ((isonum_711(ep->flags) & (directoryBit | associatedBit)) == associatedBit) { - if (namelen < sizeof(altname) && ino == 0) { - rsrcsize = isonum_733(ep->size); - bcopy(ep->name, altname, namelen); - altname[namelen] = '\0'; - altlen = namelen; - } - } /* * Check for a name match. */ if (imp->iso_ftype == ISO_FTYPE_RRIP) { - if ( isonum_711(ep->flags) & directoryBit ) + if (isoflags & directoryBit) ino = isodirino(ep, imp); else ino = (bp->b_blkno << imp->im_bshift) + entryoffsetinblock; @@ -364,7 +339,7 @@ goto found; ino = 0; } else { - if ((!(isonum_711(ep->flags) & associatedBit)) == !wantrsrc) { + if ((!(isoflags & associatedBit)) == !wantassoc) { if ((len == 1 && *name == '.') || (flags & ISDOTDOT)) { @@ -382,14 +357,14 @@ goto notfound; } else if (imp->iso_ftype != ISO_FTYPE_JOLIET && !(res = isofncmp(name,len, ep->name,namelen))) { - if ( isonum_711(ep->flags) & directoryBit ) + if ( isoflags & directoryBit ) ino = isodirino(ep, imp); else ino = (bp->b_blkno << imp->im_bshift) + entryoffsetinblock; saveoffset = dp->i_offset; } else if (imp->iso_ftype == ISO_FTYPE_JOLIET && !(res = ucsfncmp((u_int16_t*)name, len, (u_int16_t*) ep->name, namelen))) { - if ( isonum_711(ep->flags) & directoryBit ) + if ( isoflags & directoryBit ) ino = isodirino(ep, imp); else ino = (bp->b_blkno << imp->im_bshift) + entryoffsetinblock; @@ -416,7 +391,7 @@ lblkno(imp, saveoffset)) { if (bp != NULL) brelse(bp); - if ( (error = VOP_BLKATOFF(vdp, (off_t)saveoffset, NULL, &bp)) ) + if ( (error = VOP_BLKATOFF(vdp, SECTOFF(imp, saveoffset), NULL, &bp)) ) return (error); } entryoffsetinblock = saveoffset & bmask; @@ -443,7 +418,7 @@ /* * Insert name into cache (as non-existent) if appropriate. 
*/ - if ((cnp->cn_flags & MAKEENTRY) && !wantrsrc) + if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); if (nameiop == CREATE || nameiop == RENAME) { /* @@ -452,11 +427,7 @@ */ return (EROFS); } - - if (wantrsrc) - return (ENOTDIR); - else - return (ENOENT); + return (ENOENT); found: if (numdirpasses == 2) @@ -519,10 +490,6 @@ dp->i_ino != ino, ep, p); /* save parent inode number */ VTOI(tdp)->i_parent = VTOI(pdp)->i_number; - if (!wantrsrc && (tdp->v_type == VREG) && (rsrcsize > 0)) { - if (bcmp(ep->name, altname, altlen) == 0) - VTOI(tdp)->i_rsrcsize = rsrcsize; - } brelse(bp); if (error) return (error); @@ -534,7 +501,7 @@ /* * Insert name into cache if appropriate. */ - if ((cnp->cn_flags & MAKEENTRY) && !wantrsrc) + if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); @@ -565,7 +532,11 @@ imp = ip->i_mnt; lbn = lblkno(imp, ap->a_offset); bsize = blksize(imp, ip, lbn); - + if ((bsize != imp->im_sector_size) && + (ap->a_offset & (imp->im_sector_size - 1)) == 0) { + bsize = imp->im_sector_size; + } + if ( (error = bread(ap->a_vp, lbn, bsize, NOCRED, &bp)) ) { brelse(bp); *ap->a_bpp = NULL; diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_mount.h xnu-517/bsd/isofs/cd9660/cd9660_mount.h --- xnu-344.49/bsd/isofs/cd9660/cd9660_mount.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_mount.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -71,16 +71,20 @@ /* * Arguments to mount ISO 9660 filesystems. 
*/ +struct CDTOC; struct iso_args { char *fspec; /* block special device to mount */ struct export_args export; /* network export info */ int flags; /* mounting flags, see below */ int ssector; /* starting sector, 0 for 1st session */ + int toc_length; /* Size of *toc, including the toc.length field */ + struct CDTOC *toc; }; #define ISOFSMNT_NORRIP 0x00000001 /* disable Rock Ridge Ext.*/ #define ISOFSMNT_GENS 0x00000002 /* enable generation numbers */ #define ISOFSMNT_EXTATT 0x00000004 /* enable extended attributes */ #define ISOFSMNT_NOJOLIET 0x00000008 /* disable Joliet Ext.*/ +#define ISOFSMNT_TOC 0x00000010 /* iso_args.toc is valid */ #endif /* __APPLE_API_UNSTABLE */ #endif /* __ISOFS_CD9660_CD9660_MOUNT_H__ */ diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_node.c xnu-517/bsd/isofs/cd9660/cd9660_node.c --- xnu-344.49/bsd/isofs/cd9660/cd9660_node.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_node.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -82,6 +82,7 @@ #include #include #include +#include #include #include @@ -101,11 +102,9 @@ #define DNOHASH(device, inum) (((device) + ((inum)>>12)) & idvhash) #endif -/* defined in bsd/ufs/ufs/ufs_inode.c */ +/* defined in bsd/vfs/vfs_subr.c */ extern int prtactive; /* 1 => print out reclaim of active vnodes */ -extern void cache_purge (struct vnode *vp); - extern u_char isonullname[]; /* * Initialize hash links for inodes and dnodes. 
@@ -315,6 +314,8 @@ } if (ip->i_namep != isonullname) FREE(ip->i_namep, M_TEMP); + if (ip->i_riff != NULL) + FREE(ip->i_riff, M_TEMP); FREE_ZONE(vp->v_data, sizeof(struct iso_node), M_ISOFSNODE); vp->v_data = NULL; return (0); diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_node.h xnu-517/bsd/isofs/cd9660/cd9660_node.h --- xnu-344.49/bsd/isofs/cd9660/cd9660_node.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_node.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -114,6 +114,7 @@ struct iso_node *i_next, **i_prev; /* hash chain */ struct vnode *i_vnode; /* vnode associated with this inode */ struct vnode *i_devvp; /* vnode for block I/O */ + u_int32_t i_flag; /* flags, see below */ dev_t i_dev; /* device where inode resides */ ino_t i_number; /* the identity of the inode */ /* we use the actual starting block of the file */ @@ -140,11 +141,15 @@ u_int16_t i_FinderFlags; /* MacOS finder flags */ u_int16_t i_entries; /* count of directory entries */ + + struct riff_header *i_riff; }; #define i_forw i_chain[0] #define i_back i_chain[1] +/* These flags are kept in i_flag. */ +#define ISO_ASSOCIATED 0x0001 /* node is an associated file. 
*/ /* defines VTOI and ITOV macros */ #undef VTOI @@ -162,13 +167,13 @@ int cd9660_access __P((struct vop_access_args *)); int cd9660_getattr __P((struct vop_getattr_args *)); int cd9660_read __P((struct vop_read_args *)); +int cd9660_xa_read __P((struct vop_read_args *)); int cd9660_ioctl __P((struct vop_ioctl_args *)); int cd9660_select __P((struct vop_select_args *)); int cd9660_mmap __P((struct vop_mmap_args *)); int cd9660_seek __P((struct vop_seek_args *)); int cd9660_readdir __P((struct vop_readdir_args *)); int cd9660_readlink __P((struct vop_readlink_args *)); -int cd9660_abortop __P((struct vop_abortop_args *)); int cd9660_inactive __P((struct vop_inactive_args *)); int cd9660_reclaim __P((struct vop_reclaim_args *)); int cd9660_bmap __P((struct vop_bmap_args *)); diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_rrip.c xnu-517/bsd/isofs/cd9660/cd9660_rrip.c --- xnu-344.49/bsd/isofs/cd9660/cd9660_rrip.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_rrip.c Sat Oct 25 00:25:25 2003 @@ -300,7 +300,8 @@ switch (*isodir->name) { default: isofntrans(isodir->name, isonum_711(isodir->name_len), - ana->outbuf, ana->outlen, 1); + ana->outbuf, ana->outlen, 1, + isonum_711(isodir->flags) & associatedBit); break; case 0: *ana->outlen = 1; diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_util.c xnu-517/bsd/isofs/cd9660/cd9660_util.c --- xnu-344.49/bsd/isofs/cd9660/cd9660_util.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_util.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -219,14 +219,24 @@ * translate a filename */ void -isofntrans(infn, infnlen, outfn, outfnlen, original) +isofntrans(infn, infnlen, outfn, outfnlen, original, assoc) u_char *infn, *outfn; int infnlen; u_short *outfnlen; int original; + int assoc; { int fnidx = 0; + /* + * Add a "._" prefix for associated files + */ + if (assoc) { + *outfn++ = ASSOCCHAR1; + *outfn++ = ASSOCCHAR2; + fnidx += 2; + infnlen +=2; + } for (; fnidx < infnlen; fnidx++) { char c = *infn++; @@ -259,12 +269,13 @@ * translate a UCS-2 filename to UTF-8 */ void -ucsfntrans(infn, infnlen, outfn, outfnlen, dir) +ucsfntrans(infn, infnlen, outfn, outfnlen, dir, assoc) u_int16_t *infn; int infnlen; u_char *outfn; u_short *outfnlen; int dir; + int assoc; { if (infnlen == 1) { strcpy(outfn, ".."); @@ -281,6 +292,13 @@ fnidx = infnlen/2; flags = 0; + /* + * Add a "._" prefix for associated files + */ + if (assoc) { + *outfn++ = ASSOCCHAR1; + *outfn++ = ASSOCCHAR2; + } if (!dir) { /* strip file version number */ for (fnidx--; fnidx > 0; fnidx--) { @@ -301,7 +319,7 @@ flags |= UTF_REVERSE_ENDIAN; (void) utf8_encodestr(infn, fnidx * 2, outfn, &outbytes, ISO_JOLIET_NAMEMAX, 0, flags); - *outfnlen = outbytes; + *outfnlen = assoc ? 
outbytes + 2 : outbytes; } } @@ -317,6 +335,7 @@ { struct iso_node *dp; struct buf *bp = NULL; + struct iso_mnt *imp; struct iso_directory_record *ep; u_long bmask; int error = 0; @@ -327,8 +346,9 @@ long diroffset; dp = VTOI(vdp); - bmask = dp->i_mnt->im_bmask; - logblksize = dp->i_mnt->logical_block_size; + imp = dp->i_mnt; + bmask = imp->im_sector_size - 1; + logblksize = imp->im_sector_size; blkoffset = diroffset = 0; dirs = files = 0; @@ -340,7 +360,7 @@ if ((diroffset & bmask) == 0) { if (bp != NULL) brelse(bp); - if ( (error = VOP_BLKATOFF(vdp, diroffset, NULL, &bp)) ) + if ( (error = VOP_BLKATOFF(vdp, SECTOFF(imp, diroffset), NULL, &bp)) ) break; blkoffset = 0; } @@ -363,6 +383,15 @@ break; } + /* + * Some poorly mastered discs have an incorrect directory + * file size. If the '.' entry has a better size (bigger) + * then use that instead. + */ + if ((diroffset == 0) && (isonum_733(ep->size) > dp->i_size)) { + dp->i_size = isonum_733(ep->size); + } + if ( isonum_711(ep->flags) & directoryBit ) dirs++; else if ((isonum_711(ep->flags) & associatedBit) == 0) @@ -666,16 +695,42 @@ }; if (a & ATTR_VOL_ENCODINGSUSED) *((unsigned long long *)attrbufptr)++ = (unsigned long long)0; if (a & ATTR_VOL_CAPABILITIES) { - ((vol_capabilities_attr_t *)attrbufptr)->capabilities[VOL_CAPABILITIES_FORMAT] = VOL_CAP_FMT_PERSISTENTOBJECTIDS; + ((vol_capabilities_attr_t *)attrbufptr)->capabilities[VOL_CAPABILITIES_FORMAT] = + (imp->iso_ftype == ISO_FTYPE_RRIP ? VOL_CAP_FMT_SYMBOLICLINKS : 0) | + (imp->iso_ftype == ISO_FTYPE_RRIP ? VOL_CAP_FMT_HARDLINKS : 0) | + (imp->iso_ftype == ISO_FTYPE_RRIP || imp->iso_ftype == ISO_FTYPE_JOLIET + ? 
VOL_CAP_FMT_CASE_SENSITIVE : 0) | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS; ((vol_capabilities_attr_t *)attrbufptr)->capabilities[VOL_CAPABILITIES_INTERFACES] = - VOL_CAP_INT_ATTRLIST | VOL_CAP_INT_NFSEXPORT; + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT; ((vol_capabilities_attr_t *)attrbufptr)->capabilities[VOL_CAPABILITIES_RESERVED1] = 0; ((vol_capabilities_attr_t *)attrbufptr)->capabilities[VOL_CAPABILITIES_RESERVED2] = 0; ((vol_capabilities_attr_t *)attrbufptr)->valid[VOL_CAPABILITIES_FORMAT] = - VOL_CAP_FMT_PERSISTENTOBJECTIDS | VOL_CAP_FMT_SYMBOLICLINKS | VOL_CAP_FMT_HARDLINKS; + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS; ((vol_capabilities_attr_t *)attrbufptr)->valid[VOL_CAPABILITIES_INTERFACES] = - VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_ATTRLIST | VOL_CAP_INT_NFSEXPORT; + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK; ((vol_capabilities_attr_t *)attrbufptr)->valid[VOL_CAPABILITIES_RESERVED1] = 0; ((vol_capabilities_attr_t *)attrbufptr)->valid[VOL_CAPABILITIES_RESERVED2] = 0; diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_vfsops.c xnu-517/bsd/isofs/cd9660/cd9660_vfsops.c --- xnu-344.49/bsd/isofs/cd9660/cd9660_vfsops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_vfsops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -75,7 +75,7 @@ #include #include #include -#include +#include #include #include #include @@ -88,6 +88,38 @@ #include #include +/* + * Minutes, Seconds, Frames (M:S:F) + */ +struct CDMSF { + u_char minute; + u_char second; + u_char frame; +}; + +/* + * Table Of Contents + */ +struct CDTOC_Desc { + u_char session; + u_char ctrl_adr; /* typed to be machine and compiler independent */ + u_char tno; + u_char point; + struct CDMSF address; + u_char zero; + struct CDMSF p; +}; + +struct CDTOC { + u_short length; /* in native cpu endian */ + u_char first_session; + u_char last_session; + struct CDTOC_Desc trackdesc[1]; +}; + +#define MSF_TO_LBA(msf) \ + (((((msf).minute * 60UL) + (msf).second) * 75UL) + (msf).frame - 150) + u_char isonullname[] = "\0"; extern int enodev (); @@ -162,8 +194,14 @@ LIST_INIT(&mp->mnt_vnodelist); args.flags = ISOFSMNT_ROOT; args.ssector = 0; + args.fspec = 0; + args.toc_length = 0; + args.toc = 0; if ((error = iso_mountfs(rootvp, mp, p, &args))) { vrele(rootvp); /* release the reference from bdevvp() */ + + if (mp->mnt_kern_flag & MNTK_IO_XINFO) + FREE(mp->mnt_xinfo_ptr, M_TEMP); FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); return (error); } @@ -246,8 +284,8 @@ return (error); } - /* Set the mount flag to indicate that we support volfs */ - mp->mnt_flag |= MNT_DOVOLFS; + /* Indicate that we don't support volfs */ + mp->mnt_flag &= ~MNT_DOVOLFS; (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); @@ -258,6 +296,119 @@ } /* + * Find the BSD device for the physical disk corresponding to the + * mount point's device. We use this physical device to read whole + * (2352 byte) sectors from the CD to get the content for the video + * files (tracks). + * + * The "path" argument is the path to the block device that the volume + * is being mounted on (args.fspec). 
It should be of the form: + * /dev/disk1s0 + * where the last "s0" part is stripped off to determine the physical + * device's path. It is assumed to be in user memory. + */ +static struct vnode * +cd9660_phys_device(char *path, struct proc *p) +{ + int err; + char *whole_path = NULL; // path to "whole" device + char *s, *saved; + struct nameidata nd; + struct vnode *result; + size_t actual_size; + + if (path == NULL) + return NULL; + + result = NULL; + + /* Make a copy of the mount from name, then remove trailing "s...". */ + MALLOC(whole_path, char *, MNAMELEN, M_ISOFSMNT, M_WAITOK); + copyinstr(path, whole_path, MNAMELEN-1, &actual_size); + + /* + * I would use strrchr or rindex here, but those are declared __private_extern__, + * and can't be used across component boundaries at this time. + */ + for (s=whole_path, saved=NULL; *s; ++s) + if (*s == 's') + saved = s; + *saved = '\0'; + + /* Lookup the "whole" device. */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, whole_path, p); + err = namei(&nd); + if (err) { + printf("isofs: Cannot find physical device: %s\n", whole_path); + goto done; + } + + /* Open the "whole" device. */ + err = VOP_OPEN(nd.ni_vp, FREAD, FSCRED, p); + if (err) { + vrele(nd.ni_vp); + printf("isofs: Cannot open physical device: %s\n", whole_path); + goto done; + } + + result = nd.ni_vp; + +done: + FREE(whole_path, M_ISOFSMNT); + return result; +} + + +/* + * See if the given CD-ROM XA disc appears to be a Video CD + * (version < 2.0; so, not SVCD). If so, fill in the extent + * information for the MPEGAV directory, set the VCD flag, + * and return true. 
+ */ +static int +cd9660_find_video_dir(struct iso_mnt *isomp) +{ + int result, err; + struct vnode *rootvp = NULL; + struct vnode *videovp = NULL; + struct componentname cn; + char dirname[] = "MPEGAV"; + + result = 0; /* Assume not a video CD */ + + err = cd9660_root(isomp->im_mountp, &rootvp); + if (err) { + printf("cd9660_find_video_dir: cd9660_root failed (%d)\n", err); + return 0; /* couldn't find video dir */ + } + + cn.cn_nameiop = LOOKUP; + cn.cn_flags = LOCKPARENT|ISLASTCN; + cn.cn_proc = current_proc(); + cn.cn_cred = cn.cn_proc->p_ucred; + cn.cn_pnbuf = dirname; + cn.cn_pnlen = sizeof(dirname)-1; + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_namelen = cn.cn_pnlen; + + err = VOP_LOOKUP(rootvp, &videovp, &cn); + if (err == 0) { + struct iso_node *ip = VTOI(videovp); + result = 1; /* Looks like video CD */ + isomp->video_dir_start = ip->iso_start; + isomp->video_dir_end = ip->iso_start + (ip->i_size >> isomp->im_bshift); + isomp->im_flags2 |= IMF2_IS_VCD; + } + + if (videovp != NULL) + vput(videovp); + if (rootvp != NULL) + vput(rootvp); + + return result; +} + +/* * Common code for mount and mountroot */ static int @@ -336,6 +487,16 @@ printf("cd9660_vfsops.c: iso_mountfs: " "Invalid ID in volume desciptor.\n"); #endif + /* There should be a primary volume descriptor followed by any + * secondary volume descriptors, then an end volume descriptor. + * Some discs are mastered without an end volume descriptor or + * they have the type field set and the volume descriptor ID is + * not set. If we at least found a primary volume descriptor, + * mount the disc. 
+ */ + if (pri != NULL) + break; + error = EINVAL; goto out; } @@ -405,6 +566,7 @@ MALLOC(isomp, struct iso_mnt *, sizeof *isomp, M_ISOFSMNT, M_WAITOK); bzero((caddr_t)isomp, sizeof *isomp); + isomp->im_sector_size = ISO_DEFAULT_BLOCK_SIZE; isomp->logical_block_size = logical_block_size; isomp->volume_space_size = isonum_733 (pri->volume_space_size); /* @@ -444,8 +606,9 @@ /* See if this is a CD-XA volume */ if (bcmp( pri->CDXASignature, ISO_XA_ID, - sizeof(pri->CDXASignature) ) == 0 ) + sizeof(pri->CDXASignature) ) == 0 ) { isomp->im_flags2 |= IMF2_IS_CDXA; + } isomp->im_bmask = logical_block_size - 1; isomp->im_bshift = 0; @@ -467,6 +630,20 @@ isomp->im_devvp = devvp; devvp->v_specflags |= SI_MOUNTEDON; + + /* + * If the logical block size is not 2K then we must + * set the block device's physical block size to this + * disc's logical block size. + * + */ + if (logical_block_size != iso_bsize) { + iso_bsize = logical_block_size; + if ((error = VOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, + (caddr_t)&iso_bsize, FWRITE, p->p_ucred, p))) + goto out; + devvp->v_specsize = iso_bsize; + } /* Check the Rock Ridge Extention support */ if (!(argp->flags & ISOFSMNT_NORRIP)) { @@ -523,13 +700,13 @@ /* * On Joliet CDs use the UCS-2 volume identifier. * - * This name can have up to 15 UCS-2 chars and is - * terminated with 0x0000 or padded with 0x0020. + * This name can have up to 16 UCS-2 chars. 
*/ convflags = UTF_DECOMPOSED; if (BYTE_ORDER != BIG_ENDIAN) convflags |= UTF_REVERSE_ENDIAN; - for (i = 0, uchp = (u_int16_t *)sup->volume_id; i < 15 && uchp[i]; ++i); + uchp = (u_int16_t *)sup->volume_id; + for (i = 0; i < 16 && uchp[i]; ++i); if ((utf8_encodestr((u_int16_t *)sup->volume_id, (i * 2), vol_id, &convbytes, sizeof(vol_id), 0, convflags) == 0) && convbytes && (vol_id[0] != ' ')) { @@ -539,7 +716,7 @@ strp = vol_id + convbytes - 1; while (strp > vol_id && *strp == ' ') *strp-- = '\0'; - bcopy(vol_id, isomp->volume_id, convbytes); + bcopy(vol_id, isomp->volume_id, convbytes + 1); } rootp = (struct iso_directory_record *) @@ -556,6 +733,19 @@ supbp = NULL; } + /* If there was a TOC in the arguments, copy it in. */ + if (argp->flags & ISOFSMNT_TOC) { + MALLOC(isomp->toc, struct CDTOC *, argp->toc_length, M_ISOFSMNT, M_WAITOK); + if ((error = copyin(argp->toc, isomp->toc, argp->toc_length))) + goto out; + } + + /* See if this could be a Video CD */ + if ((isomp->im_flags2 & IMF2_IS_CDXA) && cd9660_find_video_dir(isomp)) { + /* Get the 2352-bytes-per-block device. */ + isomp->phys_devvp = cd9660_phys_device(argp->fspec, p); + } + return (0); out: if (bp) @@ -567,6 +757,8 @@ if (needclose) (void)VOP_CLOSE(devvp, FREAD, NOCRED, p); if (isomp) { + if (isomp->toc) + FREE((caddr_t)isomp->toc, M_ISOFSMNT); FREE((caddr_t)isomp, M_ISOFSMNT); mp->mnt_data = (qaddr_t)0; } @@ -630,6 +822,17 @@ return(error); vrele(isomp->im_devvp); + + if (isomp->phys_devvp) { + error = VOP_CLOSE(isomp->phys_devvp, FREAD, FSCRED, p); + if (error && !force) + return error; + vrele(isomp->phys_devvp); + } + + if (isomp->toc) + FREE((caddr_t)isomp->toc, M_ISOFSMNT); + FREE((caddr_t)isomp, M_ISOFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; @@ -767,7 +970,7 @@ * Get the export permission structure for this tuple. 
*/ np = vfs_export_lookup(mp, &imp->im_export, nam); - if (np == NULL) + if (nam && (np == NULL)) return (EACCES); if ( (error = VFS_VGET(mp, &ifhp->ifid_ino, &nvp)) ) { @@ -781,11 +984,99 @@ return (ESTALE); } *vpp = nvp; - *exflagsp = np->netc_exflags; - *credanonp = &np->netc_anon; + if (np) { + *exflagsp = np->netc_exflags; + *credanonp = &np->netc_anon; + } return (0); } +/* + * Scan the TOC for the track which contains the given sector. + * + * If there is no matching track, or no TOC, then return -1. + */ +static int +cd9660_track_for_sector(struct CDTOC *toc, u_int sector) +{ + int i, tracks, result; + + if (toc == NULL) + return -1; + + tracks = toc->length / sizeof(struct CDTOC_Desc); + + result = -1; /* Sentinel in case we don't find the right track. */ + for (i=0; i<tracks; i++) { + if (toc->trackdesc[i].point < 100 && MSF_TO_LBA(toc->trackdesc[i].p) <= sector) { + result = toc->trackdesc[i].point; + } + } + + return result; +} + +/* + * Determine whether the given node is really a video CD video + * file. Return non-zero if it appears to be a video file. + */ +static int +cd9660_is_video_file(struct iso_node *ip, struct iso_mnt *imp) +{ + int lbn; + int track; + + /* Check whether this could really be a Video CD at all */ + if (((imp->im_flags2 & IMF2_IS_VCD) == 0) || + imp->phys_devvp == NULL || + imp->toc == NULL) + { + return 0; /* Doesn't even look like VCD... */ + } + + /* Make sure it is a file */ + if ((ip->inode.iso_mode & S_IFMT) != S_IFREG) + return 0; /* Not even a file... */ + + /* + * And in the right directory. This assumes the same inode + * number convention that cd9660_vget_internal uses (that + * part of the inode number is the block containing the + * file's directory entry). + */ + lbn = lblkno(imp, ip->i_number); + if (lbn < imp->video_dir_start || lbn >= imp->video_dir_end) + return 0; /* Not in the correct directory */ + + /* + * If we get here, the file should be a video file, but + * do a couple of extra sanity checks just to be sure.
+ * First, verify the form of the name + */ + if (strlen(ip->i_namep) != 11 || /* Wrong length? */ + bcmp(ip->i_namep+7, ".DAT", 4) || /* Wrong extension? */ + (bcmp(ip->i_namep, "AVSEQ", 5) && /* Wrong beginning? */ + bcmp(ip->i_namep, "MUSIC", 5))) + { + return 0; /* Invalid name format */ + } + + /* + * Verify that AVSEQnn.DAT is in track #(nn+1). This would + * not be appropriate for Super Video CD, which allows + * multiple sessions, so the track numbers might not + * match up like this. + */ + track = (ip->i_namep[5] - '0') * 10 + ip->i_namep[6] - '0'; + if (track != (cd9660_track_for_sector(imp->toc, ip->iso_start) - 1)) + { + return 0; /* Wrong number in name */ + } + + /* It must be a video file if we got here. */ + return 1; +} + int cd9660_vget(mp, ino, vpp) struct mount *mp; @@ -936,15 +1227,31 @@ * go get apple extensions to ISO directory record or use * defaults when there are no apple extensions. */ - if ( (isonum_711( isodir->flags ) & directoryBit) == 0 ) { + if ( ((isonum_711( isodir->flags ) & directoryBit) == 0) && + (imp->iso_ftype != ISO_FTYPE_RRIP) ) { /* This is an ISO directory record for a file */ - DRGetTypeCreatorAndFlags( imp, isodir, &ip->i_FileType, - &ip->i_Creator, &ip->i_FinderFlags ); + DRGetTypeCreatorAndFlags(imp, isodir, &ip->i_FileType, + &ip->i_Creator, &ip->i_FinderFlags); + + if (isonum_711(isodir->flags) & associatedBit) + ip->i_flag |= ISO_ASSOCIATED; + } + + /* + * Shadow the ISO 9660 invisible state to the FinderInfo + */ + if (isonum_711(isodir->flags) & existenceBit) { + ip->i_FinderFlags |= fInvisibleBit; } ip->iso_extent = isonum_733(isodir->extent); ip->i_size = isonum_733(isodir->size); ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; + /* + * account for AppleDouble header + */ + if (ip->i_flag & ISO_ASSOCIATED) + ip->i_size += ADH_SIZE; /* * if we have a valid name, fill in i_namep with UTF-8 name @@ -965,13 +1272,13 @@ case ISO_FTYPE_JOLIET: ucsfntrans((u_int16_t *)isodir->name, namelen, 
utf8namep, &namelen, - isonum_711(isodir->flags) & directoryBit); + isonum_711(isodir->flags) & directoryBit, ip->i_flag & ISO_ASSOCIATED); break; default: isofntrans (isodir->name, namelen, utf8namep, &namelen, - imp->iso_ftype == ISO_FTYPE_9660); + imp->iso_ftype == ISO_FTYPE_9660, ip->i_flag & ISO_ASSOCIATED); } utf8namep[namelen] = '\0'; @@ -1005,6 +1312,22 @@ break; } + /* + * See if this is a Video CD file. If so, we must adjust the + * length to account for larger sectors plus the RIFF header. + * We also must substitute the VOP_READ and VOP_PAGEIN functions. + * + * The cd9660_is_video_file routine assumes that the inode has + * been completely set up; it refers to several fields. + * + * This must be done before we release bp, because isodir + * points into bp's data. + */ + if (cd9660_is_video_file(ip, imp)) + { + cd9660_xa_init(vp, isodir); + } + if (bp != 0) brelse(bp); @@ -1158,8 +1481,14 @@ myPtr += 14;/* add in CD-XA fixed record offset (tnx, Phillips) */ myNewAppleExtPtr = (NewAppleExtension *) myPtr; - /* calculate the "real" end of the directory record information */ + /* + * Calculate the "real" end of the directory record information. + * + * Note: We always read the first 4 bytes of the System-Use data, so + * adjust myPtr down so we don't read off the end of the directory! 
+ */ myPtr = ((char *) theDirRecPtr) + (isonum_711(theDirRecPtr->length)); + myPtr -= sizeof(NewAppleExtension) - 1; while( (char *) myNewAppleExtPtr < myPtr ) /* end of directory buffer */ { /* @@ -1169,8 +1498,8 @@ * struct OptionalSystemUse * { * byte Signature[2]; - * byte systemUseID; * byte OSULength; + * byte systemUseID; * byte fileType[4]; # only if HFS * byte fileCreator[4]; # only if HFS * byte finderFlags[2]; # only if HFS diff -urN xnu-344.49/bsd/isofs/cd9660/cd9660_vnops.c xnu-517/bsd/isofs/cd9660/cd9660_vnops.c --- xnu-344.49/bsd/isofs/cd9660/cd9660_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/cd9660_vnops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -91,8 +91,8 @@ #include #include #include - #include +#include #include #include @@ -308,9 +308,52 @@ imp = ip->i_mnt; VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - if (UBCISVALID(vp)) - error = cluster_read(vp, uio, (off_t)ip->i_size, devBlockSize, 0); - else { + if (UBCISVALID(vp)) { + /* + * Copy any part of the Apple Double header. 
+ */ + if ((ip->i_flag & ISO_ASSOCIATED) && (uio->uio_offset < ADH_SIZE)) { + apple_double_header_t header; + int bytes; + + if (uio->uio_offset < sizeof(apple_double_header_t)) { + header.magic = APPLEDOUBLE_MAGIC; + header.version = APPLEDOUBLE_VERSION; + header.count = 2; + header.entries[0].entryID = APPLEDOUBLE_FINDERINFO; + header.entries[0].offset = offsetof(apple_double_header_t, finfo); + header.entries[0].length = 32; + header.entries[1].entryID = APPLEDOUBLE_RESFORK; + header.entries[1].offset = ADH_SIZE; + header.entries[1].length = ip->i_size - ADH_SIZE; + header.finfo.fdType = ip->i_FileType; + header.finfo.fdCreator = ip->i_Creator; + header.finfo.fdFlags = ip->i_FinderFlags; + header.finfo.fdLocation.v = -1; + header.finfo.fdLocation.h = -1; + header.finfo.fdReserved = 0; + + bytes = min(uio->uio_resid, sizeof(apple_double_header_t) - uio->uio_offset); + error = uiomove(((char *) &header) + uio->uio_offset, bytes, uio); + if (error) + return error; + } + if (uio->uio_resid && uio->uio_offset < ADH_SIZE) { + caddr_t buffer; + + if (kmem_alloc(kernel_map, (vm_offset_t *)&buffer, ADH_SIZE)) { + return (ENOMEM); + } + bytes = min(uio->uio_resid, ADH_SIZE - uio->uio_offset); + error = uiomove(((char *) buffer) + uio->uio_offset, bytes, uio); + kmem_free(kernel_map, (vm_offset_t)buffer, ADH_SIZE); + if (error) + return error; + } + } + if (uio->uio_resid > 0) + error = cluster_read(vp, uio, (off_t)ip->i_size, devBlockSize, 0); + } else { do { lbn = lblkno(imp, uio->uio_offset); @@ -363,7 +406,6 @@ struct proc *a_p; } */ *ap; { - printf("You did ioctl for isofs !!\n"); return (ENOTTY); } @@ -505,6 +547,10 @@ /* * Vnode op for readdir + * + * Note that directories are sector aligned (2K) and + * that an entry can cross a logical block but not + * a sector. 
*/ int cd9660_readdir(ap) @@ -536,7 +582,7 @@ dp = VTOI(vdp); imp = dp->i_mnt; - bmask = imp->im_bmask; + bmask = imp->im_sector_size - 1; MALLOC(idp, struct isoreaddir *, sizeof(*idp), M_TEMP, M_WAITOK); idp->saveent.d_namlen = 0; @@ -550,7 +596,7 @@ idp->curroff = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && - (error = VOP_BLKATOFF(vdp, (off_t)idp->curroff, NULL, &bp))) { + (error = VOP_BLKATOFF(vdp, SECTOFF(imp, idp->curroff), NULL, &bp))) { FREE(idp, M_TEMP); return (error); } @@ -565,7 +611,7 @@ if ((idp->curroff & bmask) == 0) { if (bp != NULL) brelse(bp); - if ( (error = VOP_BLKATOFF(vdp, (off_t)idp->curroff, NULL, &bp)) ) + if ((error = VOP_BLKATOFF(vdp, SECTOFF(imp, idp->curroff), NULL, &bp))) break; entryoffsetinblock = 0; } @@ -579,7 +625,7 @@ if (reclen == 0) { /* skip to next block, if any */ idp->curroff = - (idp->curroff & ~bmask) + imp->logical_block_size; + (idp->curroff & ~bmask) + imp->im_sector_size; continue; } @@ -589,7 +635,7 @@ break; } - if (entryoffsetinblock + reclen > imp->logical_block_size) { + if (entryoffsetinblock + reclen > imp->im_sector_size) { error = EINVAL; /* illegal directory, so stop looking */ break; @@ -603,17 +649,20 @@ break; } - /* skip over associated files (Mac OS resource fork) */ - if (isonum_711(ep->flags) & associatedBit) { - idp->curroff += reclen; - entryoffsetinblock += reclen; - continue; + /* + * Some poorly mastered discs have an incorrect directory + * file size. If the '.' entry has a better size (bigger) + * then use that instead. 
+ */ + if ((uio->uio_offset == 0) && (isonum_733(ep->size) > endsearch)) { + dp->i_size = endsearch = isonum_733(ep->size); } if ( isonum_711(ep->flags) & directoryBit ) idp->current.d_fileno = isodirino(ep, imp); else { - idp->current.d_fileno = (bp->b_blkno << imp->im_bshift) + entryoffsetinblock; + idp->current.d_fileno = (bp->b_blkno << imp->im_bshift) + + entryoffsetinblock; } idp->curroff += reclen; @@ -630,7 +679,8 @@ case ISO_FTYPE_JOLIET: ucsfntrans((u_int16_t *)ep->name, idp->current.d_namlen, idp->current.d_name, &namelen, - isonum_711(ep->flags) & directoryBit); + isonum_711(ep->flags) & directoryBit, + isonum_711(ep->flags) & associatedBit); idp->current.d_namlen = (u_char)namelen; if (idp->current.d_namlen) error = iso_uiodir(idp,&idp->current,idp->curroff); @@ -650,7 +700,8 @@ default: isofntrans(ep->name,idp->current.d_namlen, idp->current.d_name, &namelen, - imp->iso_ftype == ISO_FTYPE_9660); + imp->iso_ftype == ISO_FTYPE_9660, + isonum_711(ep->flags) & associatedBit); idp->current.d_namlen = (u_char)namelen; if (imp->iso_ftype == ISO_FTYPE_DEFAULT) error = iso_shipdir(idp); @@ -830,22 +881,6 @@ } /* - * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually - * done. If a buffer has been saved in anticipation of a CREATE, delete it. - */ -int -cd9660_abortop(ap) - struct vop_abortop_args /* { - struct vnode *a_dvp; - struct componentname *a_cnp; - } */ *ap; -{ - if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) - FREE_ZONE(ap->a_cnp->cn_pnbuf, ap->a_cnp->cn_pnlen, M_NAMEI); - return (0); -} - -/* * Lock an inode. 
*/ @@ -1017,22 +1052,65 @@ { struct vnode *vp = ap->a_vp; upl_t pl = ap->a_pl; - size_t size= ap->a_size; + size_t size = ap->a_size; off_t f_offset = ap->a_f_offset; vm_offset_t pl_offset = ap->a_pl_offset; int flags = ap->a_flags; register struct iso_node *ip = VTOI(vp); - int devBlockSize=0, error; + int error = 0; - /* check pageouts are for reg file only and ubc info is present*/ - if (UBCINVALID(vp)) - panic("cd9660_pagein: Not a VREG"); - UBCINFOCHECK("cd9660_pagein", vp); + /* + * Copy the Apple Double header. + */ + if ((ip->i_flag & ISO_ASSOCIATED) && (f_offset == 0) && (size == ADH_SIZE)) { + apple_double_header_t header; + kern_return_t kret; + vm_offset_t ioaddr; + + kret = ubc_upl_map(pl, &ioaddr); + if (kret != KERN_SUCCESS) + panic("cd9660_xa_pagein: ubc_upl_map error = %d", kret); + ioaddr += pl_offset; + bzero((caddr_t)ioaddr, ADH_SIZE); + + header.magic = APPLEDOUBLE_MAGIC; + header.version = APPLEDOUBLE_VERSION; + header.count = 2; + header.entries[0].entryID = APPLEDOUBLE_FINDERINFO; + header.entries[0].offset = offsetof(apple_double_header_t, finfo); + header.entries[0].length = 32; + header.entries[1].entryID = APPLEDOUBLE_RESFORK; + header.entries[1].offset = ADH_SIZE; + header.entries[1].length = ip->i_size - ADH_SIZE; + header.finfo.fdType = ip->i_FileType; + header.finfo.fdCreator = ip->i_Creator; + header.finfo.fdFlags = ip->i_FinderFlags; + header.finfo.fdLocation.v = -1; + header.finfo.fdLocation.h = -1; + header.finfo.fdReserved = 0; + + bcopy((caddr_t)&header, (caddr_t)ioaddr, sizeof(apple_double_header_t)); + + kret = ubc_upl_unmap(pl); + if (kret != KERN_SUCCESS) + panic("cd9660_xa_pagein: ubc_upl_unmap error = %d", kret); - VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); + if ((flags & UPL_NOCOMMIT) == 0) { + ubc_upl_commit_range(pl, pl_offset, size, UPL_COMMIT_FREE_ON_EMPTY); + } + } else { + int devBlockSize = 0; + + /* check pageouts are for reg file only and ubc info is present*/ + if (UBCINVALID(vp)) + panic("cd9660_pagein: Not 
a VREG"); + UBCINFOCHECK("cd9660_pagein", vp); + + VOP_DEVBLOCKSIZE(ip->i_devvp, &devBlockSize); - error = cluster_pagein(vp, pl, pl_offset, f_offset, size, + error = cluster_pagein(vp, pl, pl_offset, f_offset, size, (off_t)ip->i_size, devBlockSize, flags); + } return (error); } @@ -1169,6 +1247,277 @@ } /* + * Make a RIFF file header for a CD-ROM XA media file. + */ +__private_extern__ void +cd9660_xa_init(struct vnode *vp, struct iso_directory_record *isodir) +{ + u_long sectors; + struct iso_node *ip = VTOI(vp); + struct riff_header *header; + u_char name_len; + char *cdxa; + + MALLOC(header, struct riff_header *, sizeof(struct riff_header), M_TEMP, M_WAITOK); + + sectors = ip->i_size / 2048; + + strncpy(header->riff, "RIFF", 4); + header->fileSize = NXSwapHostLongToLittle(sectors * CDXA_SECTOR_SIZE + sizeof(struct riff_header) - 8); + strncpy(header->cdxa, "CDXA", 4); + strncpy(header->fmt, "fmt ", 4); + header->fmtSize = NXSwapHostLongToLittle(16); + strncpy(header->data, "data", 4); + header->dataSize = NXSwapHostLongToLittle(sectors * CDXA_SECTOR_SIZE); + + /* + * Copy the CD-ROM XA extended directory information into the header. As far as + * I can tell, it's always 14 bytes in the directory record, but allocated 16 bytes + * in the header (the last two being zeroed pad bytes). + */ + name_len = isonum_711(isodir->name_len); + cdxa = &isodir->name[name_len]; + if ((name_len & 0x01) == 0) + ++cdxa; /* Skip pad byte */ + bcopy(cdxa, header->fmtData, 14); + header->fmtData[14] = 0; + header->fmtData[15] = 0; + + /* + * Point this i-node to the "whole sector" device instead of the normal + * device. This allows cd9660_strategy to be ignorant of the block + * (sector) size. 
+ */ + vrele(ip->i_devvp); + ip->i_devvp = ip->i_mnt->phys_devvp; + VREF(ip->i_devvp); + + ip->i_size = sectors * CDXA_SECTOR_SIZE + sizeof(struct riff_header); + ip->i_riff = header; + vp->v_op = cd9660_cdxaop_p; +} + +/* + * Helper routine for VOP_READ and VOP_PAGEIN of CD-ROM XA multimedia files. + * This routine determines the physical location of the file, then reads + * sectors directly from the device into a buffer. It also handles inserting + * the RIFF header at the beginning of the file. + * + * Exactly one of buffer or uio must be non-zero. It will either bcopy to + * buffer, or uiomove via uio. + * + * XXX Should this code be using breadn and vp->v_lastr to support single-block + * read-ahead? Should we try more aggressive read-ahead like cluster_io does? + * + * XXX This could be made to do larger I/O to the device (reading all the + * whole sectors directly into the buffer). That would make the code more + * complex, and the current code only adds 2.5% overhead compared to reading + * from the device directly (at least on my test machine). + */ +static int +cd9660_xa_read_common( + struct vnode *vp, + off_t offset, + size_t amount, + caddr_t buffer, + struct uio *uio) +{ + struct iso_node *ip = VTOI(vp); + struct buf *bp; + off_t diff; /* number of bytes from offset to file's EOF */ + daddr_t block; /* physical disk block containing offset */ + off_t sect_off; /* starting offset into current sector */ + u_int count; /* number of bytes to transfer in current block */ + int error=0; + + /* + * Copy any part of the RIFF header. + */ + if (offset < sizeof(struct riff_header)) { + char *p; + + p = ((char *) ip->i_riff) + offset; + count = min(amount, sizeof(struct riff_header) - offset); + if (buffer) { + bcopy(p, buffer, count); + buffer += count; + } else { + error = uiomove(p, count, uio); + } + amount -= count; + offset += count; + } + if (error) + return error; + + /* + * Loop over (possibly partial) blocks to transfer. 
+ */ + while (error == 0 && amount > 0) { + /* + * Determine number of bytes until EOF. If we've hit + * EOF then return. + */ + diff = ip->i_size - offset; + if (diff <= 0) + return 0; + + /* Get a block from the underlying device */ + block = ip->iso_start + (offset - sizeof(struct riff_header))/CDXA_SECTOR_SIZE; + error = bread(ip->i_devvp, block, CDXA_SECTOR_SIZE, NOCRED, &bp); + if (error) { + brelse(bp); + return error; + } + if (bp->b_resid) { + printf("isofs: cd9660_xa_read_common: bread didn't read full sector\n"); + return EIO; + } + + /* Figure out which part of the block to copy, and copy it */ + sect_off = (offset - sizeof(struct riff_header)) % CDXA_SECTOR_SIZE; + count = min(CDXA_SECTOR_SIZE-sect_off, amount); + if (diff < count) /* Pin transfer amount to EOF */ + count = diff; + + if (buffer) { + bcopy(bp->b_data+sect_off, buffer, count); + buffer += count; + } else { + error = uiomove(bp->b_data+sect_off, count, uio); + } + amount -= count; + offset += count; + + /* + * If we copied through the end of the block, or the end of file, then + * age the device block. This is optimized for sequential access. + */ + if (sect_off+count == CDXA_SECTOR_SIZE || offset == (off_t)ip->i_size) + bp->b_flags |= B_AGE; + brelse(bp); + } + + return error; +} + +/* + * Read from a CD-ROM XA multimedia file. + * + * This uses the same common routine as pagein for doing the actual read + * from the device. + * + * This routine doesn't do any caching beyond what the block device does. + * Even then, cd9660_xa_read_common ages the blocks once we read up to + * the end. + * + * We don't even take advantage if the file has been memory mapped and has + * valid pages already (in which case we could just uiomove from the page + * to the caller). Since we're a read-only filesystem, there can't be + * any cache coherency problems. Multimedia files are expected to be + * large and streamed anyway, so caching file contents probably isn't + * important. 
+ */ +int +cd9660_xa_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + register struct uio *uio = ap->a_uio; + register struct iso_node *ip = VTOI(vp); + off_t offset = uio->uio_offset; + size_t size = uio->uio_resid; + + /* Check for some obvious parameter problems */ + if (offset < 0) + return EINVAL; + if (size == 0) + return 0; + if (offset >= ip->i_size) + return 0; + + /* Pin the size of the read to the file's EOF */ + if (offset + size > ip->i_size) + size = ip->i_size - offset; + + return cd9660_xa_read_common(vp, offset, size, NULL, uio); +} + +/* + * Page in from a CD-ROM XA media file. + * + * Since our device block size isn't a power of two, we can't use + * cluster_pagein. Instead, we have to map the page and read into it. + */ +static int +cd9660_xa_pagein(ap) + struct vop_pagein_args /* { + struct vnode *a_vp, + upl_t a_pl, + vm_offset_t a_pl_offset, + off_t a_f_offset, + size_t a_size, + struct ucred *a_cred, + int a_flags + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + upl_t pl = ap->a_pl; + size_t size= ap->a_size; + off_t f_offset = ap->a_f_offset; + vm_offset_t pl_offset = ap->a_pl_offset; + int flags = ap->a_flags; + register struct iso_node *ip = VTOI(vp); + int error; + kern_return_t kret; + vm_offset_t ioaddr; + + /* check pageins are for reg file only and ubc info is present*/ + if (UBCINVALID(vp)) + panic("cd9660_xa_pagein: Not a VREG"); + UBCINFOCHECK("cd9660_xa_pagein", vp); + + if (size <= 0) + panic("cd9660_xa_pagein: size = %d", size); + + kret = ubc_upl_map(pl, &ioaddr); + if (kret != KERN_SUCCESS) + panic("cd9660_xa_pagein: ubc_upl_map error = %d", kret); + + ioaddr += pl_offset; + + /* Make sure pagein doesn't extend past EOF */ + if (f_offset + size > ip->i_size) + size = ip->i_size - f_offset; /* pin size to EOF */ + + /* Read the data in using the underlying device */ + error = cd9660_xa_read_common(vp, f_offset, 
size, (caddr_t)ioaddr, NULL); + + /* Zero fill part of page past EOF */ + if (ap->a_size > size) + bzero((caddr_t)ioaddr+size, ap->a_size-size); + + kret = ubc_upl_unmap(pl); + if (kret != KERN_SUCCESS) + panic("cd9660_xa_pagein: ubc_upl_unmap error = %d", kret); + + if ((flags & UPL_NOCOMMIT) == 0) + { + if (error) + ubc_upl_abort_range(pl, pl_offset, ap->a_size, UPL_ABORT_FREE_ON_EMPTY); + else + ubc_upl_commit_range(pl, pl_offset, ap->a_size, UPL_COMMIT_FREE_ON_EMPTY); + } + + return error; +} + +/* * Global vfs data structures for isofs */ #define cd9660_create \ @@ -1244,7 +1593,7 @@ { &vop_symlink_desc, (VOPFUNC)cd9660_symlink }, /* symlink */ { &vop_readdir_desc, (VOPFUNC)cd9660_readdir }, /* readdir */ { &vop_readlink_desc, (VOPFUNC)cd9660_readlink },/* readlink */ - { &vop_abortop_desc, (VOPFUNC)cd9660_abortop }, /* abortop */ + { &vop_abortop_desc, (VOPFUNC)nop_abortop }, /* abortop */ { &vop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ { &vop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ { &vop_lock_desc, (VOPFUNC)cd9660_lock }, /* lock */ @@ -1271,6 +1620,65 @@ }; struct vnodeopv_desc cd9660_vnodeop_opv_desc = { &cd9660_vnodeop_p, cd9660_vnodeop_entries }; + +/* + * The VOP table for CD-ROM XA (media) files is almost the same + * as for ordinary files, except for read, and pagein. + * Note that cd9660_xa_read doesn't use cluster I/O, so cmap + * isn't needed, and isn't implemented. Similarly, it doesn't + * do bread() on CD XA vnodes, so bmap, blktooff, offtoblk + * aren't needed. 
+ */ +int (**cd9660_cdxaop_p)(void *); +struct vnodeopv_entry_desc cd9660_cdxaop_entries[] = { + { &vop_default_desc, (VOPFUNC)vn_default_error }, + { &vop_lookup_desc, (VOPFUNC)cd9660_lookup }, /* lookup */ + { &vop_create_desc, (VOPFUNC)cd9660_create }, /* create */ + { &vop_mknod_desc, (VOPFUNC)cd9660_mknod }, /* mknod */ + { &vop_open_desc, (VOPFUNC)cd9660_open }, /* open */ + { &vop_close_desc, (VOPFUNC)cd9660_close }, /* close */ + { &vop_access_desc, (VOPFUNC)cd9660_access }, /* access */ + { &vop_getattr_desc, (VOPFUNC)cd9660_getattr }, /* getattr */ + { &vop_setattr_desc, (VOPFUNC)cd9660_setattr }, /* setattr */ + { &vop_read_desc, (VOPFUNC)cd9660_xa_read }, /* read */ + { &vop_write_desc, (VOPFUNC)cd9660_write }, /* write */ + { &vop_lease_desc, (VOPFUNC)cd9660_lease_check },/* lease */ + { &vop_ioctl_desc, (VOPFUNC)cd9660_ioctl }, /* ioctl */ + { &vop_select_desc, (VOPFUNC)cd9660_select }, /* select */ + { &vop_mmap_desc, (VOPFUNC)cd9660_mmap }, /* mmap */ + { &vop_fsync_desc, (VOPFUNC)cd9660_fsync }, /* fsync */ + { &vop_seek_desc, (VOPFUNC)cd9660_seek }, /* seek */ + { &vop_remove_desc, (VOPFUNC)cd9660_remove }, /* remove */ + { &vop_link_desc, (VOPFUNC)cd9660_link }, /* link */ + { &vop_rename_desc, (VOPFUNC)cd9660_rename }, /* rename */ + { &vop_copyfile_desc, (VOPFUNC)cd9660_copyfile },/* copyfile */ + { &vop_mkdir_desc, (VOPFUNC)cd9660_mkdir }, /* mkdir */ + { &vop_rmdir_desc, (VOPFUNC)cd9660_rmdir }, /* rmdir */ + { &vop_symlink_desc, (VOPFUNC)cd9660_symlink }, /* symlink */ + { &vop_readdir_desc, (VOPFUNC)cd9660_readdir }, /* readdir */ + { &vop_readlink_desc, (VOPFUNC)cd9660_readlink },/* readlink */ + { &vop_inactive_desc, (VOPFUNC)cd9660_inactive },/* inactive */ + { &vop_reclaim_desc, (VOPFUNC)cd9660_reclaim }, /* reclaim */ + { &vop_lock_desc, (VOPFUNC)cd9660_lock }, /* lock */ + { &vop_unlock_desc, (VOPFUNC)cd9660_unlock }, /* unlock */ + { &vop_strategy_desc, (VOPFUNC)cd9660_strategy },/* strategy */ + { &vop_print_desc, 
(VOPFUNC)cd9660_print }, /* print */ + { &vop_islocked_desc, (VOPFUNC)cd9660_islocked },/* islocked */ + { &vop_pathconf_desc, (VOPFUNC)cd9660_pathconf },/* pathconf */ + { &vop_advlock_desc, (VOPFUNC)cd9660_advlock }, /* advlock */ + { &vop_blkatoff_desc, (VOPFUNC)cd9660_blkatoff },/* blkatoff */ + { &vop_valloc_desc, (VOPFUNC)cd9660_valloc }, /* valloc */ + { &vop_vfree_desc, (VOPFUNC)cd9660_vfree }, /* vfree */ + { &vop_truncate_desc, (VOPFUNC)cd9660_truncate },/* truncate */ + { &vop_update_desc, (VOPFUNC)cd9660_update }, /* update */ + { &vop_bwrite_desc, (VOPFUNC)vn_bwrite }, + { &vop_pagein_desc, (VOPFUNC)cd9660_xa_pagein }, /* Pagein */ + { &vop_pageout_desc, (VOPFUNC)cd9660_pageout }, /* Pageout */ + { &vop_getattrlist_desc, (VOPFUNC)cd9660_getattrlist }, /* getattrlist */ + { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc cd9660_cdxaop_opv_desc = + { &cd9660_cdxaop_p, cd9660_cdxaop_entries }; /* * Special device vnode ops diff -urN xnu-344.49/bsd/isofs/cd9660/iso.h xnu-517/bsd/isofs/cd9660/iso.h --- xnu-344.49/bsd/isofs/cd9660/iso.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/isofs/cd9660/iso.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -261,6 +261,7 @@ int logical_block_size; int im_bshift; int im_bmask; + int im_sector_size; int volume_space_size; struct netexport im_export; @@ -276,6 +277,10 @@ struct timespec creation_date; /* needed for getattrlist */ struct timespec modification_date; /* needed for getattrlist */ u_char volume_id[32]; /* name of volume */ + struct vnode *phys_devvp; /* device for 2352-byte blocks */ + struct CDTOC *toc; /* the TOC, or NULL for none */ + int video_dir_start; /* start sector of the "MPEGAV" dir */ + int video_dir_end; /* sector following end of "MPEGAV" dir */ }; /* bit settings for iso_mnt.im_flags2 */ @@ -286,6 +291,9 @@ */ #define IMF2_IS_CDXA 0x00000001 +/* CD is Video CD (version < 2.0) */ +#define IMF2_IS_VCD 0x00000002 + #define VFSTOISOFS(mp) ((struct iso_mnt *)((mp)->mnt_data)) #define blkoff(imp, loc) ((loc) & (imp)->im_bmask) @@ -293,6 +301,10 @@ #define lblkno(imp, loc) ((loc) >> (imp)->im_bshift) #define blksize(imp, ip, lbn) ((imp)->logical_block_size) +#define SECTOFF(imp, off) \ + (off_t)(((off) / (imp)->im_sector_size) * (imp)->im_sector_size) + + int cd9660_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); int cd9660_start __P((struct mount *, int, struct proc *)); @@ -316,6 +328,7 @@ #if FIFO extern int (**cd9660_fifoop_p)(void *); #endif +extern int (**cd9660_cdxaop_p)(void *); static __inline int isonum_711(p) @@ -389,8 +402,8 @@ int isofncmp __P((u_char *, int, u_char *, int)); int ucsfncmp __P((u_int16_t *, int, u_int16_t *, int)); -void isofntrans __P((u_char *, int, u_char *, u_short *, int)); -void ucsfntrans __P((u_int16_t *, int, u_char *, u_short *, int)); +void isofntrans __P((u_char *, int, u_char *, u_short *, int, int)); +void ucsfntrans __P((u_int16_t *, int, u_char *, u_short *, int, int)); ino_t isodirino __P((struct iso_directory_record *, struct iso_mnt *)); int attrcalcsize __P((struct attrlist *attrlist)); void packattrblk __P((struct attrlist *alist, 
struct vnode *vp, @@ -398,9 +411,68 @@ /* - * Associated files have a leading '='. + * Associated files have a leading "._". + */ +#define ASSOCCHAR1 '.' +#define ASSOCCHAR2 '_' + +/* + * This header is prepended on media tracks, such as Video CD MPEG files. + */ +struct riff_header { + char riff[4]; // "RIFF" + u_int32_t fileSize; // little endian file size, not including this field or sig + char cdxa[4]; // "CDXA" + char fmt[4]; // "fmt " + u_int32_t fmtSize; // always 16 (XXX this is an assumption) + char fmtData[16]; // CDXA extension of ISO directory entry, padded to 16 bytes + char data[4]; // "data" + u_int32_t dataSize; // number of sectors * 2352, little endian +}; + +#define CDXA_SECTOR_SIZE 2352 + + +/* + * AppleDouble constants */ -#define ASSOCCHAR '=' +#define APPLEDOUBLE_MAGIC 0x00051607 +#define APPLEDOUBLE_VERSION 0x00020000 + +#define APPLEDOUBLE_DATAFORK 1 +#define APPLEDOUBLE_RESFORK 2 +#define APPLEDOUBLE_FINDERINFO 9 + +/* + * Note that 68k alignment is needed to make sure that the first + * AppleDoubleEntry (after the numEntries below) is *immediately* + * after the numEntries, and not padded by 2 bytes. + * + * Consult RFC 1740 for details on AppleSingle/AppleDouble formats. + */ +#pragma options align=mac68k + +struct apple_double_entry { + u_int32_t entryID; + u_int32_t offset; + u_int32_t length; +}; +typedef struct apple_double_entry apple_double_entry_t; + +struct apple_double_header { + u_int32_t magic; + u_int32_t version; + u_int8_t filler[16]; + u_int16_t count; + apple_double_entry_t entries[2]; /* FinderInfo + ResourceFork */ + struct finder_info finfo; +}; +typedef struct apple_double_header apple_double_header_t; + +#define ADH_SIZE 4096 +#define ADH_BLKS 2 + +#pragma options align=reset #endif /* __APPLE_API_PRIVATE */ #endif /* ! 
_ISO_H_ */ diff -urN xnu-344.49/bsd/kern/bsd_init.c xnu-517/bsd/kern/bsd_init.c --- xnu-344.49/bsd/kern/bsd_init.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/bsd_init.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -69,13 +69,6 @@ * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ -/* - * HISTORY - * 16-Apr-98 A. Ramesh at Apple - * Created for Apple Core from DR2 init_main.c. - */ - -#include #include #include @@ -88,7 +81,10 @@ #include #include #include -#include +#include +#include + +#include #include #include @@ -103,11 +99,11 @@ #include #include -#include +#include /* for ux_exception_port */ #include #include -#include +#include /* for pseudo_inits */ #include #include @@ -120,11 +116,12 @@ #include #include -extern shared_region_mapping_t system_shared_region; extern int app_profile; /* on/off switch for pre-heat cache */ char copyright[] = -"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California. All rights reserved.\n\n"; +"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\t" +"The Regents of the University of California. " +"All rights reserved.\n\n"; extern void ux_handler(); @@ -155,9 +152,12 @@ long dumplo; /* offset into dumpdev */ long hostid; char hostname[MAXHOSTNAMELEN]; -int hostnamelen; +int hostnamelen; char domainname[MAXDOMNAMELEN]; -int domainnamelen; +int domainnamelen; +char classichandler[32] = {0}; +long classichandler_fsid = -1L; +long classichandler_fileid = -1L; char rootdevice[16]; /* hfs device names have at least 9 chars */ struct timeval boottime; /* GRODY! This has to go... 
*/ @@ -170,7 +170,7 @@ struct vnode *rootvp; int boothowto = RB_DEBUG; -#define BSD_PAGABLE_MAP_SIZE (4 * 512 * 1024) +#define BSD_PAGABLE_MAP_SIZE (16 * 512 * 1024) vm_map_t bsd_pageable_map; vm_map_t mb_map; semaphore_t execve_semaphore; @@ -183,8 +183,8 @@ extern char init_task_failure_data[]; extern void time_zone_slock_init(void); -funnel_t * kernel_flock; -funnel_t * network_flock; +funnel_t *kernel_flock; +funnel_t *network_flock; int disable_funnel = 0; /* disables split funnel */ int enable_funnel = 0; /* disables split funnel */ @@ -194,13 +194,10 @@ * soon as a stack and segmentation * have been established. * Functions: - * clear and free user core * turn on clock * hand craft 0th process * call all initialization routines - * fork - process 0 to schedule - * - process 1 execute bootstrap - * - process 2 to page out + * hand craft 1st user process */ /* @@ -247,19 +244,8 @@ extern void uthread_zone_init(); - -#if 1 /* split funnel is enabled by default */ PE_parse_boot_arg("dfnl", &disable_funnel); -#else - /* split funnel is disabled befault */ - disable_funnel = 1; - PE_parse_boot_arg("efnl", &enable_funnel); - if (enable_funnel) { - /* enable only if efnl is set in bootarg */ - disable_funnel = 0; - } -#endif kernel_flock = funnel_alloc(KERNEL_FUNNEL); if (kernel_flock == (funnel_t *)0 ) { @@ -339,11 +325,23 @@ p->p_cred = &cred0; p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ + + TAILQ_INIT(&p->aio_activeq); + TAILQ_INIT(&p->aio_doneq); + p->aio_active_count = 0; + p->aio_done_count = 0; + + /* Set the audit info for this process */ + audit_proc_init(p); /* Create the file descriptor table. */ filedesc0.fd_refcnt = 1+1; /* +1 so shutdown will not _FREE_ZONE */ p->p_fd = &filedesc0; filedesc0.fd_cmask = cmask; + filedesc0.fd_knlistsize = -1; + filedesc0.fd_knlist = NULL; + filedesc0.fd_knhash = NULL; + filedesc0.fd_knhashmask = 0; /* Create the limits structures. 
*/ p->p_limit = &limit0; @@ -352,6 +350,7 @@ limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = NOFILE; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = MAXUPRC; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; limit0.pl_rlimit[RLIMIT_STACK] = vm_initial_limit_stack; limit0.pl_rlimit[RLIMIT_DATA] = vm_initial_limit_data; limit0.pl_rlimit[RLIMIT_CORE] = vm_initial_limit_core; @@ -404,6 +403,18 @@ /* Initialize syslog */ log_init(); + /* + * Initializes security event auditing. + * XXX: Should/could this occur later? + */ + audit_init(); + + /* Initialize kqueues */ + knote_init(); + + /* Initialize for async IO */ + aio_init(); + /* POSIX Shm and Sem */ pshm_cache_init(); psem_cache_init(); @@ -431,7 +442,7 @@ /* kick off timeout driven events by calling first time */ thread_wakeup(&lbolt); - timeout(lightning_bolt,0,hz); + timeout((void (*)(void *))lightning_bolt, 0, hz); bsd_autoconf(); @@ -459,7 +470,7 @@ * read the time after clock_initialize_calendar() * and before nfs mount */ - microtime(&time); + microtime((struct timeval *)&time); bsd_hardclockinit = -1; /* start ticking */ @@ -507,13 +518,11 @@ devfs_kernel_mount("/dev"); } -#endif DEVFS +#endif /* DEVFS */ /* Initialize signal state for process 0. 
*/ siginit(p); - /* printf("Launching user process\n"); */ - bsd_utaskbootstrap(); /* invoke post-root-mount hook */ @@ -531,6 +540,7 @@ struct uthread *ut; kern_return_t kr; thread_act_t th_act; + shared_region_mapping_t system_region; proc_name("init", p); @@ -560,8 +570,14 @@ bsd_hardclockinit = 1; /* Start bsd hardclock */ bsd_init_task = get_threadtask(th_act); init_task_failure_data[0] = 0; - shared_region_mapping_ref(system_shared_region); - vm_set_shared_region(get_threadtask(th_act), system_shared_region); + system_region = lookup_default_shared_region(ENV_DEFAULT_ROOT, + machine_slot[cpu_number()].cpu_type); + if (system_region == NULL) { + shared_file_boot_time_init(ENV_DEFAULT_ROOT, + machine_slot[cpu_number()].cpu_type); + } else { + vm_set_shared_region(get_threadtask(th_act), system_region); + } load_init_program(p); /* turn on app-profiling i.e. pre-heating */ app_profile = 1; @@ -602,7 +618,7 @@ } -#include // for MAXPARTITIONS +#include /* for MAXPARTITIONS */ setconf() { @@ -649,11 +665,7 @@ ut = (struct uthread *)get_bsdthread_info(th_act); ut->uu_sigmask = 0; - thread_hold(th_act); - (void)thread_stop(getshuttle_thread(th_act)); act_set_astbsd(th_act); - thread_release(th_act); - thread_unstop(getshuttle_thread(th_act)); (void) thread_resume(th_act); } @@ -675,6 +687,7 @@ else strcat(init_args,"-s"); } + if (PE_parse_boot_arg("-b", namep)) { boothowto |= RB_NOBOOTRC; len = strlen(init_args); @@ -708,6 +721,14 @@ strcat(init_args,"-x"); } + if (PE_parse_boot_arg("-d", namep)) { + len = strlen(init_args); + if(len != 0) + strcat(init_args," -d"); + else + strcat(init_args,"-d"); + } + PE_parse_boot_arg("srv", &srv); PE_parse_boot_arg("ncl", &ncl); PE_parse_boot_arg("nbuf", &nbuf); @@ -720,7 +741,6 @@ int oldfnl, int newfnl) { - thread_t cur_thread; boolean_t funnel_state_prev; int curfnl; funnel_t * curflock; @@ -747,8 +767,6 @@ if((curflock = thread_funnel_get()) == THR_FUNNEL_NULL) { panic("thread_funnel_switch: no funnel held"); } - - 
cur_thread = current_thread(); if ((oldfnl == NETWORK_FUNNEL) && (curflock != network_flock)) panic("thread_funnel_switch: network funnel not held"); diff -urN xnu-344.49/bsd/kern/bsd_stubs.c xnu-517/bsd/kern/bsd_stubs.c --- xnu-344.49/bsd/kern/bsd_stubs.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/bsd_stubs.c Sat Oct 25 00:25:25 2003 @@ -43,7 +43,7 @@ if (kernel_memory_allocate(mbmap, &addr, size, 0, KMA_NOPAGEWAIT|KMA_KOBJECT) == KERN_SUCCESS) - return((void *)addr); + return(addr); else return(0); @@ -101,6 +101,9 @@ sizeof(struct bdevsw)) == 0) break; } + } else { + /* NB: Not used below unless index is in range */ + devsw = &bdevsw[index]; } if ((index < 0) || (index >= nblkdev) || @@ -123,7 +126,7 @@ struct bdevsw *devsw; if (index == -1) { - devsw = bdevsw; + devsw = &bdevsw[1]; /* Start at slot 1 - this is a hack to fix the index=1 hack */ /* yes, start at 1 to avoid collision with volfs (Radar 2842228) */ for(index=1; index < nblkdev; index++, devsw++) { if(memcmp((char *)devsw, diff -urN xnu-344.49/bsd/kern/init_sysent.c xnu-517/bsd/kern/init_sysent.c --- xnu-344.49/bsd/kern/init_sysent.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/init_sysent.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-1999, 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1995-1999, 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -127,6 +127,9 @@ int socket(); int connect(); int getpriority(); +#ifdef __ppc__ +int osigreturn(); +#endif int sigreturn(); int bind(); int setsockopt(); @@ -302,6 +305,9 @@ int sem_init(); int sem_destroy(); +int fmod_watch_enable(); +int fmod_watch(); + int issetugid(); int utrace(); int pread(); @@ -314,10 +320,49 @@ int pthread_sigmask(); int __disable_threadsignal(); +int nfsclnt(); +int fhopen(); + +int aio_cancel(); +int aio_error(); +int aio_fsync(); +int aio_read(); +int aio_return(); +int aio_suspend(); +int aio_write(); +int lio_listio(); + +int kqueue(); +int kqueue_portset_np(); +int kqueue_from_portset_np(); +int kevent(); + +int audit(); +int auditon(); +int auditsvc(); +int getauid(); +int setauid(); +int getaudit(); +int setaudit(); +int getaudit_addr(); +int setaudit_addr(); +int auditctl(); + /* * System call switch table. */ +/* + * N.B. + * The argument count numbers in this table are actually + * the number of UInt32 words that comprise the arguments + * not the number of arguments + * + * This value is not currently used on PPC but Intel Darwin + * does use it and will not work correctly if the values + * are wrong + */ + struct sysent sysent[] = { syss(nosys,0), /* 0 = indir */ syss(exit,1), /* 1 = exit */ @@ -396,7 +441,7 @@ syss(sstk,1), /* 70 = sstk */ compat(smmap,6), /* 71 = old mmap */ syss(ovadvise,1), /* 72 = old vadvise */ - syss(munmap,2), /* 73 = munmap */ + sysnofnl(munmap,2), /* 73 = munmap */ syss(mprotect,3), /* 74 = mprotect */ syss(madvise,3), /* 75 = madvise */ syss(nosys,0), /* 76 was obsolete vhangup */ @@ -407,7 +452,7 @@ sysp(getpgrp,0), /* 81 = getpgrp */ sysp(setpgid,2), /* 82 = setpgid */ syss(setitimer,3), /* 83 = setitimer */ - compat(wait,0), /* 84 = old wait */ + compat(wait,1), /* 84 = old wait */ syss(swapon,1), /* 85 = swapon */ syss(getitimer,2), /* 86 = getitimer */ compat(gethostname,2), /* 87 = old gethostname */ @@ -426,7 +471,11 @@ sysp(getpriority,2), /* 100 = 
getpriority */ comaptnet(send,4), /* 101 = old send */ comaptnet(recv,4), /* 102 = old recv */ - syss(sigreturn,1), /* 103 = sigreturn */ +#ifdef __ppc__ + syss(osigreturn,1), /* 103 = sigreturn ; compat for jaguar*/ +#else + syss(sigreturn,1), /* 103 = sigreturn */ +#endif sysnets(bind,3), /* 104 = bind */ sysnets(setsockopt,5), /* 105 = setsockopt */ sysnets(listen,2), /* 106 = listen */ @@ -439,6 +488,18 @@ comaptnet(recvmsg,3), /* 113 = recvmsg */ comaptnet(sendmsg,3), /* 114 = sendmsg */ syss(nosys,0), /* 115 = old vtrace */ + +/* + * N.B. + * The argument count numbers in this table are actually + * the number of UInt32 words that comprise the arguments + * not the number of arguments + * + * This value is not currently used on PPC but Intel Darwin + * does use it and will not work correctly if the values + * are wrong + */ + #ifdef __ppc__ sysnofnl(ppc_gettimeofday,2), /* 116 = gettimeofday */ #else @@ -481,11 +542,11 @@ syss(getpgid,1), /* 151 = getpgid */ sysp(setprivexec,1),/* 152 = setprivexec */ #ifdef DOUBLE_ALIGN_PARAMS + syss(pread,6), /* 153 = pread */ + syss(pwrite,6), /* 154 = pwrite */ +#else syss(pread,5), /* 153 = pread */ syss(pwrite,5), /* 154 = pwrite */ -#else - syss(pread,4), /* 153 = pread */ - syss(pwrite,4), /* 154 = pwrite */ #endif syss(nfssvc,2), /* 155 = nfs_svc */ compat(getdirentries,4), /* 156 = old getdirentries */ @@ -499,7 +560,7 @@ syss(nosys,0), /* 164 */ #if QUOTA syss(quotactl, 4), /* 165 = quotactl */ -#else QUOTA +#else /* QUOTA */ syss(nosys, 0), /* 165 = not configured */ #endif /* QUOTA */ syss(nosys,0), /* 166 was exportfs */ @@ -516,11 +577,15 @@ syss(nosys,0), /* 177 */ syss(nosys,0), /* 178 */ syss(nosys,0), /* 179 */ - syss(kdebug_trace,6), /* 180 */ + sysnofnl(kdebug_trace,6), /* 180 */ syss(setgid,1), /* 181 */ syss(setegid,1), /* 182 */ syss(seteuid,1), /* 183 */ +#ifdef __ppc__ + syss(sigreturn, 2), /* 184 = nosys */ +#else syss(nosys,0), /* 184 = nosys */ +#endif syss(nosys,0), /* 185 = nosys */ 
syss(nosys,0), /* 186 = nosys */ syss(nosys,0), /* 187 = nosys */ @@ -529,6 +594,18 @@ syss(lstat,2), /* 190 = lstat */ syss(pathconf,2), /* 191 = pathconf */ syss(fpathconf,2), /* 192 = fpathconf */ + +/* + * N.B. + * The argument count numbers in this table are actually + * the number of UInt32 words that comprise the arguments + * not the number of arguments + * + * This value is not currently used on PPC but Intel Darwin + * does use it and will not work correctly if the values + * are wrong + */ + #if COMPAT_GETFSSTAT syss(getfsstat,3), /* 193 = getfsstat */ #else @@ -568,8 +645,8 @@ sysnets(ATPgetreq,3), /* 211 = ATPgetreq*/ sysnets(ATPgetrsp,2), /* 212 = ATPgetrsp*/ syss(nosys,0), /* 213 = Reserved for AppleTalk */ - syss(nosys,0), /* 214 = Reserved for AppleTalk */ - syss(nosys,0), /* 215 = Reserved for AppleTalk */ + syss(kqueue_from_portset_np,1), /* 214 = kqueue_from_portset_np */ + syss(kqueue_portset_np,1), /* 215 = kqueue_portset_np */ #else syss(nosys,0), /* 206 = Reserved for AppleTalk */ syss(nosys,0), /* 207 = Reserved for AppleTalk */ @@ -592,6 +669,18 @@ * We expect all filesystems to recognize the call and report that it is * not supported or to actually implement it. */ + +/* + * N.B. 
+ * The argument count numbers in this table are actually + * the number of UInt32 words that comprise the arguments + * not the number of arguments + * + * This value is not currently used on PPC but Intel Darwin + * does use it and will not work correctly if the values + * are wrong + */ + syss(nosys,3), /* 216 = HFS make complex file call (multipel forks */ syss(nosys,2), /* 217 = HFS statv extended stat call for HFS */ syss(nosys,2), /* 218 = HFS lstatv extended lstat call for HFS */ @@ -607,7 +696,7 @@ #endif /* __APPLE_API_OBSOLETE */ syss(searchfs,6), /* 225 = HFS searchfs to implement catalog searching */ syss(delete,1), /* 226 = private delete (Carbon semantics) */ - syss(copyfile,4), /* 227 = copyfile - orignally for AFP */ + syss(copyfile,6), /* 227 = copyfile - orignally for AFP */ syss(nosys,0), /* 228 */ syss(nosys,0), /* 229 */ syss(nosys,0), /* 230 */ @@ -627,8 +716,8 @@ syss(nosys,0), /* 244 */ syss(nosys,0), /* 245 */ syss(nosys,0), /* 246 */ - syss(nosys,0), /* 247 */ - syss(nosys,0), /* 248 */ + syss(nfsclnt,2), /* 247 = nfsclnt*/ + syss(fhopen,2), /* 248 = fhopen */ syss(nosys,0), /* 249 */ syss(minherit,3), /* 250 = minherit */ syss(semsys,5), /* 251 = semsys */ @@ -669,8 +758,8 @@ syss(nosys,0), /* 286 */ syss(nosys,0), /* 287 */ syss(nosys,0), /* 288 */ - syss(nosys,0), /* 289 */ - syss(nosys,0), /* 290 */ + syss(fmod_watch_enable, 1), /* 289 = fmod_watching */ + syss(fmod_watch, 4), /* 290 = fmod_watch */ syss(nosys,0), /* 291 */ syss(nosys,0), /* 292 */ syss(nosys,0), /* 293 */ @@ -693,14 +782,14 @@ syss(getsid,1), /* 310 = getsid */ syss(nosys,0), /* 311 */ syss(nosys,0), /* 312 */ - syss(nosys,0), /* 313 */ - syss(nosys,0), /* 314 */ - syss(nosys,0), /* 315 */ - syss(nosys,0), /* 316 */ - syss(nosys,0), /* 317 */ - syss(nosys,0), /* 318 */ - syss(nosys,0), /* 319 */ - syss(nosys,0), /* 320 */ + sysnofnl(aio_fsync,1), /* 313 = aio_fsync */ + sysnofnl(aio_return,1), /* 314 = aio_return */ + sysnofnl(aio_suspend,3), /* 315 = aio_suspend */ 
+ sysnofnl(aio_cancel,2), /* 316 = aio_cancel */ + sysnofnl(aio_error,1), /* 317 = aio_error */ + sysnofnl(aio_read,1), /* 318 = aio_read */ + sysnofnl(aio_write,1), /* 319 = aio_write */ + sysnofnl(lio_listio,4), /* 320 = lio_listio */ syss(nosys,0), /* 321 */ syss(nosys,0), /* 322 */ syss(nosys,0), /* 323 */ @@ -729,6 +818,38 @@ syss(nosys,0), /* 346 */ syss(nosys,0), /* 347 */ syss(nosys,0), /* 348 */ - syss(nosys,0) /* 349 */ + syss(nosys,0), /* 349 */ + syss(audit,2), /* 350 */ + syss(auditon,3), /* 351 */ + syss(auditsvc,2), /* 352 */ + syss(getauid,1), /* 353 */ + syss(setauid,1), /* 354 */ + syss(getaudit,1), /* 355 */ + syss(setaudit,1), /* 356 */ + syss(getaudit_addr,2), /* 357 */ + syss(setaudit_addr,2), /* 358 */ + syss(auditctl,1), /* 359 */ + syss(nosys,0), /* 360 */ + syss(nosys,0), /* 361 */ + syss(kqueue,0), /* 362 = kqueue */ + syss(kevent,6), /* 363 = kevent */ + syss(nosys,0), /* 364 */ + syss(nosys,0), /* 365 */ + syss(nosys,0), /* 366 */ + syss(nosys,0), /* 367 */ + syss(nosys,0), /* 368 */ + syss(nosys,0) /* 369 */ + +/* + * N.B. 
+ * The argument count numbers in this table are actually + * the number of UInt32 words that comprise the arguments + * not the number of arguments + * + * This value is not currently used on PPC but Intel Darwin + * does use it and will not work correctly if the values + * are wrong + */ + }; int nsysent = sizeof(sysent) / sizeof(sysent[0]); diff -urN xnu-344.49/bsd/kern/kdebug.c xnu-517/bsd/kern/kdebug.c --- xnu-344.49/bsd/kern/kdebug.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kdebug.c Sat Oct 25 00:25:25 2003 @@ -28,6 +28,7 @@ #define HZ 100 #include #include +#include #include #include @@ -46,7 +47,7 @@ unsigned int kdebug_enable = 0; /* track timestamps for security server's entropy needs */ -mach_timespec_t * kd_entropy_buffer = 0; +uint64_t * kd_entropy_buffer = 0; unsigned int kd_entropy_bufsize = 0; unsigned int kd_entropy_count = 0; unsigned int kd_entropy_indx = 0; @@ -97,7 +98,8 @@ /* task to string structure */ struct tts { - task_t *task; + task_t *task; /* from procs task */ + pid_t pid; /* from procs p_pid */ char task_comm[20]; /* from procs p_comm */ }; @@ -159,7 +161,7 @@ { if (kd_entropy_indx < kd_entropy_count) { - ml_get_timebase((unsigned long long *) &kd_entropy_buffer [ kd_entropy_indx]); + kd_entropy_buffer [ kd_entropy_indx] = mach_absolute_time(); kd_entropy_indx++; } @@ -231,23 +233,17 @@ kd->arg2 = arg2; kd->arg3 = arg3; kd->arg4 = arg4; - kd->arg5 = (int)current_thread(); + kd->arg5 = (int)current_act(); if (cpu_number()) kd->arg5 |= KDBG_CPU_MASK; - ml_get_timebase((unsigned long long *)&kd->timestamp); + now = kd->timestamp = mach_absolute_time(); /* Watch for out of order timestamps */ - now = (((unsigned long long)kd->timestamp.tv_sec) << 32) | - (unsigned long long)((unsigned int)(kd->timestamp.tv_nsec)); if (now < kd_prev_timebase) { - /* timestamps are out of order -- adjust */ - kd_prev_timebase++; - tsp = (mach_timespec_t *)&kd_prev_timebase; - kd->timestamp.tv_sec = tsp->tv_sec; - kd->timestamp.tv_nsec = tsp->tv_nsec; 
+ kd->timestamp = ++kd_prev_timebase; } else { @@ -353,19 +349,14 @@ kd->arg3 = arg3; kd->arg4 = arg4; kd->arg5 = arg5; - ml_get_timebase((unsigned long long *)&kd->timestamp); + now = kd->timestamp = mach_absolute_time(); /* Watch for out of order timestamps */ - now = (((unsigned long long)kd->timestamp.tv_sec) << 32) | - (unsigned long long)((unsigned int)(kd->timestamp.tv_nsec)); if (now < kd_prev_timebase) { /* timestamps are out of order -- adjust */ - kd_prev_timebase++; - tsp = (mach_timespec_t *)&kd_prev_timebase; - kd->timestamp.tv_sec = tsp->tv_sec; - kd->timestamp.tv_nsec = tsp->tv_nsec; + kd->timestamp = ++kd_prev_timebase; } else { @@ -421,11 +412,11 @@ kdebug_nolog = 1; if ((kdebug_flags & KDBG_INIT) && (kdebug_flags & KDBG_BUFINIT) && kd_bufsize && kd_buffer) - kmem_free(kernel_map, (char *)kd_buffer, kd_bufsize); + kmem_free(kernel_map, (vm_offset_t)kd_buffer, kd_bufsize); if ((kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { - kmem_free(kernel_map, (char *)kd_mapptr, kd_mapsize); + kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); kdebug_flags &= ~KDBG_MAPINIT; kd_mapsize = 0; kd_mapptr = (kd_threadmap *) 0; @@ -437,6 +428,17 @@ return(ret); } +void kdbg_trace_data(struct proc *proc, long *arg_pid) +{ + if (!proc) + *arg_pid = 0; + else + *arg_pid = proc->p_pid; + + return; +} + + void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4) { int i; @@ -484,11 +486,20 @@ if(t->count < t->maxcount) { mapptr=&t->map[t->count]; - mapptr->thread = (unsigned int)getshuttle_thread(th_act); - mapptr->valid = 1; + mapptr->thread = (unsigned int)th_act; (void) strncpy (mapptr->command, t->atts->task_comm, sizeof(t->atts->task_comm)-1); mapptr->command[sizeof(t->atts->task_comm)-1] = '\0'; + + /* + Some kernel threads have no associated pid. + We still need to mark the entry as valid. 
+ */ + if (t->atts->pid) + mapptr->valid = t->atts->pid; + else + mapptr->valid = 1; + t->count++; } } @@ -527,14 +538,20 @@ kd_mapsize = kd_mapcount * sizeof(kd_threadmap); if((kmem_alloc(kernel_map, & kd_maptomem, (vm_size_t)kd_mapsize) == KERN_SUCCESS)) + { kd_mapptr = (kd_threadmap *) kd_maptomem; + bzero(kd_mapptr, kd_mapsize); + } else kd_mapptr = (kd_threadmap *) 0; tts_mapsize = tts_count * sizeof(struct tts); if((kmem_alloc(kernel_map, & tts_maptomem, (vm_size_t)tts_mapsize) == KERN_SUCCESS)) + { tts_mapptr = (struct tts *) tts_maptomem; + bzero(tts_mapptr, tts_mapsize); + } else tts_mapptr = (struct tts *) 0; @@ -553,6 +570,7 @@ if (task_reference_try(p->task)) { tts_mapptr[i].task = p->task; + tts_mapptr[i].pid = p->p_pid; (void)strncpy(&tts_mapptr[i].task_comm, p->p_comm, sizeof(tts_mapptr[i].task_comm) - 1); i++; } @@ -573,9 +591,9 @@ { akrt.atts = &tts_mapptr[i]; task_act_iterate_wth_args(tts_mapptr[i].task, kdbg_resolve_map, &akrt); - task_deallocate(tts_mapptr[i].task); + task_deallocate((task_t) tts_mapptr[i].task); } - kmem_free(kernel_map, (char *)tts_mapptr, tts_mapsize); + kmem_free(kernel_map, (vm_offset_t)tts_mapptr, tts_mapsize); } } @@ -591,14 +609,14 @@ kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kdebug_flags &= ~(KDBG_NOWRAP | KDBG_RANGECHECK | KDBG_VALCHECK); kdebug_flags &= ~(KDBG_PIDCHECK | KDBG_PIDEXCLUDE); - kmem_free(kernel_map, (char *)kd_buffer, kd_bufsize); + kmem_free(kernel_map, (vm_offset_t)kd_buffer, kd_bufsize); kd_buffer = (kd_buf *)0; kd_bufsize = 0; kd_prev_timebase = 0LL; /* Clean up the thread map buffer */ kdebug_flags &= ~KDBG_MAPINIT; - kmem_free(kernel_map, (char *)kd_mapptr, kd_mapsize); + kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); kd_mapptr = (kd_threadmap *) 0; kd_mapsize = 0; kd_mapcount = 0; @@ -819,7 +837,7 @@ if ((kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { - kmem_free(kernel_map, (char *)kd_mapptr, kd_mapsize); + kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); 
kdebug_flags &= ~KDBG_MAPINIT; kd_mapsize = 0; kd_mapptr = (kd_threadmap *) 0; @@ -848,11 +866,11 @@ if (kmem_alloc(kernel_map, &kd_entropy_buftomem, (vm_size_t)kd_entropy_bufsize) == KERN_SUCCESS) { - kd_entropy_buffer = (mach_timespec_t *)kd_entropy_buftomem; + kd_entropy_buffer = (uint64_t *) kd_entropy_buftomem; } else { - kd_entropy_buffer = (mach_timespec_t *) 0; + kd_entropy_buffer = (uint64_t *) 0; kd_entropy_count = 0; kd_entropy_indx = 0; return (EINVAL); @@ -885,8 +903,8 @@ kd_entropy_count = 0; kd_entropy_indx = 0; kd_entropy_buftomem = 0; - kmem_free(kernel_map, (char *)kd_entropy_buffer, kd_entropy_bufsize); - kd_entropy_buffer = (mach_timespec_t *) 0; + kmem_free(kernel_map, (vm_offset_t)kd_entropy_buffer, kd_entropy_bufsize); + kd_entropy_buffer = (uint64_t *) 0; return(ret); } @@ -1025,9 +1043,9 @@ kdbg_mapinit(); break; case KERN_KDSETBUF: - /* We allow a maximum buffer size of 25% of memory */ + /* We allow a maximum buffer size of 25% of either ram or max mapped address, whichever is smaller */ /* 'value' is the desired number of trace entries */ - max_entries = (mem_size/4) / sizeof(kd_buf); + max_entries = (sane_size/4) / sizeof(kd_buf); if (value <= max_entries) nkdbufs = value; else @@ -1203,4 +1221,11 @@ } /* end if KDBG_BUFINIT */ } /* end if count */ return (EINVAL); +} + +unsigned char *getProcName(struct proc *proc); +unsigned char *getProcName(struct proc *proc) { + + return (unsigned char *) &proc->p_comm; /* Return pointer to the proc name */ + } diff -urN xnu-344.49/bsd/kern/kern_aio.c xnu-517/bsd/kern/kern_aio.c --- xnu-344.49/bsd/kern/kern_aio.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/kern/kern_aio.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,2180 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. 
+ * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + + +/* + * todo: + * 1) ramesh is looking into how to replace taking a reference on + * the user's map (vm_map_reference()) since it is believed that + * would not hold the process for us. + * 2) david is looking into a way for us to set the priority of the + * worker threads to match that of the user's thread when the + * async IO was queued. + */ + + +/* + * This file contains support for the POSIX 1003.1B AIO/LIO facility. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#define AIO_work_queued 1 +#define AIO_worker_wake 2 +#define AIO_completion_sig 3 +#define AIO_completion_cleanup_wait 4 +#define AIO_completion_cleanup_wake 5 +#define AIO_completion_suspend_wake 6 +#define AIO_fsync_delay 7 +#define AIO_cancel 10 +#define AIO_cancel_async_workq 11 +#define AIO_cancel_sync_workq 12 +#define AIO_cancel_activeq 13 +#define AIO_cancel_doneq 14 +#define AIO_fsync 20 +#define AIO_read 30 +#define AIO_write 40 +#define AIO_listio 50 +#define AIO_error 60 +#define AIO_error_val 61 +#define AIO_error_activeq 62 +#define AIO_error_workq 63 +#define AIO_return 70 +#define AIO_return_val 71 +#define AIO_return_activeq 72 +#define AIO_return_workq 73 +#define AIO_exec 80 +#define AIO_exit 90 +#define AIO_exit_sleep 91 +#define AIO_close 100 +#define AIO_close_sleep 101 +#define AIO_suspend 110 +#define AIO_suspend_sleep 111 +#define AIO_worker_thread 120 + +#if 0 +#undef KERNEL_DEBUG +#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT +#endif + +/* + * aio requests queue up on the aio_async_workq or lio_sync_workq (for + * lio_listio LIO_WAIT). Requests then move to the per process aio_activeq + * (proc.aio_activeq) when one of our worker threads start the IO. + * And finally, requests move to the per process aio_doneq (proc.aio_doneq) + * when the IO request completes. The request remains on aio_doneq until + * user process calls aio_return or the process exits, either way that is our + * trigger to release aio resources. 
+ */ +struct aio_anchor_cb +{ + int aio_async_workq_count; /* entries on aio_async_workq */ + int lio_sync_workq_count; /* entries on lio_sync_workq */ + int aio_active_count; /* entries on all active queues (proc.aio_activeq) */ + int aio_done_count; /* entries on all done queues (proc.aio_doneq) */ + TAILQ_HEAD( , aio_workq_entry ) aio_async_workq; + TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq; +}; +typedef struct aio_anchor_cb aio_anchor_cb; + + +/* + * Notes on aio sleep / wake channels. + * We currently pick a couple fields within the proc structure that will allow + * us sleep channels that currently do not collide with any other kernel routines. + * At this time, for binary compatibility reasons, we cannot create new proc fields. + */ +#define AIO_SUSPEND_SLEEP_CHAN p_estcpu +#define AIO_CLEANUP_SLEEP_CHAN p_pctcpu + + +/* + * async IO locking macros used to protect critical sections. + */ +#define AIO_LOCK usimple_lock( &aio_lock ) +#define AIO_UNLOCK usimple_unlock( &aio_lock ) + + +/* + * LOCAL PROTOTYPES + */ +static int aio_active_requests_for_process( struct proc *procp ); +static boolean_t aio_delay_fsync_request( aio_workq_entry *entryp ); +static int aio_free_request( aio_workq_entry *entryp, vm_map_t the_map ); +static int aio_get_all_queues_count( void ); +static int aio_get_process_count( struct proc *procp ); +static aio_workq_entry * aio_get_some_work( void ); +static boolean_t aio_last_group_io( aio_workq_entry *entryp ); +static void aio_mark_requests( aio_workq_entry *entryp ); +static int aio_queue_async_request( struct proc *procp, + struct aiocb *aiocbp, + int kindOfIO ); +static int aio_validate( aio_workq_entry *entryp ); +static void aio_work_thread( void ); +static int do_aio_cancel( struct proc *p, + int fd, + struct aiocb *aiocbp, + boolean_t wait_for_completion, + boolean_t disable_notification ); +static void do_aio_completion( aio_workq_entry *entryp ); +static int do_aio_fsync( aio_workq_entry *entryp ); +static int
do_aio_read( aio_workq_entry *entryp ); +static int do_aio_write( aio_workq_entry *entryp ); +static boolean_t is_already_queued( struct proc *procp, + struct aiocb *aiocbp ); +static int lio_create_async_entry( struct proc *procp, + struct aiocb *aiocbp, + struct sigevent *sigp, + long group_tag, + aio_workq_entry **entrypp ); +static int lio_create_sync_entry( struct proc *procp, + struct aiocb *aiocbp, + long group_tag, + aio_workq_entry **entrypp ); + +/* + * EXTERNAL PROTOTYPES + */ + +/* in ...bsd/kern/sys_generic.c */ +extern struct file* holdfp( struct filedesc* fdp, int fd, int flag ); +extern int dofileread( struct proc *p, struct file *fp, int fd, + void *buf, size_t nbyte, off_t offset, + int flags, int *retval ); +extern int dofilewrite( struct proc *p, struct file *fp, int fd, + const void *buf, size_t nbyte, off_t offset, + int flags, int *retval ); +extern vm_map_t vm_map_switch( vm_map_t map ); + + +/* + * aio external global variables. + */ +extern int aio_max_requests; /* AIO_MAX - configurable */ +extern int aio_max_requests_per_process; /* AIO_PROCESS_MAX - configurable */ +extern int aio_worker_threads; /* AIO_THREAD_COUNT - configurable */ + + +/* + * aio static variables. 
+ */ +static aio_anchor_cb aio_anchor; +static simple_lock_data_t aio_lock; +static struct zone *aio_workq_zonep; + + +/* + * syscall input parameters + */ +#ifndef _SYS_SYSPROTO_H_ + +struct aio_cancel_args { + int fd; + struct aiocb *aiocbp; +}; + +struct aio_error_args { + struct aiocb *aiocbp; +}; + +struct aio_fsync_args { + int op; + struct aiocb *aiocbp; +}; + +struct aio_read_args { + struct aiocb *aiocbp; +}; + +struct aio_return_args { + struct aiocb *aiocbp; +}; + +struct aio_suspend_args { + struct aiocb *const *aiocblist; + int nent; + const struct timespec *timeoutp; +}; + +struct aio_write_args { + struct aiocb *aiocbp; +}; + +struct lio_listio_args { + int mode; + struct aiocb *const *aiocblist; + int nent; + struct sigevent *sigp; +}; + +#endif /* _SYS_SYSPROTO_H_ */ + + +/* + * aio_cancel - attempt to cancel one or more async IO requests currently + * outstanding against file descriptor uap->fd. If uap->aiocbp is not + * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp + * is NULL then all outstanding async IO request for the given file + * descriptor are cancelled (if possible). + */ + +int +aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval ) +{ + struct aiocb my_aiocb; + int result; + boolean_t funnel_state; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START, + (int)p, (int)uap->aiocbp, 0, 0, 0 ); + + /* quick check to see if there are any async IO requests queued up */ + AIO_LOCK; + result = aio_get_all_queues_count( ); + AIO_UNLOCK; + if ( result < 1 ) { + result = EBADF; + goto ExitRoutine; + } + + *retval = -1; + if ( uap->aiocbp != NULL ) { + result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) ); + if ( result != 0 ) { + result = EAGAIN; + goto ExitRoutine; + } + + /* NOTE - POSIX standard says a mismatch between the file */ + /* descriptor passed in and the file descriptor embedded in */ + /* the aiocb causes unspecified results. 
We return EBADF in */ + /* that situation. */ + if ( uap->fd != my_aiocb.aio_fildes ) { + result = EBADF; + goto ExitRoutine; + } + } + + /* current BSD code assumes funnel lock is held */ + funnel_state = thread_funnel_set( kernel_flock, TRUE ); + result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE ); + (void) thread_funnel_set( kernel_flock, funnel_state ); + + if ( result != -1 ) { + *retval = result; + result = 0; + goto ExitRoutine; + } + + result = EBADF; + +ExitRoutine: + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END, + (int)p, (int)uap->aiocbp, result, 0, 0 ); + + return( result ); + +} /* aio_cancel */ + + +/* + * _aio_close - internal function used to clean up async IO requests for + * a file descriptor that is closing. + * NOTE - kernel funnel lock is held when we get called. + * THIS MAY BLOCK. + */ + +__private_extern__ void +_aio_close( struct proc *p, int fd ) +{ + int error, count; + + /* quick check to see if there are any async IO requests queued up */ + AIO_LOCK; + count = aio_get_all_queues_count( ); + AIO_UNLOCK; + if ( count < 1 ) + return; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START, + (int)p, fd, 0, 0, 0 ); + + /* cancel all async IO requests on our todo queues for this file descriptor */ + error = do_aio_cancel( p, fd, NULL, TRUE, FALSE ); + if ( error == AIO_NOTCANCELED ) { + /* + * AIO_NOTCANCELED is returned when we find an aio request for this process + * and file descriptor on the active async IO queue. Active requests cannot + * be cancelled so we must wait for them to complete. We will get a special + * wake up call on our channel used to sleep for ALL active requests to + * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used + * when we must wait for all active aio requests. 
+ */ + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE, + (int)p, fd, 0, 0, 0 ); + + tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 ); + } + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END, + (int)p, fd, 0, 0, 0 ); + + return; + +} /* _aio_close */ + + +/* + * aio_error - return the error status associated with the async IO + * request referred to by uap->aiocbp. The error status is the errno + * value that would be set by the corresponding IO request (read, write, + * fdatasync, or sync). + */ + +int +aio_error( struct proc *p, struct aio_error_args *uap, int *retval ) +{ + aio_workq_entry *entryp; + int error; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START, + (int)p, (int)uap->aiocbp, 0, 0, 0 ); + + AIO_LOCK; + + /* quick check to see if there are any async IO requests queued up */ + if ( aio_get_all_queues_count( ) < 1 ) { + error = EINVAL; + goto ExitRoutine; + } + + /* look for a match on our queue of async IO requests that have completed */ + TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) { + if ( entryp->uaiocbp == uap->aiocbp ) { + *retval = entryp->errorval; + error = 0; + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE, + (int)p, (int)uap->aiocbp, *retval, 0, 0 ); + goto ExitRoutine; + } + } + + /* look for a match on our queue of active async IO requests */ + TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) { + if ( entryp->uaiocbp == uap->aiocbp ) { + *retval = EINPROGRESS; + error = 0; + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE, + (int)p, (int)uap->aiocbp, *retval, 0, 0 ); + goto ExitRoutine; + } + } + + /* look for a match on our queue of todo work */ + TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) { + if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) { + *retval = EINPROGRESS; + error = 0; + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) |
DBG_FUNC_NONE, + (int)p, (int)uap->aiocbp, *retval, 0, 0 ); + goto ExitRoutine; + } + } + error = EINVAL; + +ExitRoutine: + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END, + (int)p, (int)uap->aiocbp, error, 0, 0 ); + AIO_UNLOCK; + + return( error ); + +} /* aio_error */ + + +/* + * aio_fsync - asynchronously force all IO operations associated + * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and + * queued at the time of the call to the synchronized completion state. + * NOTE - we do not support op O_DSYNC at this point since we do not support the + * fdatasync() call. + */ + +int +aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval ) +{ + int error; + int fsync_kind; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START, + (int)p, (int)uap->aiocbp, uap->op, 0, 0 ); + + *retval = 0; + if ( uap->op == O_SYNC ) + fsync_kind = AIO_FSYNC; +#if 0 // we don't support fdatasync() call yet + else if ( uap->op == O_DSYNC ) + fsync_kind = AIO_DSYNC; +#endif + else { + *retval = -1; + error = EINVAL; + goto ExitRoutine; + } + + error = aio_queue_async_request( p, uap->aiocbp, fsync_kind ); + if ( error != 0 ) + *retval = -1; + +ExitRoutine: + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END, + (int)p, (int)uap->aiocbp, error, 0, 0 ); + + return( error ); + +} /* aio_fsync */ + + +/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the + * file descriptor (uap->aiocbp->aio_fildes) into the buffer + * (uap->aiocbp->aio_buf). 
+ */ + +int +aio_read( struct proc *p, struct aio_read_args *uap, int *retval ) +{ + int error; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START, + (int)p, (int)uap->aiocbp, 0, 0, 0 ); + + *retval = 0; + + error = aio_queue_async_request( p, uap->aiocbp, AIO_READ ); + if ( error != 0 ) + *retval = -1; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END, + (int)p, (int)uap->aiocbp, error, 0, 0 ); + + return( error ); + +} /* aio_read */ + + +/* + * aio_return - return the return status associated with the async IO + * request referred to by uap->aiocbp. The return status is the value + * that would be returned by corresponding IO request (read, write, + * fdatasync, or sync). This is where we release kernel resources + * held for async IO call associated with the given aiocb pointer. + */ + +int +aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval ) +{ + aio_workq_entry *entryp; + int error; + boolean_t lock_held; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START, + (int)p, (int)uap->aiocbp, 0, 0, 0 ); + + AIO_LOCK; + lock_held = TRUE; + *retval = 0; + + /* quick check to see if there are any async IO requests queued up */ + if ( aio_get_all_queues_count( ) < 1 ) { + error = EINVAL; + goto ExitRoutine; + } + + /* look for a match on our queue of async IO requests that have completed */ + TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) { + if ( entryp->uaiocbp == uap->aiocbp ) { + TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link ); + aio_anchor.aio_done_count--; + p->aio_done_count--; + + *retval = entryp->returnval; + + /* we cannot free requests that are still completing */ + if ( (entryp->flags & AIO_COMPLETION) == 0 ) { + vm_map_t my_map; + + my_map = entryp->aio_map; + entryp->aio_map = VM_MAP_NULL; + AIO_UNLOCK; + lock_held = FALSE; + aio_free_request( entryp, my_map ); + } + else + /* tell completion code to free this request */ + entryp->flags |= AIO_DO_FREE; +
error = 0; + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE, + (int)p, (int)uap->aiocbp, *retval, 0, 0 ); + goto ExitRoutine; + } + } + + /* look for a match on our queue of active async IO requests */ + TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) { + if ( entryp->uaiocbp == uap->aiocbp ) { + error = EINPROGRESS; + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE, + (int)p, (int)uap->aiocbp, *retval, 0, 0 ); + goto ExitRoutine; + } + } + + /* look for a match on our queue of todo work */ + TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) { + if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) { + error = EINPROGRESS; + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE, + (int)p, (int)uap->aiocbp, *retval, 0, 0 ); + goto ExitRoutine; + } + } + error = EINVAL; + +ExitRoutine: + if ( lock_held ) + AIO_UNLOCK; + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END, + (int)p, (int)uap->aiocbp, error, 0, 0 ); + + return( error ); + +} /* aio_return */ + + +/* + * _aio_exec - internal function used to clean up async IO requests for + * a process that is going away due to exec(). We cancel any async IOs + * we can and wait for those already active. We also disable signaling + * for cancelled or active aio requests that complete. + * NOTE - kernel funnel lock is held when we get called. + * This routine MAY block! + */ + +__private_extern__ void +_aio_exec( struct proc *p ) +{ + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START, + (int)p, 0, 0, 0, 0 ); + + _aio_exit( p ); + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END, + (int)p, 0, 0, 0, 0 ); + + return; + +} /* _aio_exec */ + + +/* + * _aio_exit - internal function used to clean up async IO requests for + * a process that is terminating (via exit() or exec() ). We cancel any async IOs + * we can and wait for those already active. 
We also disable signaling + * for cancelled or active aio requests that complete. This routine MAY block! + * NOTE - kernel funnel lock is held when we get called. + */ + +__private_extern__ void +_aio_exit( struct proc *p ) +{ + int error, count; + aio_workq_entry *entryp; + + /* quick check to see if there are any async IO requests queued up */ + AIO_LOCK; + count = aio_get_all_queues_count( ); + AIO_UNLOCK; + if ( count < 1 ) { + return; + } + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START, + (int)p, 0, 0, 0, 0 ); + + /* + * cancel async IO requests on the todo work queue and wait for those + * already active to complete. + */ + error = do_aio_cancel( p, 0, NULL, TRUE, TRUE ); + if ( error == AIO_NOTCANCELED ) { + /* + * AIO_NOTCANCELED is returned when we find an aio request for this process + * on the active async IO queue. Active requests cannot be cancelled so we + * must wait for them to complete. We will get a special wake up call on + * our channel used to sleep for ALL active requests to complete. This sleep + * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all + * active aio requests. 
+ */ + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE, + (int)p, 0, 0, 0, 0 ); + + tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 ); + } + + /* release all aio resources used by this process */ + AIO_LOCK; + entryp = TAILQ_FIRST( &p->aio_doneq ); + while ( entryp != NULL ) { + aio_workq_entry *next_entryp; + + next_entryp = TAILQ_NEXT( entryp, aio_workq_link ); + TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link ); + aio_anchor.aio_done_count--; + p->aio_done_count--; + + /* we cannot free requests that are still completing */ + if ( (entryp->flags & AIO_COMPLETION) == 0 ) { + vm_map_t my_map; + + my_map = entryp->aio_map; + entryp->aio_map = VM_MAP_NULL; + AIO_UNLOCK; + aio_free_request( entryp, my_map ); + + /* need to start over since aio_doneq may have been */ + /* changed while we were away. */ + AIO_LOCK; + entryp = TAILQ_FIRST( &p->aio_doneq ); + continue; + } + else + /* tell completion code to free this request */ + entryp->flags |= AIO_DO_FREE; + entryp = next_entryp; + } + AIO_UNLOCK; + +ExitRoutine: + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END, + (int)p, 0, 0, 0, 0 ); + + return; + +} /* _aio_exit */ + + +/* + * do_aio_cancel - cancel async IO requests (if possible). We get called by + * aio_cancel, close, and at exit. + * There are three modes of operation: 1) cancel all async IOs for a process - + * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd + * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given + * aiocbp. + * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all + * target async IO requests, AIO_NOTCANCELED if we could not cancel all + * target async IO requests, and AIO_ALLDONE if all target async IO requests + * were already complete. 
+ * WARNING - do not dereference aiocbp in this routine, it may point to user + * land data that has not been copied in (when called from aio_cancel() ) + * NOTE - kernel funnel lock is held when we get called. + */ + +static int +do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp, + boolean_t wait_for_completion, boolean_t disable_notification ) +{ + aio_workq_entry *entryp; + int result; + + result = -1; + + /* look for a match on our queue of async todo work. */ + AIO_LOCK; + entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq ); + while ( entryp != NULL ) { + aio_workq_entry *next_entryp; + + next_entryp = TAILQ_NEXT( entryp, aio_workq_link ); + if ( p == entryp->procp ) { + if ( (aiocbp == NULL && fd == 0) || + (aiocbp != NULL && entryp->uaiocbp == aiocbp) || + (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) { + /* we found a match so we remove the entry from the */ + /* todo work queue and place it on the done queue */ + TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link ); + aio_anchor.aio_async_workq_count--; + entryp->errorval = ECANCELED; + entryp->returnval = -1; + if ( disable_notification ) + entryp->flags |= AIO_DISABLE; /* flag for special completion processing */ + result = AIO_CANCELED; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 ); + + TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link ); + aio_anchor.aio_done_count++; + p->aio_done_count++; + entryp->flags |= AIO_COMPLETION; + AIO_UNLOCK; + + /* do completion processing for this request */ + do_aio_completion( entryp ); + + AIO_LOCK; + entryp->flags &= ~AIO_COMPLETION; + if ( (entryp->flags & AIO_DO_FREE) != 0 ) { + vm_map_t my_map; + + my_map = entryp->aio_map; + entryp->aio_map = VM_MAP_NULL; + AIO_UNLOCK; + aio_free_request( entryp, my_map ); + } + else + AIO_UNLOCK; + + if ( aiocbp != NULL ) { + return( result ); + } + + /* need to start over since aio_async_workq may
have been */ + /* changed while we were away doing completion processing. */ + AIO_LOCK; + entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq ); + continue; + } + } + entryp = next_entryp; + } /* while... */ + + /* + * look for a match on our queue of synchronous todo work. This will + * be a rare occurrence but could happen if a process is terminated while + * processing a lio_listio call. + */ + entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq ); + while ( entryp != NULL ) { + aio_workq_entry *next_entryp; + + next_entryp = TAILQ_NEXT( entryp, aio_workq_link ); + if ( p == entryp->procp ) { + if ( (aiocbp == NULL && fd == 0) || + (aiocbp != NULL && entryp->uaiocbp == aiocbp) || + (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) { + /* we found a match so we remove the entry from the */ + /* todo work queue and place it on the done queue */ + TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link ); + aio_anchor.lio_sync_workq_count--; + entryp->errorval = ECANCELED; + entryp->returnval = -1; + if ( disable_notification ) + entryp->flags |= AIO_DISABLE; /* flag for special completion processing */ + result = AIO_CANCELED; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 ); + + TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link ); + aio_anchor.aio_done_count++; + p->aio_done_count++; + if ( aiocbp != NULL ) { + AIO_UNLOCK; + return( result ); + } + } + } + entryp = next_entryp; + } /* while... */ + + /* + * look for a match on our queue of active async IO requests and + * return AIO_NOTCANCELED result. 
+ */ + TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) { + if ( (aiocbp == NULL && fd == 0) || + (aiocbp != NULL && entryp->uaiocbp == aiocbp) || + (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) { + result = AIO_NOTCANCELED; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 ); + + if ( wait_for_completion ) + entryp->flags |= AIO_WAITING; /* flag for special completion processing */ + if ( disable_notification ) + entryp->flags |= AIO_DISABLE; /* flag for special completion processing */ + if ( aiocbp != NULL ) { + AIO_UNLOCK; + return( result ); + } + } + } + + /* + * if we didn't find any matches on the todo or active queues then look for a + * match on our queue of async IO requests that have completed and if found + * return AIO_ALLDONE result. + */ + if ( result == -1 ) { + TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) { + if ( (aiocbp == NULL && fd == 0) || + (aiocbp != NULL && entryp->uaiocbp == aiocbp) || + (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) { + result = AIO_ALLDONE; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 ); + + if ( aiocbp != NULL ) { + AIO_UNLOCK; + return( result ); + } + } + } + } + AIO_UNLOCK; + + return( result ); + +} /* do_aio_cancel */ + + +/* + * aio_suspend - suspend the calling thread until at least one of the async + * IO operations referenced by uap->aiocblist has completed, until a signal + * interrupts the function, or uap->timeoutp time interval (optional) has + * passed. + * Returns 0 if one or more async IOs have completed else -1 and errno is + * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt + * woke us up. 
+ */ + +int +aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval ) +{ + int error; + int i, count; + uint64_t abstime; + struct timespec ts; + struct timeval tv; + aio_workq_entry *entryp; + struct aiocb * *aiocbpp; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START, + (int)p, uap->nent, 0, 0, 0 ); + + *retval = -1; + abstime = 0; + aiocbpp = NULL; + + /* quick check to see if there are any async IO requests queued up */ + AIO_LOCK; + count = aio_get_all_queues_count( ); + AIO_UNLOCK; + if ( count < 1 ) { + error = EINVAL; + goto ExitThisRoutine; + } + + if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) { + error = EINVAL; + goto ExitThisRoutine; + } + + if ( uap->timeoutp != NULL ) { + error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) ); + if ( error != 0 ) { + error = EAGAIN; + goto ExitThisRoutine; + } + + if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) { + error = EINVAL; + goto ExitThisRoutine; + } + + nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec, + &abstime ); + clock_absolutetime_interval_to_deadline( abstime, &abstime ); + } + + MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK ); + if ( aiocbpp == NULL ) { + error = EAGAIN; + goto ExitThisRoutine; + } + + /* check list of aio requests to see if any have completed */ + for ( i = 0; i < uap->nent; i++ ) { + struct aiocb *aiocbp; + + /* copyin in aiocb pointer from list */ + error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(aiocbp) ); + if ( error != 0 ) { + error = EAGAIN; + goto ExitThisRoutine; + } + + /* NULL elements are legal so check for 'em */ + aiocbp = *(aiocbpp + i); + if ( aiocbp == NULL ) + continue; + + /* return immediately if any aio request in the list is done */ + AIO_LOCK; + TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) { + if ( entryp->uaiocbp == aiocbp ) { + *retval = 0; + error = 0; + AIO_UNLOCK; + goto ExitThisRoutine; + } + } + AIO_UNLOCK; + } /* 
for ( ; i < uap->nent; ) */ + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE, + (int)p, uap->nent, 0, 0, 0 ); + + /* + * wait for an async IO to complete or a signal fires or timeout expires. + * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal + * interrupts us. If an async IO completes before a signal fires or our + * timeout expires, we get a wakeup call from aio_work_thread(). We do not + * use tsleep() here in order to avoid getting kernel funnel lock. + */ + assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE ); + if ( abstime > 0 ) { + thread_set_timer_deadline( abstime ); + } + error = thread_block( THREAD_CONTINUE_NULL ); + if ( error == THREAD_AWAKENED ) { + /* got our wakeup call from aio_work_thread() */ + if ( abstime > 0 ) { + thread_cancel_timer(); + } + *retval = 0; + error = 0; + } + else if ( error == THREAD_TIMED_OUT ) { + /* our timeout expired */ + error = EAGAIN; + } + else { + /* we were interrupted */ + if ( abstime > 0 ) { + thread_cancel_timer(); + } + error = EINTR; + } + +ExitThisRoutine: + if ( aiocbpp != NULL ) + FREE( aiocbpp, M_TEMP ); + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END, + (int)p, uap->nent, error, 0, 0 ); + + return( error ); + +} /* aio_suspend */ + + +/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the + * file descriptor (uap->aiocbp->aio_fildes) from the buffer + * (uap->aiocbp->aio_buf). 
+ */ + +int +aio_write( struct proc *p, struct aio_write_args *uap, int *retval ) +{ + int error; + + *retval = 0; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START, + (int)p, (int)uap->aiocbp, 0, 0, 0 ); + + error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE ); + if ( error != 0 ) + *retval = -1; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END, + (int)p, (int)uap->aiocbp, error, 0, 0 ); + + return( error ); + +} /* aio_write */ + + +/* + * lio_listio - initiate a list of IO requests. We process the list of aiocbs + * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT). + * The caller gets error and return status for each aiocb in the list via aio_error + * and aio_return. We must keep completed requests until released by the + * aio_return call. + */ + +int +lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval ) +{ + int i; + int call_result; + int result; + long group_tag; + aio_workq_entry * *entryp_listp; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START, + (int)p, uap->nent, uap->mode, 0, 0 ); + + entryp_listp = NULL; + call_result = -1; + *retval = -1; + if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) { + call_result = EINVAL; + goto ExitRoutine; + } + + if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) { + call_result = EINVAL; + goto ExitRoutine; + } + + /* + * we use group_tag to mark IO requests for delayed completion processing + * which means we wait until all IO requests in the group have completed + * before we either return to the caller when mode is LIO_WAIT or signal + * user when mode is LIO_NOWAIT. + */ + group_tag = random(); + + /* + * allocate a list of aio_workq_entry pointers that we will use to queue + * up all our requests at once while holding our lock. 
+ */ + MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK ); + if ( entryp_listp == NULL ) { + call_result = EAGAIN; + goto ExitRoutine; + } + + /* process list of aio requests */ + for ( i = 0; i < uap->nent; i++ ) { + struct aiocb *my_aiocbp; + + *(entryp_listp + i) = NULL; + + /* copyin in aiocb pointer from list */ + result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) ); + if ( result != 0 ) { + call_result = EAGAIN; + continue; + } + + /* NULL elements are legal so check for 'em */ + if ( my_aiocbp == NULL ) + continue; + + if ( uap->mode == LIO_NOWAIT ) + result = lio_create_async_entry( p, my_aiocbp, uap->sigp, + group_tag, (entryp_listp + i) ); + else + result = lio_create_sync_entry( p, my_aiocbp, group_tag, + (entryp_listp + i) ); + + if ( result != 0 && call_result == -1 ) + call_result = result; + } + + /* + * we need to protect this section since we do not want any of these grouped + * IO requests to begin until we have them all on the queue. 
+ */ + AIO_LOCK; + for ( i = 0; i < uap->nent; i++ ) { + aio_workq_entry *entryp; + + /* NULL elements are legal so check for 'em */ + entryp = *(entryp_listp + i); + if ( entryp == NULL ) + continue; + + /* check our aio limits to throttle bad or rude user land behavior */ + if ( aio_get_all_queues_count( ) >= aio_max_requests || + aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process || + is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) { + vm_map_t my_map; + + my_map = entryp->aio_map; + entryp->aio_map = VM_MAP_NULL; + result = EAGAIN; + AIO_UNLOCK; + aio_free_request( entryp, my_map ); + AIO_LOCK; + continue; + } + + /* place the request on the appropriate queue */ + if ( uap->mode == LIO_NOWAIT ) { + TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link ); + aio_anchor.aio_async_workq_count++; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE, + (int)p, (int)entryp->uaiocbp, 0, 0, 0 ); + } + else { + TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link ); + aio_anchor.lio_sync_workq_count++; + } + } + AIO_UNLOCK; + + if ( uap->mode == LIO_NOWAIT ) + /* caller does not want to wait so we'll fire off a worker thread and return */ + wakeup_one( &aio_anchor.aio_async_workq ); + else { + aio_workq_entry *entryp; + int error; + + /* + * mode is LIO_WAIT - handle the IO requests now. 
+ */ + AIO_LOCK; + entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq ); + while ( entryp != NULL ) { + if ( p == entryp->procp && group_tag == entryp->group_tag ) { + boolean_t funnel_state; + + TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link ); + aio_anchor.lio_sync_workq_count--; + AIO_UNLOCK; + + // file system IO code path requires kernel funnel lock + funnel_state = thread_funnel_set( kernel_flock, TRUE ); + if ( (entryp->flags & AIO_READ) != 0 ) { + error = do_aio_read( entryp ); + } + else if ( (entryp->flags & AIO_WRITE) != 0 ) { + error = do_aio_write( entryp ); + } + else if ( (entryp->flags & AIO_FSYNC) != 0 ) { + error = do_aio_fsync( entryp ); + } + else { + printf( "%s - unknown aio request - flags 0x%02X \n", + __FUNCTION__, entryp->flags ); + error = EINVAL; + } + entryp->errorval = error; + if ( error != 0 && call_result == -1 ) + call_result = EIO; + (void) thread_funnel_set( kernel_flock, funnel_state ); + + AIO_LOCK; + /* we're done with the IO request so move it on the done queue */ + TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link ); + aio_anchor.aio_done_count++; + p->aio_done_count++; + + /* need to start over since lio_sync_workq may have been changed while we */ + /* were away doing the IO. */ + entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq ); + continue; + } /* p == entryp->procp */ + + entryp = TAILQ_NEXT( entryp, aio_workq_link ); + } /* while ( entryp != NULL ) */ + AIO_UNLOCK; + } /* uap->mode == LIO_WAIT */ + + /* call_result == -1 means we had no trouble queueing up requests */ + if ( call_result == -1 ) { + call_result = 0; + *retval = 0; + } + +ExitRoutine: + if ( entryp_listp != NULL ) + FREE( entryp_listp, M_TEMP ); + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END, + (int)p, call_result, 0, 0, 0 ); + + return( call_result ); + +} /* lio_listio */ + + +/* + * aio worker thread. this is where all the real work gets done. 
+ * we get a wake up call on sleep channel &aio_anchor.aio_async_workq + * after new work is queued up. + */ + +static void +aio_work_thread( void ) +{ + aio_workq_entry *entryp; + struct uthread *uthread = (struct uthread *)get_bsdthread_info(current_act()); + + for( ;; ) { + entryp = aio_get_some_work(); + if ( entryp == NULL ) { + /* + * aio worker threads wait for some work to get queued up + * by aio_queue_async_request. Once some work gets queued + * it will wake up one of these worker threads just before + * returning to our caller in user land. We do not use + * tsleep() here in order to avoid getting kernel funnel lock. + */ + assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT ); + thread_block( THREAD_CONTINUE_NULL ); + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE, + 0, 0, 0, 0, 0 ); + } + else { + int error; + boolean_t funnel_state; + vm_map_t currentmap; + vm_map_t oldmap = VM_MAP_NULL; + task_t oldaiotask = TASK_NULL; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START, + (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 ); + + /* + * Assume the target's address space identity for the duration + * of the IO. 
+ */ + funnel_state = thread_funnel_set( kernel_flock, TRUE ); + + currentmap = get_task_map( (current_proc())->task ); + if ( currentmap != entryp->aio_map ) { + oldaiotask = uthread->uu_aio_task; + uthread->uu_aio_task = entryp->procp->task; + oldmap = vm_map_switch( entryp->aio_map ); + } + + if ( (entryp->flags & AIO_READ) != 0 ) { + error = do_aio_read( entryp ); + } + else if ( (entryp->flags & AIO_WRITE) != 0 ) { + error = do_aio_write( entryp ); + } + else if ( (entryp->flags & AIO_FSYNC) != 0 ) { + error = do_aio_fsync( entryp ); + } + else { + printf( "%s - unknown aio request - flags 0x%02X \n", + __FUNCTION__, entryp->flags ); + error = EINVAL; + } + entryp->errorval = error; + if ( currentmap != entryp->aio_map ) { + (void) vm_map_switch( oldmap ); + uthread->uu_aio_task = oldaiotask; + } + + /* we're done with the IO request so pop it off the active queue and */ + /* push it on the done queue */ + AIO_LOCK; + TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link ); + aio_anchor.aio_active_count--; + entryp->procp->aio_active_count--; + TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link ); + aio_anchor.aio_done_count++; + entryp->procp->aio_done_count++; + entryp->flags |= AIO_COMPLETION; + + /* remove our reference to the user land map. 
*/ + if ( VM_MAP_NULL != entryp->aio_map ) { + vm_map_t my_map; + + my_map = entryp->aio_map; + entryp->aio_map = VM_MAP_NULL; + AIO_UNLOCK; /* must unlock before calling vm_map_deallocate() */ + vm_map_deallocate( my_map ); + } + else { + AIO_UNLOCK; + } + + do_aio_completion( entryp ); + (void) thread_funnel_set( kernel_flock, funnel_state ); + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END, + (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval, + entryp->returnval, 0 ); + + AIO_LOCK; + entryp->flags &= ~AIO_COMPLETION; + if ( (entryp->flags & AIO_DO_FREE) != 0 ) { + vm_map_t my_map; + + my_map = entryp->aio_map; + entryp->aio_map = VM_MAP_NULL; + AIO_UNLOCK; + aio_free_request( entryp, my_map ); + } + else + AIO_UNLOCK; + } + } /* for ( ;; ) */ + + /* NOT REACHED */ + +} /* aio_work_thread */ + + +/* + * aio_get_some_work - get the next async IO request that is ready to be executed. + * aio_fsync complicates matters a bit since we cannot do the fsync until all async + * IO requests at the time the aio_fsync call came in have completed. + */ + +static aio_workq_entry * +aio_get_some_work( void ) +{ + aio_workq_entry *entryp; + int skip_count = 0; + + /* pop some work off the work queue and add to our active queue */ + AIO_LOCK; + for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq ); + entryp != NULL; + entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) { + + if ( (entryp->flags & AIO_FSYNC) != 0 ) { + /* leave aio_fsync calls on the work queue if there are IO */ + /* requests on the active queue for the same file descriptor. 
*/ + if ( aio_delay_fsync_request( entryp ) ) { + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 ); + continue; + } + } + break; + } + + if ( entryp != NULL ) { + TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link ); + aio_anchor.aio_async_workq_count--; + TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link ); + aio_anchor.aio_active_count++; + entryp->procp->aio_active_count++; + } + AIO_UNLOCK; + + return( entryp ); + +} /* aio_get_some_work */ + + +/* + * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at + * this time. Delay will happen when there are any active IOs for the same file + * descriptor that were queued at time the aio_sync call was queued. + * NOTE - AIO_LOCK must be held by caller + */ +static boolean_t +aio_delay_fsync_request( aio_workq_entry *entryp ) +{ + aio_workq_entry *my_entryp; + + TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) { + if ( my_entryp->fsyncp != NULL && + entryp->uaiocbp == my_entryp->fsyncp && + entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) { + return( TRUE ); + } + } + + return( FALSE ); + +} /* aio_delay_fsync_request */ + + +/* + * aio_queue_async_request - queue up an async IO request on our work queue then + * wake up one of our worker threads to do the actual work. We get a reference + * to our caller's user land map in order to keep it around while we are + * processing the request. 
+ */
+
+static int
+aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
+{
+	aio_workq_entry		*new_entryp;
+	int					err;
+
+	/* nothing to clean up yet if we cannot even get an entry */
+	new_entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
+	if ( new_entryp == NULL )
+		return( EAGAIN );
+	bzero( new_entryp, sizeof(*new_entryp) );
+
+	/* describe the request; the user's aiocb is copied into the entry */
+	new_entryp->procp = procp;
+	new_entryp->uaiocbp = aiocbp;
+	new_entryp->flags |= kindOfIO;
+	new_entryp->aio_map = VM_MAP_NULL;
+	if ( copyin( aiocbp, &new_entryp->aiocb, sizeof(new_entryp->aiocb) ) != 0 ) {
+		err = EAGAIN;
+		goto free_and_fail;
+	}
+
+	/* sanity check the aiocb and the file descriptor it names */
+	err = aio_validate( new_entryp );
+	if ( err != 0 )
+		goto free_and_fail;
+
+	/* hold a reference on the user land map for as long as the request lives */
+	new_entryp->aio_map = get_task_map( procp->task );
+	vm_map_reference( new_entryp->aio_map );
+
+	AIO_LOCK;
+
+	/*
+	 * refuse a request for an aiocb we already know about and throttle
+	 * bad or rude user land behavior with our system wide and per process
+	 * limits.  All three cases report EAGAIN.
+	 */
+	if ( is_already_queued( new_entryp->procp, new_entryp->uaiocbp ) == TRUE ||
+		 aio_get_all_queues_count( ) >= aio_max_requests ||
+		 aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
+		AIO_UNLOCK;
+		err = EAGAIN;
+		goto free_and_fail;
+	}
+
+	/*
+	 * an aio_fsync covers every async IO request that is queued at the
+	 * time of the call.  aio_mark_requests tags each such request for the
+	 * same file descriptor with the aiocb pointer of this aio_fsync (the
+	 * fsyncp field) so we can tell which IOs have to finish first.
+	 */
+	if ( (kindOfIO & AIO_FSYNC) != 0 )
+		aio_mark_requests( new_entryp );
+
+	/* onto the tail of the asynchronous work queue it goes */
+	TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, new_entryp, aio_workq_link );
+	aio_anchor.aio_async_workq_count++;
+
+	AIO_UNLOCK;
+
+	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
+		     	  (int)procp, (int)aiocbp, 0, 0, 0 );
+
+	wakeup_one( &aio_anchor.aio_async_workq );
+
+	return( 0 );
+
+free_and_fail:
+	/* the entry never made it onto a queue so nobody else can see it and */
+	/* its aio_map can be read without the lock */
+	aio_free_request( new_entryp, new_entryp->aio_map );
+
+	return( err );
+
+} /* aio_queue_async_request */
+
+
+/*
+ * lio_create_async_entry - build the aio_workq_entry for one element of an
+ * LIO_NOWAIT lio_listio call.  On success we return 0 and hand the new entry
+ * back through entrypp; the entry holds a reference on the caller's user land
+ * map.  LIO_NOP elements also return 0 but leave *entrypp untouched.
+ * Requests created here carry group_tag so that completion notification can
+ * be put off until the whole group has completed.
+ */
+
+static int
+lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
+						 struct sigevent *sigp, long group_tag,
+						 aio_workq_entry **entrypp )
+{
+	aio_workq_entry		*new_entryp;
+	int					err;
+
+	new_entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
+	if ( new_entryp == NULL )
+		return( EAGAIN );
+	bzero( new_entryp, sizeof(*new_entryp) );
+
+	/* describe the request; the user's aiocb is copied into the entry */
+	new_entryp->procp = procp;
+	new_entryp->uaiocbp = aiocbp;
+	new_entryp->flags |= AIO_LIO;
+	new_entryp->group_tag = group_tag;
+	new_entryp->aio_map = VM_MAP_NULL;
+	if ( copyin( aiocbp, &new_entryp->aiocb, sizeof(new_entryp->aiocb) ) != 0 ) {
+		err = EAGAIN;
+		goto discard_entry;
+	}
+
+	/* LIO_NOP is not an error, there is just nothing to queue.  We still */
+	/* have to give the entry back. */
+	if ( new_entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
+		err = 0;
+		goto discard_entry;
+	}
+
+	/* the sigevent given to lio_listio replaces the one in each aiocb; the */
+	/* notification itself only happens after the last request completes. */
+	if ( sigp != NULL &&
+		 copyin( sigp, &new_entryp->aiocb.aio_sigevent,
+				 sizeof(new_entryp->aiocb.aio_sigevent) ) != 0 ) {
+		err = EAGAIN;
+		goto discard_entry;
+	}
+
+	/* sanity check the aiocb and the file descriptor it names */
+	err = aio_validate( new_entryp );
+	if ( err != 0 )
+		goto discard_entry;
+
+	/* hold a reference on the user land map for as long as the request lives */
+	new_entryp->aio_map = get_task_map( procp->task );
+	vm_map_reference( new_entryp->aio_map );
+
+	*entrypp = new_entryp;
+	return( 0 );
+
+discard_entry:
+	zfree( aio_workq_zonep, (vm_offset_t) new_entryp );
+
+	return( err );
+
+} /* lio_create_async_entry */
+
+
+/*
+ * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
+ * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
+ * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
+ * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
+ * NOTE - AIO_LOCK must be held by caller + */ + +static void +aio_mark_requests( aio_workq_entry *entryp ) +{ + aio_workq_entry *my_entryp; + + TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) { + if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) { + my_entryp->fsyncp = entryp->uaiocbp; + } + } + + TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) { + if ( entryp->procp == my_entryp->procp && + entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) { + my_entryp->fsyncp = entryp->uaiocbp; + } + } + +} /* aio_mark_requests */ + + +/* + * lio_create_sync_entry - allocate an aio_workq_entry and fill it in. + * If all goes well return 0 and pass the aio_workq_entry pointer back to + * our caller. + * lio_listio calls behave differently at completion they do completion notification + * when all async IO requests have completed. We use group_tag to tag IO requests + * that behave in the delay notification manner. + */ + +static int +lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp, + long group_tag, aio_workq_entry **entrypp ) +{ + aio_workq_entry *entryp; + int result; + + entryp = (aio_workq_entry *) zalloc( aio_workq_zonep ); + if ( entryp == NULL ) { + result = EAGAIN; + goto error_exit; + } + bzero( entryp, sizeof(*entryp) ); + + /* fill in the rest of the aio_workq_entry */ + entryp->procp = procp; + entryp->uaiocbp = aiocbp; + entryp->flags |= AIO_LIO; + entryp->group_tag = group_tag; + entryp->aio_map = VM_MAP_NULL; + result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) ); + if ( result != 0 ) { + result = EAGAIN; + goto error_exit; + } + + /* look for lio_listio LIO_NOP requests and ignore them. */ + /* Not really an error, but we need to free our aio_workq_entry. 
*/ + if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) { + result = 0; + goto error_exit; + } + + result = aio_validate( entryp ); + if ( result != 0 ) { + goto error_exit; + } + + *entrypp = entryp; + return( 0 ); + +error_exit: + if ( entryp != NULL ) + zfree( aio_workq_zonep, (vm_offset_t) entryp ); + + return( result ); + +} /* lio_create_sync_entry */ + + +/* + * aio_free_request - remove our reference on the user land map and + * free the work queue entry resources. + * We are not holding the lock here thus aio_map is passed in and + * zeroed while we did have the lock. + */ + +static int +aio_free_request( aio_workq_entry *entryp, vm_map_t the_map ) +{ + /* remove our reference to the user land map. */ + if ( VM_MAP_NULL != the_map ) { + vm_map_deallocate( the_map ); + } + + zfree( aio_workq_zonep, (vm_offset_t) entryp ); + + return( 0 ); + +} /* aio_free_request */ + + +/* aio_validate - validate the aiocb passed in by one of the aio syscalls. + */ + +static int +aio_validate( aio_workq_entry *entryp ) +{ + boolean_t funnel_state; + struct file *fp; + int flag; + int result; + + result = 0; + + if ( (entryp->flags & AIO_LIO) != 0 ) { + if ( entryp->aiocb.aio_lio_opcode == LIO_READ ) + entryp->flags |= AIO_READ; + else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE ) + entryp->flags |= AIO_WRITE; + else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) + return( 0 ); + else + return( EINVAL ); + } + + flag = FREAD; + if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) { + flag = FWRITE; + } + + if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) { + if ( entryp->aiocb.aio_offset < 0 || + entryp->aiocb.aio_nbytes < 0 || + entryp->aiocb.aio_nbytes > INT_MAX || + entryp->aiocb.aio_buf == NULL ) + return( EINVAL ); + } + + /* validate aiocb.aio_sigevent. at this point we only support sigev_notify + * equal to SIGEV_SIGNAL or SIGEV_NONE. this means sigev_value, + * sigev_notify_function, and sigev_notify_attributes are ignored. 
+ */ + if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) { + int signum; + /* make sure we have a valid signal number */ + signum = entryp->aiocb.aio_sigevent.sigev_signo; + if ( signum <= 0 || signum >= NSIG || + signum == SIGKILL || signum == SIGSTOP ) + return (EINVAL); + } + else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE ) + return (EINVAL); + + /* validate the file descriptor and that the file was opened + * for the appropriate read / write access. This section requires + * kernel funnel lock. + */ + funnel_state = thread_funnel_set( kernel_flock, TRUE ); + + result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp ); + if ( result == 0 ) { + if ( (fp->f_flag & flag) == 0 ) { + /* we don't have read or write access */ + result = EBADF; + } + else if ( fp->f_type != DTYPE_VNODE ) { + /* this is not a file */ + result = ESPIPE; + } + } + else { + result = EBADF; + } + + (void) thread_funnel_set( kernel_flock, funnel_state ); + + return( result ); + +} /* aio_validate */ + + +/* + * aio_get_process_count - runs through our queues that hold outstanding + * async IO reqests and totals up number of requests for the given + * process. + * NOTE - caller must hold aio lock! 
+ */ + +static int +aio_get_process_count( struct proc *procp ) +{ + aio_workq_entry *entryp; + int error; + int count; + + /* begin with count of completed async IO requests for this process */ + count = procp->aio_done_count; + + /* add in count of active async IO requests for this process */ + count += procp->aio_active_count; + + /* look for matches on our queue of asynchronous todo work */ + TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) { + if ( procp == entryp->procp ) { + count++; + } + } + + /* look for matches on our queue of synchronous todo work */ + TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) { + if ( procp == entryp->procp ) { + count++; + } + } + + return( count ); + +} /* aio_get_process_count */ + + +/* + * aio_get_all_queues_count - get total number of entries on all aio work queues. + * NOTE - caller must hold aio lock! + */ + +static int +aio_get_all_queues_count( void ) +{ + int count; + + count = aio_anchor.aio_async_workq_count; + count += aio_anchor.lio_sync_workq_count; + count += aio_anchor.aio_active_count; + count += aio_anchor.aio_done_count; + + return( count ); + +} /* aio_get_all_queues_count */ + + +/* + * do_aio_completion. Handle async IO completion. + */ + +static void +do_aio_completion( aio_workq_entry *entryp ) +{ + /* signal user land process if appropriate */ + if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL && + (entryp->flags & AIO_DISABLE) == 0 ) { + + /* + * if group_tag is non zero then make sure this is the last IO request + * in the group before we signal. 
+ */ + if ( entryp->group_tag == 0 || + (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) { + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, + entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 ); + + psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo ); + return; + } + } + + /* + * need to handle case where a process is trying to exit, exec, or close + * and is currently waiting for active aio requests to complete. If + * AIO_WAITING is set then we need to look to see if there are any + * other requests in the active queue for this process. If there are + * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel. If + * there are some still active then do nothing - we only want to wakeup + * when all active aio requests for the process are complete. + */ + if ( (entryp->flags & AIO_WAITING) != 0 ) { + int active_requests; + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 ); + + AIO_LOCK; + active_requests = aio_active_requests_for_process( entryp->procp ); + AIO_UNLOCK; + if ( active_requests < 1 ) { + /* no active aio requests for this process, continue exiting */ + + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 ); + + wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN ); + } + return; + } + + /* + * aio_suspend case when a signal was not requested. In that scenario we + * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel. + * NOTE - the assumption here is that this wakeup call is inexpensive. + * we really only need to do this when an aio_suspend call is pending. + * If we find the wakeup call should be avoided we could mark the + * async IO requests given in the list provided by aio_suspend and only + * call wakeup for them. 
If we do mark them we should unmark them after + * the aio_suspend wakes up. + */ + KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE, + (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 ); + + wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN ); + + return; + +} /* do_aio_completion */ + + +/* + * aio_last_group_io - checks to see if this is the last unfinished IO request + * for the given group_tag. Returns TRUE if there are no other active IO + * requests for this group or FALSE if the are active IO requests + * NOTE - AIO_LOCK must be held by caller + */ + +static boolean_t +aio_last_group_io( aio_workq_entry *entryp ) +{ + aio_workq_entry *my_entryp; + + /* look for matches on our queue of active async IO requests */ + TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) { + if ( my_entryp->group_tag == entryp->group_tag ) + return( FALSE ); + } + + /* look for matches on our queue of asynchronous todo work */ + TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) { + if ( my_entryp->group_tag == entryp->group_tag ) + return( FALSE ); + } + + /* look for matches on our queue of synchronous todo work */ + TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) { + if ( my_entryp->group_tag == entryp->group_tag ) + return( FALSE ); + } + + return( TRUE ); + +} /* aio_last_group_io */ + + +/* + * do_aio_read + */ +static int +do_aio_read( aio_workq_entry *entryp ) +{ + struct file *fp; + int error; + + fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD ); + if ( fp != NULL ) { + error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes, + (void *)entryp->aiocb.aio_buf, + entryp->aiocb.aio_nbytes, + entryp->aiocb.aio_offset, FOF_OFFSET, + &entryp->returnval ); + frele( fp ); + } + else + error = EBADF; + + return( error ); + +} /* do_aio_read */ + + +/* + * do_aio_write + */ +static int +do_aio_write( aio_workq_entry *entryp ) +{ + struct file *fp; + int 
error; + + fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE ); + if ( fp != NULL ) { + error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes, + (const void *)entryp->aiocb.aio_buf, + entryp->aiocb.aio_nbytes, + entryp->aiocb.aio_offset, FOF_OFFSET, + &entryp->returnval ); + frele( fp ); + } + else + error = EBADF; + + return( error ); + +} /* do_aio_write */ + + +/* + * aio_active_requests_for_process - return number of active async IO + * requests for the given process. + * NOTE - caller must hold aio lock! + */ + +static int +aio_active_requests_for_process( struct proc *procp ) +{ + + return( procp->aio_active_count ); + +} /* aio_active_requests_for_process */ + + +/* + * do_aio_fsync + */ +static int +do_aio_fsync( aio_workq_entry *entryp ) +{ + register struct vnode *vp; + struct file *fp; + int error; + + /* + * NOTE - we will not support AIO_DSYNC until fdatasync() is supported. + * AIO_DSYNC is caught before we queue up a request and flagged as an error. + * The following was shamelessly extracted from fsync() implementation. + */ + error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp ); + if ( error == 0 ) { + vp = (struct vnode *)fp->f_data; + vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp ); + error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp ); + VOP_UNLOCK( vp, 0, entryp->procp ); + } + if ( error != 0 ) + entryp->returnval = -1; + + return( error ); + +} /* do_aio_fsync */ + + +/* + * is_already_queued - runs through our queues to see if the given + * aiocbp / process is there. Returns TRUE if there is a match + * on any of our aio queues. + * NOTE - callers must hold aio lock! 
+ */ + +static boolean_t +is_already_queued( struct proc *procp, + struct aiocb *aiocbp ) +{ + aio_workq_entry *entryp; + boolean_t result; + + result = FALSE; + + /* look for matches on our queue of async IO requests that have completed */ + TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) { + if ( aiocbp == entryp->uaiocbp ) { + result = TRUE; + goto ExitThisRoutine; + } + } + + /* look for matches on our queue of active async IO requests */ + TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) { + if ( aiocbp == entryp->uaiocbp ) { + result = TRUE; + goto ExitThisRoutine; + } + } + + /* look for matches on our queue of asynchronous todo work */ + TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) { + if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) { + result = TRUE; + goto ExitThisRoutine; + } + } + + /* look for matches on our queue of synchronous todo work */ + TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) { + if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) { + result = TRUE; + goto ExitThisRoutine; + } + } + +ExitThisRoutine: + return( result ); + +} /* is_already_queued */ + + +/* + * aio initialization + */ +__private_extern__ void +aio_init( void ) +{ + int i; + + simple_lock_init( &aio_lock ); + + AIO_LOCK; + TAILQ_INIT( &aio_anchor.aio_async_workq ); + TAILQ_INIT( &aio_anchor.lio_sync_workq ); + aio_anchor.aio_async_workq_count = 0; + aio_anchor.lio_sync_workq_count = 0; + aio_anchor.aio_active_count = 0; + aio_anchor.aio_done_count = 0; + AIO_UNLOCK; + + i = sizeof( aio_workq_entry ); + aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" ); + + _aio_create_worker_threads( aio_worker_threads ); + + return; + +} /* aio_init */ + + +/* + * aio worker threads created here. 
+ */ +__private_extern__ void +_aio_create_worker_threads( int num ) +{ + int i; + + /* create some worker threads to handle the async IO requests */ + for ( i = 0; i < num; i++ ) { + thread_t myThread; + + myThread = kernel_thread( kernel_task, aio_work_thread ); + if ( THREAD_NULL == myThread ) { + printf( "%s - failed to create a work thread \n", __FUNCTION__ ); + } + } + + return; + +} /* _aio_create_worker_threads */ + +/* + * Return the current activation utask + */ +task_t +get_aiotask(void) +{ + return ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task; +} diff -urN xnu-344.49/bsd/kern/kern_audit.c xnu-517/bsd/kern/kern_audit.c --- xnu-344.49/bsd/kern/kern_audit.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/kern/kern_audit.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,1592 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#ifdef AUDIT
+
+/*
+ * The AUDIT_EXCESSIVELY_VERBOSE define enables a number of
+ * gratuitously noisy printf's to the console.  Due to the
+ * volume, it should be left off unless you want your system
+ * to churn a lot whenever the audit record flow gets high.
+ *
+ * AUDIT_PRINTF takes a parenthesized printf argument list, e.g.
+ * AUDIT_PRINTF(("value %d\n", v)), and expands to nothing when
+ * verbose output is off.
+ */
+/* #define	AUDIT_EXCESSIVELY_VERBOSE */
+#ifdef AUDIT_EXCESSIVELY_VERBOSE
+#define	AUDIT_PRINTF(x)	printf x
+#else
+#define	AUDIT_PRINTF(X)
+#endif
+
+#if DIAGNOSTIC
+/*
+ * Local assert() that panics with the file, line and text of the failed
+ * expression.  Any earlier definition is dropped first; #undef takes a
+ * bare identifier, without parentheses.  Both arms of the conditional are
+ * cast to void so the expression is well-formed whatever type panic()
+ * returns.
+ */
+#if defined(assert)
+#undef assert
+#endif
+#define assert(cond)    \
+    ((cond) ? (void)0 : (void)panic("%s:%d (%s)", __FILE__, __LINE__, # cond))
+#else
+#include
+#endif /* DIAGNOSTIC */
+
+/* 
+ * Define the audit control flags.
+ */
+int	audit_enabled;
+int	audit_suspended;
+
+/*
+ * Mutex to protect global variables shared between various threads and
+ * processes.
+ */
+static mutex_t				*audit_mtx;
+
+/*
+ * Queue of audit records ready for delivery to disk.  We insert new
+ * records at the tail, and remove records from the head.
+ */
+static TAILQ_HEAD(, kaudit_record)	 audit_q;
+
+/*
+ * Wait queue used to signal to the worker that it has work to do:
+ * either new records are in the queue, or a log replacement is taking
+ * place.
+ */
+static wait_queue_t			 audit_wait_queue;
+
+/*
+ * When an audit log is rotated, the actual rotation must be performed
+ * by the audit worker thread, as it may have outstanding writes on the
+ * current audit log.  audit_replacement_vp holds the vnode replacing
+ * the current vnode.  We can't let more than one replacement occur
+ * at a time, so if more than one thread requests a replacement, only
+ * one can have the replacement "in progress" at any given moment.
If
+ * a thread tries to replace the audit vnode and discovers a replacement
+ * is already in progress (i.e., audit_replacement_flag != 0), then it
+ * will sleep on audit_replacement_wait_queue waiting its turn to perform
+ * a replacement.  When a replacement is completed, this wait queue is
+ * woken by the worker thread so a waiting thread can start another
+ * replacement.  We also store a credential to perform audit log write
+ * operations with.
+ */
+static wait_queue_t			 audit_replacement_wait_queue;
+
+static int				 audit_replacement_flag;
+static struct vnode			*audit_replacement_vp;
+static struct ucred			*audit_replacement_cred;
+
+/*
+ * Flags to use on audit files when opening and closing.
+ */
+static const int		 audit_open_flags = FWRITE | O_APPEND;
+static const int		 audit_close_flags = FWRITE | O_APPEND;
+
+/*
+ * XXX: Couldn't find the include file for this, so copied kern_exec.c's
+ * behavior.
+ */
+extern task_t kernel_task;
+
+/*
+ * Release an audit record and every buffer hanging off it: the four path
+ * buffers and the text buffer (each MAXPATHLEN bytes), the attached user
+ * record (k_ulen bytes) and finally the record itself.  A NULL record is
+ * ignored.
+ */
+static void
+audit_free(struct kaudit_record *ar)
+{
+	if (ar == NULL)
+		return;
+
+	if (ar->k_ar.ar_arg_upath1 != NULL) {
+		kmem_free(kernel_map, ar->k_ar.ar_arg_upath1, MAXPATHLEN);
+	}
+	if (ar->k_ar.ar_arg_upath2 != NULL) {
+		kmem_free(kernel_map, ar->k_ar.ar_arg_upath2, MAXPATHLEN);
+	}
+	if (ar->k_ar.ar_arg_kpath1 != NULL) {
+		kmem_free(kernel_map, ar->k_ar.ar_arg_kpath1, MAXPATHLEN);
+	}
+	if (ar->k_ar.ar_arg_kpath2 != NULL) {
+		kmem_free(kernel_map, ar->k_ar.ar_arg_kpath2, MAXPATHLEN);
+	}
+	if (ar->k_ar.ar_arg_text != NULL) {
+		kmem_free(kernel_map, ar->k_ar.ar_arg_text, MAXPATHLEN);
+	}
+	if (ar->k_udata != NULL) {
+		kmem_free(kernel_map, ar->k_udata, ar->k_ulen);
+	}
+	kmem_free(kernel_map, ar, sizeof(*ar));
+}
+
+/*
+ * Append one audit record to the audit log vnode.
+ *
+ * Any user-supplied record attached to 'ar' is written first, then the
+ * kernel record is converted to BSM format and written.  Returns 0 when
+ * the record was written or when the conversion reports BSM_NOAUDIT,
+ * -1 on BSM_FAILURE, otherwise the result of the vn_rdwr() that wrote
+ * the BSM record.
+ */
+static int
+audit_write(struct vnode *vp, struct kaudit_record *ar, struct ucred *cred,
+    struct proc *p)
+{
+	int ret;
+	struct au_record *bsm;
+
+	/* 
+	 * If there is a user audit record attached to the kernel record,
+	 * then write the user record.
+	 */
+	/* XXX Need to decide a few things here: IF the user audit 
+	 * record is written, but the write of the kernel record fails,
+	 * what to do? Should the kernel record come before or after the
+	 * user record? For now, we write the user record first, and
+	 * we ignore errors.
+	 */
+	if (ar->k_udata != NULL) {
+		/* result deliberately discarded, see the XXX note above */
+		(void)vn_rdwr(UIO_WRITE, vp, (void *)ar->k_udata, ar->k_ulen,
+		    (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, p);
+	}
+
+	/* 
+	 * Convert the internal kernel record to BSM format and write it
+	 * out if everything's OK.
+	 */
+	ret = kaudit_to_bsm(ar, &bsm);
+	if (ret == BSM_NOAUDIT)
+		return (0);
+
+	if (ret == BSM_FAILURE) {
+		AUDIT_PRINTF(("BSM conversion failure\n"));
+		return (-1);
+	}
+	
+	/* XXX This function can be called with the kernel funnel held,
+	 * which is not optimal. We should break the write functionality
+	 * away from the BSM record generation and have the BSM generation
+	 * done before this function is called. This function will then
+	 * take the BSM record as a parameter.
+	 */
+	ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len,
+	    (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, p));
+
+	kau_free(bsm);
+
+	return (ret);
+}
+
+/*
+ * Body of the audit worker thread started from audit_init().  It never
+ * returns.  Each pass of the loop, with audit_mtx held on entry:
+ *   1. installs any pending log replacement (closing the old vnode) and
+ *      wakes the threads waiting in audit_rotate_vnode();
+ *   2. sleeps on audit_wait_queue when the record queue is empty;
+ *   3. throws queued records away when there is no log vnode;
+ *   4. otherwise writes every queued record to the log and frees it.
+ * audit_mtx is dropped around vnode operations and audit_free().
+ */
+static void
+audit_worker(void)
+{
+	int do_replacement_signal, error, release_funnel;
+	TAILQ_HEAD(, kaudit_record) ar_worklist;
+	struct kaudit_record *ar;
+	struct vnode *audit_vp, *old_vp;
+	struct ucred *audit_cred, *old_cred;
+	struct proc *audit_p;
+
+	AUDIT_PRINTF(("audit_worker starting\n"));
+
+	TAILQ_INIT(&ar_worklist);
+	audit_cred = NULL;
+	audit_p = current_proc();
+	audit_vp = NULL;
+
+	/*
+	 * XXX: Presumably we can assume Mach threads are started without
+	 * holding the BSD kernel funnel?
+	 */
+	thread_funnel_set(kernel_flock, FALSE);
+
+	mutex_lock(audit_mtx);
+	while (1) {
+		/*
+		 * First priority: replace the audit log target if requested.
+		 * As we actually close the vnode in the worker thread, we
+		 * need to grab the funnel, which means releasing audit_mtx.
+		 * In case another replacement was scheduled while the mutex
+		 * was released, we loop.
+		 *
+		 * XXX It could well be we should drain existing records
+		 * first to ensure that the timestamps and ordering
+		 * are right.
+		 */
+		do_replacement_signal = 0;
+		while (audit_replacement_flag != 0) {
+			old_cred = audit_cred;
+			old_vp = audit_vp;
+			audit_cred = audit_replacement_cred;
+			audit_vp = audit_replacement_vp;
+			audit_replacement_cred = NULL;
+			audit_replacement_vp = NULL;
+			audit_replacement_flag = 0;
+
+			audit_enabled = (audit_vp != NULL);
+
+			if (old_vp != NULL || audit_vp != NULL) {
+				mutex_unlock(audit_mtx);
+				thread_funnel_set(kernel_flock, TRUE);
+				release_funnel = 1;
+			} else
+				release_funnel = 0;
+			/*
+			 * XXX: What to do about write failures here?
+			 */
+			if (old_vp != NULL) {
+				AUDIT_PRINTF(("Closing old audit file\n"));
+				vn_close(old_vp, audit_close_flags, old_cred,
+				    audit_p);
+				crfree(old_cred);
+				old_cred = NULL;
+				old_vp = NULL;
+				AUDIT_PRINTF(("Audit file closed\n"));
+			}
+			if (audit_vp != NULL) {
+				AUDIT_PRINTF(("Opening new audit file\n"));
+			}
+			if (release_funnel) {
+				thread_funnel_set(kernel_flock, FALSE);
+				mutex_lock(audit_mtx);
+			}
+			do_replacement_signal = 1;
+		}
+		/*
+		 * Signal that a replacement has occurred to wake up and
+		 * start any other replacements started in parallel.  We can
+		 * continue about our business in the mean time.  We
+		 * broadcast so that both new replacements can be inserted,
+		 * but also so that the source(s) of replacement can return
+		 * successfully.
+		 */
+		if (do_replacement_signal)
+			wait_queue_wakeup_all(audit_replacement_wait_queue,
+			    0, THREAD_AWAKENED);
+
+		/*
+		 * Next, check to see if we have any records to drain into
+		 * the vnode.  If not, go back to waiting for an event.
+		 */
+		if (TAILQ_EMPTY(&audit_q)) {
+			int ret;
+
+			AUDIT_PRINTF(("audit_worker waiting\n"));
+			ret = wait_queue_assert_wait(audit_wait_queue, 0, 
+			    THREAD_UNINT);
+			mutex_unlock(audit_mtx);
+
+			assert(ret == THREAD_WAITING);
+			ret = thread_block(THREAD_CONTINUE_NULL);
+			assert(ret == THREAD_AWAKENED);
+			AUDIT_PRINTF(("audit_worker woken up\n"));
+	AUDIT_PRINTF(("audit_worker: new vp = %p; value of flag %d\n",
+	    audit_replacement_vp, audit_replacement_flag));
+
+			mutex_lock(audit_mtx);
+			continue;
+		}
+
+		/*
+		 * If we have records, but there's no active vnode to
+		 * write to, drain the record queue.  Generally, we
+		 * prevent the unnecessary allocation of records
+		 * elsewhere, but we need to allow for races between
+		 * conditional allocation and queueing.  Go back to
+		 * waiting when we're done.
+		 *
+		 * XXX: We go out of our way to avoid calling audit_free()
+		 * with the audit_mtx held, to avoid a lock order reversal
+		 * as free() may grab the funnel.  This will be fixed at
+		 * some point.
+		 */
+		if (audit_vp == NULL) {
+			while ((ar = TAILQ_FIRST(&audit_q))) {
+				TAILQ_REMOVE(&audit_q, ar, k_q);
+				TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q);
+			}
+			mutex_unlock(audit_mtx);
+			while ((ar = TAILQ_FIRST(&ar_worklist))) {
+				TAILQ_REMOVE(&ar_worklist, ar, k_q);
+				audit_free(ar);
+			}
+			mutex_lock(audit_mtx);
+			continue;
+		}
+
+		/*
+		 * We have both records to write, and an active vnode
+		 * to write to.  Dequeue a record, and start the write.
+		 * Eventually, it might make sense to dequeue several
+		 * records and perform our own clustering, if the lower
+		 * layers aren't doing it automatically enough.
+		 *
+		 * XXX: We go out of our way to avoid calling audit_free()
+		 * with the audit_mtx held, to avoid a lock order reversal
+		 * as free() may grab the funnel.  This will be fixed at
+		 * some point.
+		 */
+		while ((ar = TAILQ_FIRST(&audit_q))) {
+			TAILQ_REMOVE(&audit_q, ar, k_q);
+			TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q);
+		}
+		mutex_unlock(audit_mtx);
+		release_funnel = 0;
+		while ((ar = TAILQ_FIRST(&ar_worklist))) {
+			TAILQ_REMOVE(&ar_worklist, ar, k_q);
+			if (audit_vp != NULL) {
+				/*
+				 * XXX: What should happen if there's a write
+				 * error here?
+				 */
+				if (!release_funnel) {
+					thread_funnel_set(kernel_flock, TRUE);
+					release_funnel = 1;
+				}
+				VOP_LEASE(audit_vp, audit_p, audit_cred,
+				    LEASE_WRITE);
+				error = audit_write(audit_vp, ar, audit_cred,
+				    audit_p);
+				if (error)
+					printf("audit_worker: write error %d\n",
+					    error);
+			}
+			audit_free(ar);
+		}
+		if (release_funnel)
+			thread_funnel_set(kernel_flock, FALSE);
+		mutex_lock(audit_mtx);
+	}
+}
+
+/*
+ * One-time initialization of the audit subsystem: checks the audit event
+ * table against the syscall table, resets the global state, allocates the
+ * mutex and the two wait queues, initializes the BSM layer and starts the
+ * worker thread.  When the table check fails nothing is allocated and
+ * audit_mtx stays NULL.
+ */
+void
+audit_init(void)
+{
+
+	/* Verify that the syscall to audit event table is the same
+	 * size as the system call table.
+	 */
+	if (nsys_au_event != nsysent) {
+		printf("Security auditing service initialization failed, ");
+		printf("audit event table doesn't match syscall table.\n");
+		return;
+	}
+
+	printf("Security auditing service present\n");
+	TAILQ_INIT(&audit_q);
+	audit_enabled = 0;
+	audit_suspended = 0;
+	audit_replacement_cred = NULL;
+	audit_replacement_flag = 0;
+	audit_replacement_vp = NULL;
+	audit_mtx = mutex_alloc(ETAP_NO_TRACE);
+	audit_wait_queue = wait_queue_alloc(SYNC_POLICY_FIFO);
+	audit_replacement_wait_queue = wait_queue_alloc(SYNC_POLICY_FIFO);
+
+	/* Initialize the BSM audit subsystem. */
+	kau_init();
+
+	kernel_thread(kernel_task, audit_worker);
+}
+
+/*
+ * Hand a new log vnode and credential to the worker thread and wait until
+ * the worker has taken them over.  Passing NULL for both requests that the
+ * current log be closed.  Blocks uninterruptibly while another replacement
+ * is pending and again until the worker broadcasts completion.
+ */
+static void
+audit_rotate_vnode(struct ucred *cred, struct vnode *vp)
+{
+	int ret;
+
+	/*
+	 * If other parallel log replacements have been requested, we wait
+	 * until they've finished before continuing.
+	 */
+	mutex_lock(audit_mtx);
+	while (audit_replacement_flag != 0) {
+
+		AUDIT_PRINTF(("audit_rotate_vnode: sleeping to wait for "
+		    "flag\n"));
+		ret = wait_queue_assert_wait(audit_replacement_wait_queue, 0,
+		    THREAD_UNINT);
+		mutex_unlock(audit_mtx);
+
+		assert(ret == THREAD_WAITING);
+		ret = thread_block(THREAD_CONTINUE_NULL);
+		assert(ret == THREAD_AWAKENED);
+		AUDIT_PRINTF(("audit_rotate_vnode: woken up (flag %d)\n",
+		    audit_replacement_flag));
+
+		mutex_lock(audit_mtx);
+	}
+	audit_replacement_cred = cred;
+	audit_replacement_flag = 1;
+	audit_replacement_vp = vp;
+
+	/*
+	 * Wake up the audit worker to perform the exchange once we
+	 * release the mutex.
+	 */
+	wait_queue_wakeup_one(audit_wait_queue, 0, THREAD_AWAKENED);
+
+	/*
+	 * Wait for the audit_worker to broadcast that a replacement has
+	 * taken place; we know that once this has happened, our vnode
+	 * has been replaced in, so we can return successfully.
+	 */
+	AUDIT_PRINTF(("audit_rotate_vnode: waiting for news of "
+	    "replacement\n"));
+	ret = wait_queue_assert_wait(audit_replacement_wait_queue, 0,
+	    THREAD_UNINT);
+	mutex_unlock(audit_mtx);
+
+	assert(ret == THREAD_WAITING);
+	ret = thread_block(THREAD_CONTINUE_NULL);
+	assert(ret == THREAD_AWAKENED);
+	AUDIT_PRINTF(("audit_rotate_vnode: change acknowledged by "
+	    "audit_worker (flag " "now %d)\n", audit_replacement_flag));
+}
+
+/*
+ * Drain the audit queue and close the log at shutdown.
+ */
+void
+audit_shutdown(void)
+{
+
+	/*
+	 * audit_init() returns before allocating audit_mtx when the event
+	 * table check fails; in that case there is no worker thread and no
+	 * log to close, and locking the NULL mutex must be avoided.
+	 */
+	if (audit_mtx == NULL)
+		return;
+
+	audit_rotate_vnode(NULL, NULL);
+}
+
+/* Return the BSD uthread attached to the current activation. */
+static __inline__ struct uthread *
+curuthread(void)
+{
+
+	return (get_bsdthread_info(current_act()));
+}
+
+/* Return the audit record of the current thread (uu_ar), possibly NULL. */
+static __inline__ struct kaudit_record *
+currecord(void)
+{
+
+	return (curuthread()->uu_ar);
+}
+
+/**********************************
+ * Begin system calls.            *
+ **********************************/
+/*
+ * System call to allow a user space application to submit a BSM audit
+ * record to the kernel for inclusion in the audit log.
This function
+ * does little verification on the audit record that is submitted.
+ *
+ * XXXAUDIT: Audit preselection for user records does not currently
+ * work, since we pre-select only based on the AUE_audit event type,
+ * not the event type submitted as part of the user audit data.
+ *
+ * Returns ENOTSUP when the calling thread has no audit record, the
+ * suser() error for unprivileged callers, EINVAL for a length that is
+ * not in 1..MAX_AUDIT_RECORD_SIZE or a record that fails
+ * bsm_rec_verify(), ENOMEM when the kernel buffer cannot be allocated,
+ * and the copyin() error when the user buffer cannot be read.  On
+ * success the copied record is owned by the thread's audit record.
+ */
+struct audit_args {
+	void *	record;
+	int	length;
+};
+/* ARGSUSED */
+int
+audit(struct proc *p, struct audit_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	int error;
+	void * rec;
+	struct kaudit_record *ar;
+
+	ar = currecord();
+
+	/* XXX: What's the proper error code if a user audit record can't
+	 * be written due to auditing off, or otherwise unavailable?
+	 */
+	if (ar == NULL)
+		return (ENOTSUP);
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+
+	/*
+	 * length is a signed int supplied by the caller: reject zero and
+	 * negative values as well as oversized ones before it is used as
+	 * an allocation and copy size.
+	 */
+	if (uap->length <= 0 || uap->length > MAX_AUDIT_RECORD_SIZE)
+		return (EINVAL);
+
+	error = kmem_alloc(kernel_map, (vm_offset_t *)&rec, uap->length);
+	if (error != KERN_SUCCESS)
+		return(ENOMEM);
+
+	error = copyin(uap->record, rec, uap->length);
+	if (error)
+		goto free_out;
+
+	/* Verify the record */
+	if (bsm_rec_verify(rec) == 0) {
+		error = EINVAL;
+		goto free_out;
+	}
+
+	/* Attach the user audit record to the kernel audit record. Because
+	 * this system call is an auditable event, we will write the user
+	 * record along with the record for this audit event.
+	 */
+	ar->k_udata = rec;
+	ar->k_ulen = uap->length;
+	return (0);
+
+free_out:
+	kmem_free(kernel_map, (vm_offset_t)rec, uap->length);
+	return (error);
+}
+
+/*
+ *  System call to manipulate auditing.
+ *
+ *  Not implemented: after the suser() privilege check it always
+ *  returns ENOSYS.
+ */
+struct auditon_args {
+	int	cmd;
+	void *	data;
+	int	length;
+};
+/* ARGSUSED */
+int
+auditon(struct proc *p, struct auditon_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	int error;
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+	return (ENOSYS);
+}
+
+/*
+ *  System call to pass in file descriptor for audit log.
+ */
+struct auditsvc_args {
+	int	fd;
+	int	limit;
+};
+/* ARGSUSED */
+/* Not implemented: returns ENOSYS once the suser() check has passed. */
+int
+auditsvc(struct proc *p, struct auditsvc_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	int error;
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+	return (ENOSYS);
+}
+
+/*
+ * System calls to manage the user audit information.
+ * XXXAUDIT May need to lock the proc structure.
+ */
+struct getauid_args {
+	au_id_t	*auid;
+};
+/* ARGSUSED */
+/*
+ * Copy the process audit ID (p_au->ai_auid) out to uap->auid.  Returns
+ * the suser() or copyout() error, 0 on success.
+ */
+int
+getauid(struct proc *p, struct getauid_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	int error;
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+
+	error = copyout((void *)&p->p_au->ai_auid, (void *)uap->auid, 
+				sizeof(*uap->auid));
+	if (error)
+		return (error);
+
+	return (0);
+}
+
+struct setauid_args {
+	au_id_t	*auid;
+};
+/* ARGSUSED */
+/*
+ * Replace the process audit ID with the value at uap->auid and note the
+ * new ID in the current audit record.  Returns the suser() or copyin()
+ * error, 0 on success.
+ */
+int
+setauid(struct proc *p, struct setauid_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	au_id_t new_auid;
+	int error;
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+
+	/*
+	 * Stage the value in a local so that a copyin() that fails part
+	 * way through cannot leave the live audit ID half updated.
+	 */
+	error = copyin((void *)uap->auid, (void *)&new_auid, 
+				sizeof(new_auid));
+	if (error)
+		return (error);
+
+	p->p_au->ai_auid = new_auid;
+	audit_arg_auid(p->p_au->ai_auid);
+	return (0);
+}
+
+/*
+ *  System calls to get and set process audit information.
+ */
+struct getaudit_args {
+	struct auditinfo	*auditinfo;
+};
+/* ARGSUSED */
+/*
+ * Copy the process's struct auditinfo out to uap->auditinfo.  Returns the
+ * suser() or copyout() error, 0 on success.
+ */
+int
+getaudit(struct proc *p, struct getaudit_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	int error;
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+	error = copyout((void *)p->p_au, (void *)uap->auditinfo, 
+				sizeof(*uap->auditinfo));
+	if (error)
+		return (error);
+
+	return (0);
+}
+
+struct setaudit_args {
+	struct auditinfo	*auditinfo;
+};
+/* ARGSUSED */
+/*
+ * Replace the process's struct auditinfo with the one at uap->auditinfo.
+ * Returns the suser() or copyin() error, 0 on success.
+ */
+int
+setaudit(struct proc *p, struct setaudit_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	struct auditinfo new_info;
+	int error;
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+
+	/*
+	 * Read into a local first: a copyin() fault must not leave the
+	 * process with a partially overwritten auditinfo.
+	 */
+	error = copyin((void *)uap->auditinfo, (void *)&new_info, 
+				sizeof(new_info));
+	if (error)
+		return (error);
+
+	bcopy(&new_info, p->p_au, sizeof(*p->p_au));
+
+	return (0);
+}
+
+struct getaudit_addr_args {
+	struct auditinfo_addr	*auditinfo_addr;
+	int			length;
+};
+/* ARGSUSED */
+/* Not implemented: returns ENOSYS once the suser() check has passed. */
+int
+getaudit_addr(struct proc *p, struct getaudit_addr_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	int error;
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+	return (ENOSYS);
+}
+
+struct setaudit_addr_args {
+	struct auditinfo_addr	*auditinfo_addr;
+	int			length;
+};
+/* ARGSUSED */
+/* Not implemented: returns ENOSYS once the suser() check has passed. */
+int
+setaudit_addr(struct proc *p, struct setaudit_addr_args *uap, register_t *retval)
+{
+	register struct pcred *pc = p->p_cred;
+	int error;
+
+	error = suser(pc->pc_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+	return (ENOSYS);
+}
+
+/*
+ * Syscall to manage audit files.
+ *
+ * XXX: Should generate an audit event.
+ */
+struct auditctl_args {
+	char	*path;
+};
+/* ARGSUSED */
+/*
+ * Switch the audit log to the file named by uap->path, or close the log
+ * when the path is NULL.  Returns the suser() or vn_open() error, EINVAL
+ * when the path is not a regular file, ENOTSUP when the audit subsystem
+ * was never initialized, 0 on success.
+ *
+ * NOTE(review): the !AUDIT stub of this syscall takes a third
+ * 'register_t *retval' argument that this definition lacks -- confirm
+ * which form the syscall table expects.
+ */
+int
+auditctl(struct proc *p, struct auditctl_args *uap)
+{
+	struct nameidata nd;
+	struct ucred *cred;
+	struct vnode *vp;
+	int error, flags;
+
+	error = suser(p->p_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+
+	/*
+	 * audit_init() leaves audit_mtx NULL when it bails out early; in
+	 * that state there is no worker to hand a vnode to.
+	 */
+	if (audit_mtx == NULL)
+		return (ENOTSUP);
+
+	vp = NULL;
+	cred = NULL;
+
+	/*
+	 * If a path is specified, open the replacement vnode, perform
+	 * validity checks, and grab another reference to the current
+	 * credential.
+	 */
+	if (uap->path != NULL) {
+		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+			uap->path, p);
+		flags = audit_open_flags;
+		error = vn_open(&nd, flags, 0);
+		if (error)
+			goto out;
+		VOP_UNLOCK(nd.ni_vp, 0, p);
+		vp = nd.ni_vp;
+		if (vp->v_type != VREG) {
+			vn_close(vp, audit_close_flags, p->p_ucred, p);
+			error = EINVAL;
+			goto out;
+		}
+		cred = p->p_ucred;
+		crhold(cred);
+	}
+
+	audit_rotate_vnode(cred, vp);
+out:
+	return (error);
+}
+
+/**********************************
+ * End of system calls.           *
+ **********************************/
+
+/*
+ * MPSAFE
+ *
+ * Allocate and fill in a new audit record for 'event' on behalf of
+ * process 'p'.  Returns NULL when auditing is disabled, suspended or
+ * uninitialized, or when the allocation fails; otherwise a zeroed record
+ * carrying the magic number, event, start time and subject identity.
+ */
+struct kaudit_record *
+audit_new(int event, struct proc *p, struct uthread *uthread)
+{
+	struct kaudit_record *ar;
+	int no_record;
+
+	/* audit_init() may have returned before allocating the mutex */
+	if (audit_mtx == NULL)
+		return (NULL);
+
+	/*
+	 * Eventually, there may be certain classes of events that
+	 * we will audit regardless of the audit state at the time
+	 * the record is created.  These events will generally
+	 * correspond to changes in the audit state.  The dummy
+	 * code below is from our first prototype, but may also
+	 * be used in the final version (with modified event numbers).
+	 */
+#if 0
+	if (event != AUDIT_EVENT_FILESTOP && event != AUDIT_EVENT_FILESTART) {
+#endif
+		mutex_lock(audit_mtx);
+		no_record = (audit_suspended || !audit_enabled);
+		mutex_unlock(audit_mtx);
+		if (no_record)
+			return (NULL);
+#if 0
+	}
+#endif
+
+	/*
+	 * Eventually, we might want to have global event filtering
+	 * by event type here.
+	 */
+
+	/*
+	 * XXX: Process-based event preselection should occur here.
+	 * Currently, we only post-select.
+	 */
+
+	/*
+	 * Initialize the audit record header.
+	 * XXX: Should probably use a zone; whatever we use must be
+	 * safe to call from the non-BSD side of the house.
+	 * XXX: We may want to fail-stop if allocation fails.
+	 *
+	 * 'ar' is preset to NULL and the kmem_alloc() status is checked,
+	 * so a failed allocation is never mistaken for a valid record.
+	 */
+	ar = NULL;
+	if (kmem_alloc(kernel_map, (vm_offset_t *)&ar, sizeof(*ar)) !=
+	    KERN_SUCCESS || ar == NULL)
+		return NULL;
+
+	bzero(ar, sizeof(*ar));
+	ar->k_ar.ar_magic = AUDIT_RECORD_MAGIC;
+	ar->k_ar.ar_event = event;
+	nanotime(&ar->k_ar.ar_starttime);
+
+	/* Export the subject credential. */
+	cru2x(p->p_ucred, &ar->k_ar.ar_subj_cred);
+	ar->k_ar.ar_subj_ruid = p->p_cred->p_ruid;
+	ar->k_ar.ar_subj_rgid = p->p_cred->p_rgid;
+	ar->k_ar.ar_subj_egid = p->p_ucred->cr_groups[0];
+	ar->k_ar.ar_subj_auid = p->p_au->ai_auid;
+	ar->k_ar.ar_subj_pid = p->p_pid;
+	bcopy(p->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN);
+	bcopy(&p->p_au->ai_mask, &ar->k_ar.ar_subj_amask, 
+			sizeof(p->p_au->ai_mask));
+
+	return (ar);
+}
+
+/*
+ * MPSAFE
+ * XXXAUDIT: So far, this is unused, and should probably be GC'd.
+ *
+ * Discard a record that will not be committed.
+ */
+void
+audit_abort(struct kaudit_record *ar)
+{
+
+	audit_free(ar);
+}
+
+/*
+ * MPSAFE
+ *
+ * Finish a record with the system call's errno and return value, stamp
+ * the end time and queue it for the worker thread.  A NULL record is
+ * ignored.  When auditing has been disabled or suspended in the meantime
+ * the record is freed instead of queued.
+ */
+void
+audit_commit(struct kaudit_record *ar, int error, int retval)
+{
+
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_errno = error;
+	ar->k_ar.ar_retval = retval;
+
+	/*
+	 * We might want to do some system-wide post-filtering
+	 * here at some point.
+	 */
+
+	/*
+	 * Timestamp system call end.
+	 */
+	nanotime(&ar->k_ar.ar_endtime);
+
+	/*
+	 * XXXAUDIT: The number of outstanding uncommitted audit records is
+	 * limited by the number of concurrent threads servicing system
+	 * calls in the kernel.  However, there is currently no bound on
+	 * the size of the committed records in the audit event queue
+	 * before they are sent to disk.  Probably, there should be a fixed
+	 * size bound (perhaps configurable), and if that bound is reached,
+	 * threads should sleep in audit_commit() until there's room.
+	 */
+	mutex_lock(audit_mtx);
+	/*
+	 * Note: it could be that some records initiated while audit was
+	 * enabled should still be committed?
+	 */
+	if (audit_suspended || !audit_enabled) {
+		mutex_unlock(audit_mtx);
+		audit_free(ar);
+		return;
+	}
+	TAILQ_INSERT_TAIL(&audit_q, ar, k_q);
+	wait_queue_wakeup_one(audit_wait_queue, 0, THREAD_AWAKENED);
+	mutex_unlock(audit_mtx);
+}
+
+/*
+ * Calls to set up and tear down audit structures associated with
+ * each system call.
+ */
+/*
+ * Called on system call entry: looks up the audit event for syscall
+ * number 'code' and, when the event is auditable and preselected by the
+ * process mask, stores a fresh record in uthread->uu_ar.
+ */
+void
+audit_syscall_enter(unsigned short code, struct proc *proc, 
+			struct uthread *uthread)
+{
+	int audit_event;
+
+	assert(uthread->uu_ar == NULL);
+
+	/*
+	 * nsys_au_event is the size of the event table (audit_init()
+	 * compares it with nsysent); never index past it.
+	 */
+	if (code >= nsys_au_event)
+		return;
+
+	audit_event = sys_au_event[code];
+
+	/*
+	 * Allocate an audit record, if desired, and store in the BSD
+	 * thread for later use.
+	 */
+	if (audit_event != AUE_NULL) {
+#if 0
+		AUDIT_PRINTF(("Allocated record type %d for syscall %d\n",
+		    audit_event, code));
+#endif
+		if (au_preselect(audit_event, &proc->p_au->ai_mask,
+				AU_PRS_FAILURE | AU_PRS_SUCCESS)) {
+			uthread->uu_ar = audit_new(audit_event, proc, uthread);
+		} else {
+			uthread->uu_ar = NULL;
+		}
+	}
+}
+
+/*
+ * Called on system call exit: commits the thread's audit record, if any,
+ * and clears uthread->uu_ar.
+ */
+void
+audit_syscall_exit(int error, struct proc *proc, struct uthread *uthread)
+{
+	int retval;
+
+	/*
+	 * Commit the audit record as desired; once we pass the record
+	 * into audit_commit(), the memory is owned by the audit
+	 * subsystem.
+	 * The return value from the system call is stored on the user
+	 * thread. If there was an error, the return value is set to -1,
+	 * imitating the behavior of the cerror routine.
+	 */
+	if (error)
+		retval = -1;
+	else
+		retval = uthread->uu_rval[0];
+
+	audit_commit(uthread->uu_ar, error, retval);
+	/* braces matter: AUDIT_PRINTF may expand to nothing */
+	if (uthread->uu_ar != NULL) {
+		AUDIT_PRINTF(("audit record committed by pid %d\n", proc->p_pid));
+	}
+	uthread->uu_ar = NULL;
+
+}
+
+/*
+ * Calls to manipulate elements of the audit record structure from system
+ * call code.  Macro wrappers will prevent these functions from being
+ * entered if auditing is disabled, avoiding the function call cost.
We
+ * check the thread audit record pointer anyway, as the audit condition
+ * could change, and pre-selection may not have allocated an audit
+ * record for this event.
+ *
+ * Each audit_arg_*() below stores its argument(s) in the current
+ * thread's record and sets the matching ARG_* bit(s) in ar_valid_arg;
+ * all of them are no-ops when the thread has no record.
+ */
+void
+audit_arg_accmode(int accmode)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_accmode = accmode;
+	ar->k_ar.ar_valid_arg |= ARG_ACCMODE;
+}
+
+void
+audit_arg_cmode(int cmode)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_cmode = cmode;
+	ar->k_ar.ar_valid_arg |= ARG_CMODE;
+}
+
+void
+audit_arg_fd(int fd)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_fd = fd;
+	ar->k_ar.ar_valid_arg |= ARG_FD;
+}
+
+void
+audit_arg_fflags(int fflags)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_fflags = fflags;
+	ar->k_ar.ar_valid_arg |= ARG_FFLAGS;
+}
+
+void
+audit_arg_gid(gid_t gid, gid_t egid, gid_t rgid, gid_t sgid)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_gid = gid;
+	ar->k_ar.ar_arg_egid = egid;
+	ar->k_ar.ar_arg_rgid = rgid;
+	ar->k_ar.ar_arg_sgid = sgid;
+	ar->k_ar.ar_valid_arg |= (ARG_GID | ARG_EGID | ARG_RGID | ARG_SGID);
+}
+
+void
+audit_arg_uid(uid_t uid, uid_t euid, uid_t ruid, uid_t suid)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_uid = uid;
+	ar->k_ar.ar_arg_euid = euid;
+	ar->k_ar.ar_arg_ruid = ruid;
+	ar->k_ar.ar_arg_suid = suid;
+	ar->k_ar.ar_valid_arg |= (ARG_UID | ARG_EUID | ARG_RUID | ARG_SUID);
+}
+
+/*
+ * Record a group set.  At most as many entries as the record can hold
+ * are copied, and the stored count reflects what was actually copied.
+ * NOTE(review): the bound assumes ar_arg_groups.gidset is an array
+ * member of the record (it is written here without any allocation) --
+ * confirm against the kaudit_record declaration.
+ */
+void
+audit_arg_groupset(gid_t *gidset, u_int gidset_size)
+{
+	u_int i, capacity;
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL || gidset == NULL)
+		return;
+
+	capacity = sizeof(ar->k_ar.ar_arg_groups.gidset) /
+	    sizeof(ar->k_ar.ar_arg_groups.gidset[0]);
+	if (gidset_size > capacity)
+		gidset_size = capacity;
+
+	for (i = 0; i < gidset_size; i++)
+		ar->k_ar.ar_arg_groups.gidset[i] = gidset[i];
+	ar->k_ar.ar_arg_groups.gidset_size = gidset_size;
+	ar->k_ar.ar_valid_arg |= ARG_GROUPSET;
+}
+
+/*
+ * Copy the NUL-terminated string 'src' into 'dst', writing at most
+ * 'dstlen' bytes including the terminator.  Longer input is truncated;
+ * the result is always terminated when dstlen is non-zero.
+ */
+static void
+audit_copystr(char *dst, const char *src, u_int dstlen)
+{
+	u_int i;
+
+	if (dstlen == 0)
+		return;
+
+	for (i = 0; i + 1 < dstlen && src[i] != '\0'; i++)
+		dst[i] = src[i];
+	dst[i] = '\0';
+}
+
+/*
+ * Record a login name, truncated to the size of the record's login field.
+ * NOTE(review): sizeof assumes ar_arg_login is an array member (the
+ * original code strcpy()'d into it without allocating) -- confirm.
+ */
+void
+audit_arg_login(char *login)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL || login == NULL)
+		return;
+
+	audit_copystr(ar->k_ar.ar_arg_login, login,
+	    sizeof(ar->k_ar.ar_arg_login));
+
+	ar->k_ar.ar_valid_arg |= ARG_LOGIN;
+}
+
+void
+audit_arg_mask(int mask)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_mask = mask;
+	ar->k_ar.ar_valid_arg |= ARG_MASK;
+}
+
+void
+audit_arg_mode(mode_t mode)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_mode = mode;
+	ar->k_ar.ar_valid_arg |= ARG_MODE;
+}
+
+void
+audit_arg_dev(int dev)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_dev = dev;
+	ar->k_ar.ar_valid_arg |= ARG_DEV;
+}
+
+void
+audit_arg_owner(uid_t uid, gid_t gid)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_uid = uid;
+	ar->k_ar.ar_arg_gid = gid;
+	ar->k_ar.ar_valid_arg |= (ARG_UID | ARG_GID);
+}
+
+void
+audit_arg_pid(pid_t pid)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_pid = pid;
+	ar->k_ar.ar_valid_arg |= ARG_PID;
+}
+
+void
+audit_arg_signum(u_int signum)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_signum = signum;
+	ar->k_ar.ar_valid_arg |= ARG_SIGNUM;
+}
+
+void
+audit_arg_socket(int sodomain, int sotype, int soprotocol)
+{
+
+	struct kaudit_record *ar;
+ 
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_sockinfo.sodomain = sodomain;
+	ar->k_ar.ar_arg_sockinfo.sotype = sotype;
+	ar->k_ar.ar_arg_sockinfo.soprotocol = soprotocol;
+	ar->k_ar.ar_valid_arg |= ARG_SOCKINFO;
+}
+
+/*
+ * Record a socket address.  Only AF_INET, AF_INET6 and AF_UNIX set a
+ * valid bit; for AF_UNIX the socket path is additionally stored as
+ * user path 1.
+ * NOTE(review): the copy length is the size of the record's field, not
+ * the length of the source address -- confirm every caller passes a
+ * buffer at least that large.
+ */
+void
+audit_arg_sockaddr(struct proc *p, struct sockaddr *so)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL || p == NULL || so == NULL)
+		return;
+
+	bcopy(so, &ar->k_ar.ar_arg_sockaddr, sizeof(ar->k_ar.ar_arg_sockaddr));
+	switch (so->sa_family) {
+	case AF_INET:
+		ar->k_ar.ar_valid_arg |= ARG_SADDRINET;
+		break;
+	case AF_INET6:
+		ar->k_ar.ar_valid_arg |= ARG_SADDRINET6;
+		break;
+	case AF_UNIX:
+		audit_arg_upath(p, ((struct sockaddr_un *)so)->sun_path, 
+				ARG_UPATH1);
+		ar->k_ar.ar_valid_arg |= ARG_SADDRUNIX;
+		break;
+	}
+}
+
+void
+audit_arg_auid(uid_t auid)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_auid = auid;
+	ar->k_ar.ar_valid_arg |= ARG_AUID;
+}
+
+/*
+ * Record a text string.  The MAXPATHLEN-byte buffer is allocated on first
+ * use and released with the record; longer strings are truncated.  A NULL
+ * text only clears the ARG_TEXT valid bit.
+ */
+void
+audit_arg_text(char *text)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	/* Invalidate the text string */
+	ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_TEXT);
+	if (text == NULL)
+		return;	
+
+	if (ar->k_ar.ar_arg_text == NULL) {
+		kmem_alloc(kernel_map, &ar->k_ar.ar_arg_text, MAXPATHLEN);
+		if (ar->k_ar.ar_arg_text == NULL)
+			return;	
+	}
+
+	audit_copystr(ar->k_ar.ar_arg_text, text, MAXPATHLEN);
+	ar->k_ar.ar_valid_arg |= ARG_TEXT;
+}
+
+void
+audit_arg_cmd(int cmd)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_cmd = cmd;
+	ar->k_ar.ar_valid_arg |= ARG_CMD;
+}
+
+void
+audit_arg_svipc_cmd(int cmd)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_svipc_cmd = cmd;
+	ar->k_ar.ar_valid_arg |= ARG_SVIPC_CMD;
+}
+
+void
+audit_arg_svipc_perm(struct ipc_perm *perm)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	bcopy(perm, &ar->k_ar.ar_arg_svipc_perm, 
+		sizeof(ar->k_ar.ar_arg_svipc_perm));
+	ar->k_ar.ar_valid_arg |= ARG_SVIPC_PERM;
+}
+
+void
+audit_arg_svipc_id(int id)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_svipc_id = id;
+	ar->k_ar.ar_valid_arg |= ARG_SVIPC_ID;
+}
+
+void
+audit_arg_svipc_addr(void * addr)
+{
+	struct kaudit_record *ar;
+
+	ar = currecord();
+	if (ar == NULL)
+		return;
+
+	ar->k_ar.ar_arg_svipc_addr = addr;
+	ar->k_ar.ar_valid_arg |= ARG_SVIPC_ADDR;
+}
+
+/* 
+ * Initialize the audit information for a process, presumably the first 
+ * process in the system.
+ * XXX It is not clear what the initial values should be for audit ID, 
+ * session ID, etc. 
+ */
+void
+audit_proc_init(struct proc *p)
+{
+	MALLOC_ZONE(p->p_au, struct auditinfo *, sizeof(*p->p_au), 
+			M_SUBPROC, M_WAITOK);
+
+	bzero((void *)p->p_au, sizeof(*p->p_au));
+}
+
+/* 
+ * Copy the audit info from the parent process to the child process when
+ * a fork takes place.
+ * XXX Need to check for failure from the memory allocation, in here
+ * as well as in any functions that use the process auditing info.
+ */
+void
+audit_proc_fork(struct proc *parent, struct proc *child)
+{
+	/* Always set up the audit information pointer as this function
+	 * should only be called when the proc is new. If proc structures
+	 * are ever cached and reused, then this behavior will leak memory.
+	 */
+	MALLOC_ZONE(child->p_au, struct auditinfo *, sizeof(*child->p_au), 
+			M_SUBPROC, M_WAITOK);
+
+	bcopy(parent->p_au, child->p_au, sizeof(*child->p_au));
+}
+
+/*
+ * Free the auditing structure for the process. 
+ */
+void
+audit_proc_free(struct proc *p)
+{
+	FREE_ZONE((void *)p->p_au, sizeof(*p->p_au), M_SUBPROC);
+	p->p_au = NULL;
+}
+
+/* 
+ * Store a path as given by the user process for auditing into the audit 
+ * record stored on the user thread. This function will allocate the memory to 
+ * store the path info if not already available. This memory will be 
+ * freed when the audit record is freed.
+ *
+ * 'flags' selects the slot: ARG_UPATH1 or ARG_UPATH2.  The call does
+ * nothing when neither bit is set.
+ */
+void
+audit_arg_upath(struct proc *p, char *upath, u_int64_t flags)
+{
+	struct kaudit_record *ar;
+	char **pathp;
+
+	if (p == NULL || upath == NULL) 
+		return;		/* nothing to do! */
+
+	/*
+	 * The inner parentheses are required: == binds tighter than &, so
+	 * without them the test is always false.
+	 */
+	if ((flags & (ARG_UPATH1 | ARG_UPATH2)) == 0)
+		return;
+
+	ar = currecord();
+	if (ar == NULL)	/* This will be the case for unaudited system calls */
+		return;
+
+	if (flags & ARG_UPATH1) {
+		ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_UPATH1);
+		pathp = &ar->k_ar.ar_arg_upath1;
+	}
+	else {
+		ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_UPATH2);
+		pathp = &ar->k_ar.ar_arg_upath2;
+	}
+
+	if (*pathp == NULL) {
+		kmem_alloc(kernel_map, pathp, MAXPATHLEN);
+		if (*pathp == NULL)
+			return;
+	}
+
+	canon_path(p, upath, *pathp);
+
+	if (flags & ARG_UPATH1)
+		ar->k_ar.ar_valid_arg |= ARG_UPATH1;
+	else
+		ar->k_ar.ar_valid_arg |= ARG_UPATH2;
+}
+
+/*
+ * Function to save the path and vnode attr information into the audit 
+ * record. 
+ *
+ * It is assumed that the caller will hold any vnode locks necessary to
+ * perform a VOP_GETATTR() on the passed vnode.
+ *
+ * XXX: The attr code is very similar to vfs_vnops.c:vn_stat(), but
+ * always provides access to the generation number as we need that
+ * to construct the BSM file ID.
+ * XXX: We should accept the process argument from the caller, since
+ * it's very likely they already have a reference.
+ * XXX: Error handling in this function is poor.
+ */ +void +audit_arg_vnpath(struct vnode *vp, u_int64_t flags) +{ + struct kaudit_record *ar; + struct vattr vattr; + int error; + int len; + char **pathp; + struct vnode_au_info *vnp; + struct proc *p; + + if (vp == NULL) + return; + + ar = currecord(); + if (ar == NULL) /* This will be the case for unaudited system calls */ + return; + + if (flags & (ARG_VNODE1 | ARG_VNODE2) == 0) + return; + + p = current_proc(); + + if (flags & ARG_VNODE1) { + ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_KPATH1); + ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE1); + pathp = &ar->k_ar.ar_arg_kpath1; + vnp = &ar->k_ar.ar_arg_vnode1; + } + else { + ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_KPATH2); + ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE2); + pathp = &ar->k_ar.ar_arg_kpath2; + vnp = &ar->k_ar.ar_arg_vnode2; + } + + if (*pathp == NULL) { + kmem_alloc(kernel_map, pathp, MAXPATHLEN); + if (*pathp == NULL) + return; + } + + /* Copy the path looked up by the vn_getpath() function */ + len = MAXPATHLEN; + vn_getpath(vp, *pathp, &len); + if (flags & ARG_VNODE1) + ar->k_ar.ar_valid_arg |= ARG_KPATH1; + else + ar->k_ar.ar_valid_arg |= ARG_KPATH2; + + /* + * XXX: We'd assert the vnode lock here, only Darwin doesn't + * appear to have vnode locking assertions. + */ + error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); + if (error) { + /* XXX: How to handle this case? 
*/ + return; + } + + vnp->vn_mode = vattr.va_mode; + vnp->vn_uid = vattr.va_uid; + vnp->vn_gid = vattr.va_gid; + vnp->vn_dev = vattr.va_rdev; + vnp->vn_fsid = vattr.va_fsid; + vnp->vn_fileid = vattr.va_fileid; + vnp->vn_gen = vattr.va_gen; + if (flags & ARG_VNODE1) + ar->k_ar.ar_valid_arg |= ARG_VNODE1; + else + ar->k_ar.ar_valid_arg |= ARG_VNODE2; + +} + +#else /* !AUDIT */ + +void +audit_init(void) +{ + +} + +void +audit_shutdown(void) +{ + +} + +int +audit(struct proc *p, struct audit_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +auditon(struct proc *p, struct auditon_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +auditsvc(struct proc *p, struct auditsvc_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +getauid(struct proc *p, struct getauid_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +setauid(struct proc *p, struct setauid_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +getaudit(struct proc *p, struct getaudit_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +setaudit(struct proc *p, struct setaudit_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +getaudit_addr(struct proc *p, struct getaudit_addr_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +setaudit_addr(struct proc *p, struct setaudit_addr_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +int +auditctl(struct proc *p, struct auditctl_args *uap, register_t *retval) +{ + return (ENOSYS); +} + +void +audit_proc_init(struct proc *p) +{ + +} + +void +audit_proc_fork(struct proc *parent, struct proc *child) +{ + +} + +void +audit_proc_free(struct proc *p) +{ + +} + +#endif /* AUDIT */ diff -urN xnu-344.49/bsd/kern/kern_bsm_audit.c xnu-517/bsd/kern/kern_bsm_audit.c --- xnu-344.49/bsd/kern/kern_bsm_audit.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/kern/kern_bsm_audit.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,756 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. 
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/* NOTE(review): the header names of the #include directives below are
+ * missing from this copy of the patch; restore them from the xnu-517 tree. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/* The number of BSM records allocated. */
+static int bsm_rec_count = 0;
+
+/*
+ * Records that can be recycled are maintained in the list given below
+ * The maximum number of elements that can be present in this list is
+ * bounded by MAX_AUDIT_RECORDS. Memory allocated for these records are never
+ * freed
+ */
+LIST_HEAD(, au_record) bsm_free_q;
+
+/*
+ * Lock for serializing access to the list of audit records.
+ * It is also held around every update of bsm_rec_count.
+ */
+static mutex_t *bsm_audit_mutex;
+
+/*
+ * Initialize the BSM auditing subsystem.
+ * Prints a banner, initializes the (empty) list of recyclable records and
+ * allocates the mutex that protects that list.
+ */
+void
+kau_init(void)
+{
+	printf("BSM auditing present\n");
+	LIST_INIT(&bsm_free_q);
+	bsm_audit_mutex = mutex_alloc(ETAP_NO_TRACE);
+}
+
+/*
+ * This call reserves memory for the audit record.
+ * Memory must be guaranteed before any auditable event can be
+ * generated.
+ * The au_record structure maintains a reference to the
+ * memory allocated above and also the list of tokens associated
+ * with this record
+ *
+ * Returns a record with a zeroed data buffer and an empty token list, or
+ * NULL when MAX_AUDIT_RECORDS records already exist and none is free, or
+ * when memory for a new record cannot be allocated.
+ */
+struct au_record *
+kau_open(void)
+{
+	struct au_record *rec = NULL;
+
+	/*
+	 * Find an unused record and remove it from the free list.  If there
+	 * is none, reserve a slot for a new record while the lock is still
+	 * held; checking the limit and incrementing the count in separate
+	 * critical sections would let concurrent callers push bsm_rec_count
+	 * past MAX_AUDIT_RECORDS.
+	 */
+	mutex_lock(bsm_audit_mutex);
+	if (!LIST_EMPTY(&bsm_free_q)) {
+		rec = LIST_FIRST(&bsm_free_q);
+		LIST_REMOVE(rec, au_rec_q);
+	} else if (bsm_rec_count >= MAX_AUDIT_RECORDS) {
+		/* XXX We need to increase size of MAX_AUDIT_RECORDS */
+		mutex_unlock(bsm_audit_mutex);
+		return NULL;
+	} else {
+		bsm_rec_count++;
+	}
+	mutex_unlock(bsm_audit_mutex);
+
+	if (rec == NULL) {
+		/*
+		 * Create a new BSM kernel record.
+		 */
+		kmem_alloc(kernel_map, &rec, sizeof(*rec));
+		if (rec != NULL) {
+			/*
+			 * Start from a known value so that the NULL test
+			 * below is meaningful even if kmem_alloc() leaves
+			 * the out parameter untouched on failure.
+			 */
+			rec->data = NULL;
+			kmem_alloc(kernel_map, &rec->data,
+				   MAX_AUDIT_RECORD_SIZE * sizeof(u_char));
+			if (rec->data == NULL) {
+				kmem_free(kernel_map, rec, sizeof(*rec));
+				rec = NULL;
+			}
+		}
+		if (rec == NULL) {
+			/* Allocation failed: give the reserved slot back. */
+			mutex_lock(bsm_audit_mutex);
+			bsm_rec_count--;
+			mutex_unlock(bsm_audit_mutex);
+			return NULL;
+		}
+	}
+	memset(rec->data, 0, MAX_AUDIT_RECORD_SIZE);
+
+	TAILQ_INIT(&rec->token_q);
+	rec->len = 0;
+	rec->used = 1;
+
+	return rec;
+}
+
+/*
+ * Store the token with the record descriptor
+ *
+ * The token is appended to the record's token list and the record length
+ * grows by the token length.  Returns 0 on success and -1 if either the
+ * record or the token is NULL.
+ */
+int kau_write(struct au_record *rec, struct au_token *tok)
+{
+	if(rec == NULL || tok == NULL) {
+		return -1; /* Invalid Record or Token */
+	}
+
+	/* Add the token to the tail */
+	/*
+	 * XXX Not locking here -- we should not be writing to
+	 * XXX the same audit record from different threads
+	 */
+	TAILQ_INSERT_TAIL(&rec->token_q, tok, tokens);
+
+	rec->len += tok->len; /* grow record length by token size bytes */
+
+	return 0;
+}
+
+/*
+ * Close out the audit record by adding the header token, identifying
+ * any missing tokens. Write out the tokens to the record memory.
+ *
+ * Returns 0 when the complete record (header, tokens, trailer) has been
+ * serialized, and -1 when the record would exceed MAX_AUDIT_RECORD_SIZE
+ * (nothing is serialized) or when the header or trailer token could not
+ * be created (the remaining tokens are still serialized).
+ */
+int kau_close(struct au_record *rec, struct timespec *ctime, short event)
+{
+	u_char *dptr;
+	size_t tot_rec_size;
+	token_t *cur, *hdr, *trail;
+	int retval = 0;
+
+	tot_rec_size = rec->len + HEADER_SIZE + TRAILER_SIZE;
+	if(tot_rec_size <= MAX_AUDIT_RECORD_SIZE) {
+		/* Create the header token */
+		hdr = kau_to_header32(ctime, tot_rec_size, event, 0);
+
+		if(hdr != NULL) {
+			/* Add to head of list */
+			TAILQ_INSERT_HEAD(&rec->token_q, hdr, tokens);
+
+			trail = au_to_trailer(tot_rec_size);
+			if(trail != NULL) {
+				TAILQ_INSERT_TAIL(&rec->token_q, trail, tokens);
+			} else {
+				retval = -1;	/* record lacks its trailer */
+			}
+		} else {
+			retval = -1;	/* record lacks header and trailer */
+		}
+		/* Serialize token data to the record */
+
+		rec->len = tot_rec_size;
+		dptr = rec->data;
+		TAILQ_FOREACH(cur, &rec->token_q, tokens) {
+			memcpy(dptr, cur->t_data, cur->len);
+			dptr += cur->len;
+		}
+	} else {
+		retval = -1;	/* too large; nothing was serialized */
+	}
+
+	/*
+	 * The function is declared int, so it must return a value on every
+	 * path; falling off the end leaves the result indeterminate.
+	 */
+	return retval;
+}
+
+/*
+ * Free a BSM audit record by releasing all the tokens and clearing the
+ * audit record information.  The record itself is not deallocated; it is
+ * put back on the free list for reuse by kau_open().
+ */
+void kau_free(struct au_record *rec)
+{
+	struct au_token *tok;
+
+	/* Free the token list */
+	while ((tok = TAILQ_FIRST(&rec->token_q))) {
+		TAILQ_REMOVE(&rec->token_q, tok, tokens);
+		kmem_free(kernel_map, tok->t_data, tok->len);
+		kmem_free(kernel_map, tok, sizeof(struct au_token));
+	}
+
+	rec->used = 0;
+	rec->len = 0;
+
+	mutex_lock(bsm_audit_mutex);
+
+	/* Add the record to the freelist */
+	LIST_INSERT_HEAD(&bsm_free_q, rec, au_rec_q);
+
+	mutex_unlock(bsm_audit_mutex);
+
+}
+
+/*
+ * XXX May want to turn some (or all) of these macros into functions in order
+ * to reduce the generated code size.
+ *
+ * Each macro below expands in place and relies on these names from the
+ * expanding scope: ar (the audit record being converted), rec (the BSM
+ * record being built), tok (scratch token pointer) and, for the vnode
+ * variants, vattr (scratch struct vattr).
+ */
+/* Emit a path token for user path 1 if that argument is valid. */
+#define UPATH1_TOKENS \
+	do { \
+		if (ar->ar_valid_arg & ARG_UPATH1) { \
+			tok = au_to_path(ar->ar_arg_upath1); \
+			kau_write(rec, tok); \
+		} \
+	} while (0)
+
+/* Emit a path token for user path 2 if that argument is valid. */
+#define UPATH2_TOKENS \
+	do { \
+		if (ar->ar_valid_arg & ARG_UPATH2) { \
+			tok = au_to_path(ar->ar_arg_upath2); \
+			kau_write(rec, tok); \
+		} \
+	} while (0)
+
+/* Emit kernel path 1 and vnode 1 attribute tokens, each only if valid. */
+#define KPATH1_VNODE1_TOKENS \
+	do { \
+		if (ar->ar_valid_arg & ARG_KPATH1) { \
+			tok = au_to_path(ar->ar_arg_kpath1); \
+			kau_write(rec, tok); \
+		} \
+		if (ar->ar_valid_arg & ARG_VNODE1) { \
+			fill_vattr(&vattr, &ar->ar_arg_vnode1); \
+			tok = au_to_attr32(&vattr); \
+			kau_write(rec, tok); \
+		} \
+	} while (0)
+
+/*
+ * Like KPATH1_VNODE1_TOKENS, but fall back to user path 1 when kernel
+ * path 1 is not valid.
+ */
+#define KPATH1_VNODE1_OR_UPATH1_TOKENS \
+	do { \
+		if (ar->ar_valid_arg & ARG_KPATH1) { \
+			tok = au_to_path(ar->ar_arg_kpath1); \
+			kau_write(rec, tok); \
+		} else { \
+			UPATH1_TOKENS; \
+		} \
+		if (ar->ar_valid_arg & ARG_VNODE1) { \
+			fill_vattr(&vattr, &ar->ar_arg_vnode1); \
+			tok = au_to_attr32(&vattr); \
+			kau_write(rec, tok); \
+		} \
+	} while (0)
+
+/* Emit kernel path 2 and vnode 2 attribute tokens, each only if valid. */
+#define KPATH2_VNODE2_TOKENS \
+	do { \
+		if (ar->ar_valid_arg & ARG_KPATH2) { \
+			tok = au_to_path(ar->ar_arg_kpath2); \
+			kau_write(rec, tok); \
+		} \
+		if (ar->ar_valid_arg & ARG_VNODE2) { \
+			fill_vattr(&vattr, &ar->ar_arg_vnode2); \
+			tok = au_to_attr32(&vattr); \
+			kau_write(rec, tok); \
+		} \
+	} while (0)
+
+/*
+ * For descriptor-based calls: emit kernel path 1 (and vnode 1 attributes
+ * when valid); if no kernel path is valid, emit the descriptor number in
+ * an argument token instead.
+ */
+#define FD_KPATH1_VNODE1_TOKENS \
+	do { \
+		if (ar->ar_valid_arg & ARG_KPATH1) { \
+			tok = au_to_path(ar->ar_arg_kpath1); \
+			kau_write(rec, tok); \
+			if (ar->ar_valid_arg & ARG_VNODE1) { \
+				fill_vattr(&vattr, &ar->ar_arg_vnode1); \
+				tok = au_to_attr32(&vattr); \
+				kau_write(rec, tok); \
+			} \
+		} else { \
+			tok = au_to_arg32(1, "no path: fd", ar->ar_arg_fd); \
+			kau_write(rec, tok); \
+		} \
+	} while (0)
+
+/*
+ * Convert an internal kernel audit record to a BSM record and return
+ * a success/failure indicator. The BSM record is passed as an out
+ * parameter to this function.
+ * Return conditions:
+ *   BSM_SUCCESS: The BSM record is valid
+ *   BSM_FAILURE: Failure; the BSM record is NULL.
+ *   BSM_NOAUDIT: The event is not auditable for BSM; the BSM record is NULL.
+ *
+ * Note that for some events (msgctl, semctl, shmctl, open) ar_event in the
+ * kernel record is rewritten to the more specific BSM event before the
+ * record is closed.
+ */
+int
+kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau)
+{
+	struct au_token *tok, *subj_tok;
+	struct au_record *rec;
+	au_tid_t tid;
+	struct audit_record *ar;
+	struct vattr vattr;
+	int sorf;
+	int ctr;
+
+	*pau = NULL;
+	if (kar == NULL)
+		return (BSM_FAILURE);
+
+	ar = &kar->k_ar;
+
+	/*
+	 * Decide whether to create the BSM audit record by checking the
+	 * error value from the system call and using the appropriate
+	 * user audit mask.
+	 */
+	if (ar->ar_errno)
+		sorf = AU_PRS_FAILURE;
+	else
+		sorf = AU_PRS_SUCCESS;
+
+	if (au_preselect(ar->ar_event, &ar->ar_subj_amask, sorf) == 0)
+		return (BSM_NOAUDIT);
+
+	rec = kau_open();
+	if (rec == NULL)
+		return (BSM_FAILURE);
+
+	/* Create the subject token */
+	tid.port = ar->ar_subj_term.port;
+	tid.machine = ar->ar_subj_term.machine;
+	subj_tok = au_to_subject32(ar->ar_subj_auid,	/* audit ID */
+		ar->ar_subj_cred.cr_uid,	/* eff uid */
+		ar->ar_subj_egid,	/* eff group id */
+		ar->ar_subj_ruid,	/* real uid */
+		ar->ar_subj_rgid,	/* real group id */
+		ar->ar_subj_pid,	/* process id */
+		ar->ar_subj_asid,	/* session ID */
+		&tid);
+
+	/* The logic inside each case fills in the tokens required for the
+	 * event, except for the header, trailer, and return tokens. The
+	 * header and trailer tokens are added by the kau_close() function.
+	 * The return token is added outside of the switch statement.
+	 */
+	switch(ar->ar_event) {
+
+	/*
+	 * Socket-related events.
+	 */
+	case AUE_ACCEPT:
+	case AUE_BIND:
+	case AUE_CONNECT:
+	case AUE_RECVFROM:
+	case AUE_RECVMSG:
+	case AUE_SENDMSG:
+	case AUE_SENDTO:
+		tok = au_to_arg32(1, "fd", ar->ar_arg_fd);
+		kau_write(rec, tok);
+		if (ar->ar_valid_arg & ARG_SADDRINET) {
+			tok = au_to_sock_inet(
+				(struct sockaddr_in *)&ar->ar_arg_sockaddr);
+			kau_write(rec, tok);
+		}
+		if (ar->ar_valid_arg & ARG_SADDRUNIX) {
+			tok = au_to_sock_unix(
+				(struct sockaddr_un *)&ar->ar_arg_sockaddr);
+			kau_write(rec, tok);
+			UPATH1_TOKENS;
+		}
+		/* XXX Need to handle ARG_SADDRINET6 */
+		break;
+
+	case AUE_SOCKET:
+	case AUE_SOCKETPAIR:
+		tok = au_to_arg32(1,"domain", ar->ar_arg_sockinfo.sodomain);
+		kau_write(rec, tok);
+		tok = au_to_arg32(2,"type", ar->ar_arg_sockinfo.sotype);
+		kau_write(rec, tok);
+		tok = au_to_arg32(3,"protocol", ar->ar_arg_sockinfo.soprotocol);
+		kau_write(rec, tok);
+		break;
+
+	case AUE_SETSOCKOPT:
+	case AUE_SHUTDOWN:
+		tok = au_to_arg32(1, "fd", ar->ar_arg_fd);
+		kau_write(rec, tok);
+		break;
+
+	case AUE_SETAUID:
+		tok = au_to_arg32(2, "setauid", ar->ar_arg_auid);
+		kau_write(rec, tok);
+		/* fall through */
+	case AUE_ADJTIME:
+	case AUE_AUDIT:
+	case AUE_EXIT:
+	case AUE_GETAUID:
+	case AUE_GETFSSTAT:
+	case AUE_PIPE:
+	case AUE_SETPGRP:
+	case AUE_SETRLIMIT:
+		/* Header, subject, and return tokens added at end */
+		break;
+
+	case AUE_ACCESS:
+	case AUE_CHDIR:
+	case AUE_CHROOT:
+	case AUE_EXECVE:
+	case AUE_GETATTRLIST:
+	case AUE_GETFH:
+	case AUE_LSTAT:
+	case AUE_MKFIFO:
+	case AUE_PATHCONF:
+	case AUE_READLINK:
+	case AUE_REVOKE:
+	case AUE_RMDIR:
+	case AUE_SEARCHFS:
+	case AUE_SETATTRLIST:
+	case AUE_STAT:
+	case AUE_STATFS:
+	case AUE_TRUNCATE:
+	case AUE_UNDELETE:
+	case AUE_UNLINK:
+	case AUE_UTIMES:
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_CHFLAGS:
+		tok = au_to_arg32(2, "flags", ar->ar_arg_fflags);
+		kau_write(rec, tok);
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_CHMOD:
+		tok = au_to_arg32(2, "new file mode", ar->ar_arg_mode);
+		kau_write(rec, tok);
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_CHOWN:
+		tok = au_to_arg32(2, "new file uid", ar->ar_arg_uid);
+		kau_write(rec, tok);
+		tok = au_to_arg32(3, "new file gid", ar->ar_arg_gid);
+		kau_write(rec, tok);
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_EXCHANGEDATA:
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		KPATH2_VNODE2_TOKENS;
+		break;
+
+/*
+ * XXXAUDIT: Close is not audited in the kernel yet.
+	case AUE_CLOSE:
+		tok = au_to_arg32(2, "fd", ar->ar_arg_fd);
+		kau_write(rec, tok);
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+*/
+	case AUE_FCHMOD:
+		tok = au_to_arg32(2, "new file mode", ar->ar_arg_mode);
+		kau_write(rec, tok);
+		FD_KPATH1_VNODE1_TOKENS;
+		break;
+
+	case AUE_FCHDIR:
+	case AUE_FPATHCONF:
+	case AUE_FSTAT:		/* XXX Need to handle sockets and shm */
+	case AUE_FSTATFS:
+	case AUE_FTRUNCATE:
+	case AUE_FUTIMES:
+	case AUE_GETDIRENTRIES:
+	case AUE_GETDIRENTRIESATTR:
+		FD_KPATH1_VNODE1_TOKENS;
+		break;
+
+	case AUE_FCHOWN:
+		tok = au_to_arg32(2, "new file uid", ar->ar_arg_uid);
+		kau_write(rec, tok);
+		tok = au_to_arg32(3, "new file gid", ar->ar_arg_gid);
+		kau_write(rec, tok);
+		FD_KPATH1_VNODE1_TOKENS;
+		break;
+
+	case AUE_FCNTL:
+		if (ar->ar_arg_cmd == F_GETLK || ar->ar_arg_cmd == F_SETLK ||
+			ar->ar_arg_cmd == F_SETLKW) {
+			tok = au_to_arg32(2, "cmd", ar->ar_arg_cmd);
+			kau_write(rec, tok);
+			FD_KPATH1_VNODE1_TOKENS;
+		}
+		break;
+
+	case AUE_FCHFLAGS:
+		tok = au_to_arg32(2, "flags", ar->ar_arg_fflags);
+		kau_write(rec, tok);
+		FD_KPATH1_VNODE1_TOKENS;
+		break;
+
+	case AUE_FLOCK:
+		tok = au_to_arg32(2, "operation", ar->ar_arg_cmd);
+		kau_write(rec, tok);
+		FD_KPATH1_VNODE1_TOKENS;
+		break;
+
+	case AUE_LINK:
+	case AUE_RENAME:
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		UPATH2_TOKENS;
+		break;
+
+	case AUE_MKDIR:
+		tok = au_to_arg32(2, "mode", ar->ar_arg_mode);
+		kau_write(rec, tok);
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_MKNOD:
+		tok = au_to_arg32(2, "mode", ar->ar_arg_mode);
+		kau_write(rec, tok);
+		tok = au_to_arg32(3, "dev", ar->ar_arg_dev);
+		kau_write(rec, tok);
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_MOUNT:
+		/* XXX Need to handle NFS mounts */
+		tok = au_to_arg32(3, "flags", ar->ar_arg_fflags);
+		kau_write(rec, tok);
+		if (ar->ar_arg_text != NULL) {
+			tok = au_to_text(ar->ar_arg_text);
+			kau_write(rec, tok);
+		}
+		/* fall through */
+	case AUE_UMOUNT:
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_MSGCTL:
+		ar->ar_event = msgctl_to_event(ar->ar_arg_svipc_cmd);
+		/* Fall through */
+	case AUE_MSGRCV:
+	case AUE_MSGSND:
+		tok = au_to_arg32(1, "msg ID", ar->ar_arg_svipc_id);
+		kau_write(rec, tok);
+		if (ar->ar_errno != EINVAL) {
+			tok = au_to_ipc(AT_IPC_MSG, ar->ar_arg_svipc_id);
+			kau_write(rec, tok);
+		}
+		break;
+
+	case AUE_MSGGET:
+		if (ar->ar_errno == 0) {
+			tok = au_to_ipc(AT_IPC_MSG, ar->ar_arg_svipc_id);
+			kau_write(rec, tok);
+		}
+		break;
+
+	case AUE_OPEN_R:
+	case AUE_OPEN_RC:
+	case AUE_OPEN_RTC:
+	case AUE_OPEN_RT:
+	case AUE_OPEN_RW:
+	case AUE_OPEN_RWC:
+	case AUE_OPEN_RWTC:
+	case AUE_OPEN_RWT:
+	case AUE_OPEN_W:
+	case AUE_OPEN_WC:
+	case AUE_OPEN_WTC:
+	case AUE_OPEN_WT:
+		/* The open syscall always writes a OPEN_R event; convert the
+		 * file flags to the proper type of event.
+		 */
+		ar->ar_event = flags_to_openevent(ar->ar_arg_fflags);
+		UPATH1_TOKENS;		/* Save the user space path */
+		KPATH1_VNODE1_TOKENS;	/* Audit the kernel path as well */
+		break;
+
+	case AUE_QUOTACTL:
+		tok = au_to_arg32(2, "command", ar->ar_arg_cmd);
+		kau_write(rec, tok);
+		tok = au_to_arg32(3, "uid", ar->ar_arg_uid);
+		kau_write(rec, tok);
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_SEMCTL:
+		ar->ar_event = semctl_to_event(ar->ar_arg_svipc_cmd);
+		/* Fall through */
+	case AUE_SEMOP:
+		tok = au_to_arg32(1, "sem ID", ar->ar_arg_svipc_id);
+		kau_write(rec, tok);
+		if (ar->ar_errno != EINVAL) {
+			tok = au_to_ipc(AT_IPC_SEM, ar->ar_arg_svipc_id);
+			kau_write(rec, tok);
+		}
+		break;
+	case AUE_SEMGET:
+		if (ar->ar_errno == 0) {
+			tok = au_to_ipc(AT_IPC_SEM, ar->ar_arg_svipc_id);
+			kau_write(rec, tok);
+		}
+		break;
+	case AUE_SETEGID:
+		tok = au_to_arg32(1, "gid", ar->ar_arg_egid);
+		kau_write(rec, tok);
+		break;
+	case AUE_SETEUID:
+		tok = au_to_arg32(1, "uid", ar->ar_arg_euid);
+		kau_write(rec, tok);
+		break;
+	case AUE_SETGID:
+		tok = au_to_arg32(1, "gid", ar->ar_arg_gid);
+		kau_write(rec, tok);
+		break;
+	case AUE_SETUID:
+		tok = au_to_arg32(1, "uid", ar->ar_arg_uid);
+		kau_write(rec, tok);
+		break;
+	case AUE_SETGROUPS:
+		if (ar->ar_valid_arg & ARG_GROUPSET) {
+			for(ctr = 0; ctr < ar->ar_arg_groups.gidset_size; ctr++)
+			{
+				tok = au_to_arg32(1, "setgroups", ar->ar_arg_groups.gidset[ctr]);
+				kau_write(rec, tok);
+			}
+		}
+		break;
+	case AUE_SHMAT:
+		tok = au_to_arg32(1, "shmid", ar->ar_arg_svipc_id);
+		kau_write(rec, tok);
+		tok = au_to_arg32(2, "shmaddr", (int)ar->ar_arg_svipc_addr);
+		kau_write(rec, tok);
+		if (ar->ar_valid_arg & ARG_SVIPC_PERM) {
+			tok = au_to_ipc(AT_IPC_SHM, ar->ar_arg_svipc_id);
+			kau_write(rec, tok);
+			tok = au_to_ipc_perm(&ar->ar_arg_svipc_perm);
+			kau_write(rec, tok);
+		}
+		break;
+
+	case AUE_SHMCTL:
+		tok = au_to_arg32(1, "shmid", ar->ar_arg_svipc_id);
+		kau_write(rec, tok);
+		switch (ar->ar_arg_svipc_cmd) {
+		case IPC_STAT:
+			ar->ar_event = AUE_SHMCTL_STAT;
+			if (ar->ar_valid_arg & ARG_SVIPC_PERM) {
+				tok = au_to_ipc(AT_IPC_SHM,
+						ar->ar_arg_svipc_id);
+				kau_write(rec, tok);
+			}
+			break;
+		case IPC_RMID:
+			ar->ar_event = AUE_SHMCTL_RMID;
+			if (ar->ar_valid_arg & ARG_SVIPC_PERM) {
+				tok = au_to_ipc(AT_IPC_SHM,
+						ar->ar_arg_svipc_id);
+				kau_write(rec, tok);
+			}
+			break;
+		case IPC_SET:
+			ar->ar_event = AUE_SHMCTL_SET;
+			if (ar->ar_valid_arg & ARG_SVIPC_PERM) {
+				tok = au_to_ipc(AT_IPC_SHM,
+						ar->ar_arg_svipc_id);
+				kau_write(rec, tok);
+				tok = au_to_ipc_perm(&ar->ar_arg_svipc_perm);
+				kau_write(rec, tok);
+			}
+			break;
+		default:
+			break;	/* We will audit a bad command */
+		}
+		break;
+
+	case AUE_SHMDT:
+		tok = au_to_arg32(1, "shmaddr", (int)ar->ar_arg_svipc_addr);
+		kau_write(rec, tok);
+		break;
+
+	case AUE_SHMGET:
+		/* This is unusual; the return value is in an argument token */
+		tok = au_to_arg32(0, "shmid", ar->ar_arg_svipc_id);
+		kau_write(rec, tok);
+		if (ar->ar_valid_arg & ARG_SVIPC_PERM) {
+			tok = au_to_ipc(AT_IPC_SHM, ar->ar_arg_svipc_id);
+			kau_write(rec, tok);
+			tok = au_to_ipc_perm(&ar->ar_arg_svipc_perm);
+			kau_write(rec, tok);
+		}
+		break;
+
+	case AUE_SYMLINK:
+		if (ar->ar_valid_arg & ARG_TEXT) {
+			tok = au_to_text(ar->ar_arg_text);
+			kau_write(rec, tok);
+		}
+		KPATH1_VNODE1_OR_UPATH1_TOKENS;
+		break;
+
+	case AUE_UMASK:
+		tok = au_to_arg32(1, "new mask", ar->ar_arg_mask);
+		kau_write(rec, tok);
+		tok = au_to_arg32(0, "prev mask", ar->ar_retval);
+		kau_write(rec, tok);
+		break;
+
+	default: /* We shouldn't fall through to here. */
+		printf("BSM conversion requested for unknown event %d\n",
+			ar->ar_event);
+		/*
+		 * The subject token has already been allocated but is not
+		 * yet attached to the record.  Attach it first so that
+		 * kau_free() releases it together with the other tokens;
+		 * otherwise it would be leaked.  kau_write() ignores a
+		 * NULL token.
+		 */
+		kau_write(rec, subj_tok);
+		kau_free(rec);
+		return BSM_NOAUDIT;
+	}
+
+	kau_write(rec, subj_tok);
+	tok = au_to_return32((char)ar->ar_errno, ar->ar_retval);
+	kau_write(rec, tok);	/* Every record gets a return token */
+
+	kau_close(rec, &ar->ar_endtime, ar->ar_event);
+
+	*pau = rec;
+	return BSM_SUCCESS;
+}
+
+/*
+ * Verify that a record is a valid BSM record.
This verification is
+ * simple now, but may be expanded on sometime in the future.
+ * Return 1 if the record is good, 0 otherwise.
+ *
+ */
+int
+bsm_rec_verify(caddr_t rec)
+{
+	/*
+	 * Only the first byte is inspected: it carries the ID of the first
+	 * token, which has to be one of the header tokens.
+	 */
+	/* XXXAUDIT There needs to be a token structure to map a token.
+	 * XXXAUDIT 'Shouldn't be simply looking at the first char.
+	 */
+	char first_id = (char)*rec;
+
+	if (first_id == AU_HEADER_32_TOKEN ||
+	    first_id == AU_HEADER_EX_32_TOKEN ||
+	    first_id == AU_HEADER_64_TOKEN ||
+	    first_id == AU_HEADER_EX_64_TOKEN)
+		return (1);
+
+	return (0);
+}
diff -urN xnu-344.49/bsd/kern/kern_bsm_klib.c xnu-517/bsd/kern/kern_bsm_klib.c
--- xnu-344.49/bsd/kern/kern_bsm_klib.c	Thu Jan 1 01:00:00 1970
+++ xnu-517/bsd/kern/kern_bsm_klib.c	Sat Oct 25 00:25:25 2003
@@ -0,0 +1,756 @@
+/*
+ * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Initialize the system call to audit event mapping table. This table + * must be kept in sync with the system call table. This table is meant to + * be directly accessed. + * XXX This should be improved, though, to make it independent of the syscall + * table (but we don't want to traverse a large table for every system call + * to find a match). Ultimately, it would be best to place the audit event + * number in the system call table. + */ +au_event_t sys_au_event[] = { + AUE_NULL, /* 0 = indir */ + AUE_EXIT, /* 1 = exit */ + AUE_NULL, /* 2 = fork */ + AUE_NULL, /* 3 = read */ + AUE_NULL, /* 4 = write */ + AUE_OPEN_R, /* 5 = open */ + AUE_NULL, /* 6 = close */ + AUE_NULL, /* 7 = wait4 */ + AUE_NULL, /* 8 = old creat */ + AUE_LINK, /* 9 = link */ + AUE_UNLINK, /* 10 = unlink */ + AUE_NULL, /* 11 was obsolete execv */ + AUE_CHDIR, /* 12 = chdir */ + AUE_FCHDIR, /* 13 = fchdir */ + AUE_MKNOD, /* 14 = mknod */ + AUE_CHMOD, /* 15 = chmod */ + AUE_CHOWN, /* 16 = chown; now 3 args */ + AUE_NULL, /* 17 = old break */ +#if COMPAT_GETFSSTAT + AUE_NULL, /* 18 = ogetfsstat */ +#else + AUE_GETFSSTAT, /* 18 = getfsstat */ +#endif + AUE_NULL, /* 19 = old lseek */ + AUE_NULL, /* 20 = getpid */ + AUE_NULL, /* 21 was obsolete mount */ + AUE_NULL, /* 22 was obsolete umount */ + AUE_SETUID, /* 23 = setuid */ + AUE_NULL, /* 24 = getuid */ + AUE_NULL, /* 25 = geteuid */ + AUE_NULL, /* 26 = ptrace */ + AUE_RECVMSG, /* 27 = recvmsg */ + AUE_SENDMSG, /* 28 = sendmsg */ + AUE_RECVFROM, /* 29 = recvfrom */ + AUE_ACCEPT, /* 30 = accept */ + AUE_NULL, /* 31 = getpeername */ + AUE_NULL, /* 32 = getsockname */ + AUE_ACCESS, /* 33 = access */ + AUE_CHFLAGS, /* 34 = chflags */ + AUE_FCHFLAGS, /* 35 = fchflags */ + AUE_NULL, /* 36 = sync */ + AUE_NULL, /* 37 = kill */ + AUE_NULL, /* 38 = old stat */ + AUE_NULL, /* 39 = getppid */ + AUE_NULL, /* 40 = old lstat 
*/ + AUE_NULL, /* 41 = dup */ + AUE_PIPE, /* 42 = pipe */ + AUE_NULL, /* 43 = getegid */ + AUE_NULL, /* 44 = profil */ + AUE_NULL, /* 45 = ktrace */ + AUE_NULL, /* 46 = sigaction */ + AUE_NULL, /* 47 = getgid */ + AUE_NULL, /* 48 = sigprocmask */ + AUE_NULL, /* 49 = getlogin */ + AUE_NULL, /* 50 = setlogin */ + AUE_NULL, /* 51 = turn acct off/on */ + AUE_NULL, /* 52 = sigpending */ + AUE_NULL, /* 53 = sigaltstack */ + AUE_NULL, /* 54 = ioctl */ + AUE_NULL, /* 55 = reboot */ + AUE_REVOKE, /* 56 = revoke */ + AUE_SYMLINK, /* 57 = symlink */ + AUE_READLINK, /* 58 = readlink */ + AUE_EXECVE, /* 59 = execve */ + AUE_UMASK, /* 60 = umask */ + AUE_CHROOT, /* 61 = chroot */ + AUE_NULL, /* 62 = old fstat */ + AUE_NULL, /* 63 = used internally, reserved */ + AUE_NULL, /* 64 = old getpagesize */ + AUE_NULL, /* 65 = msync */ + AUE_NULL, /* 66 = vfork */ + AUE_NULL, /* 67 was obsolete vread */ + AUE_NULL, /* 68 was obsolete vwrite */ + AUE_NULL, /* 69 = sbrk */ + AUE_NULL, /* 70 = sstk */ + AUE_NULL, /* 71 = old mmap */ + AUE_NULL, /* 72 = old vadvise */ + AUE_NULL, /* 73 = munmap */ + AUE_NULL, /* 74 = mprotect */ + AUE_NULL, /* 75 = madvise */ + AUE_NULL, /* 76 was obsolete vhangup */ + AUE_NULL, /* 77 was obsolete vlimit */ + AUE_NULL, /* 78 = mincore */ + AUE_NULL, /* 79 = getgroups */ + AUE_SETGROUPS, /* 80 = setgroups */ + AUE_NULL, /* 81 = getpgrp */ + AUE_SETPGRP, /* 82 = setpgid */ + AUE_NULL, /* 83 = setitimer */ + AUE_NULL, /* 84 = old wait */ + AUE_NULL, /* 85 = swapon */ + AUE_NULL, /* 86 = getitimer */ + AUE_NULL, /* 87 = old gethostname */ + AUE_NULL, /* 88 = old sethostname */ + AUE_NULL, /* 89 getdtablesize */ + AUE_NULL, /* 90 = dup2 */ + AUE_NULL, /* 91 was obsolete getdopt */ + AUE_FCNTL, /* 92 = fcntl */ + AUE_NULL, /* 93 = select */ + AUE_NULL, /* 94 was obsolete setdopt */ + AUE_NULL, /* 95 = fsync */ + AUE_NULL, /* 96 = setpriority */ + AUE_SOCKET, /* 97 = socket */ + AUE_CONNECT, /* 98 = connect */ + AUE_NULL, /* 99 = accept */ + AUE_NULL, /* 100 = 
getpriority */ + AUE_NULL, /* 101 = old send */ + AUE_NULL, /* 102 = old recv */ + AUE_NULL, /* 103 = sigreturn */ + AUE_BIND, /* 104 = bind */ + AUE_SETSOCKOPT, /* 105 = setsockopt */ + AUE_NULL, /* 106 = listen */ + AUE_NULL, /* 107 was vtimes */ + AUE_NULL, /* 108 = sigvec */ + AUE_NULL, /* 109 = sigblock */ + AUE_NULL, /* 110 = sigsetmask */ + AUE_NULL, /* 111 = sigpause */ + AUE_NULL, /* 112 = sigstack */ + AUE_NULL, /* 113 = recvmsg */ + AUE_NULL, /* 114 = sendmsg */ + AUE_NULL, /* 115 = old vtrace */ + AUE_NULL, /* 116 = gettimeofday */ + AUE_NULL, /* 117 = getrusage */ + AUE_NULL, /* 118 = getsockopt */ + AUE_NULL, /* 119 = old resuba */ + AUE_NULL, /* 120 = readv */ + AUE_NULL, /* 121 = writev */ + AUE_NULL, /* 122 = settimeofday */ + AUE_FCHOWN, /* 123 = fchown */ + AUE_FCHMOD, /* 124 = fchmod */ + AUE_NULL, /* 125 = recvfrom */ + AUE_NULL, /* 126 = setreuid */ + AUE_NULL, /* 127 = setregid */ + AUE_RENAME, /* 128 = rename */ + AUE_NULL, /* 129 = old truncate */ + AUE_NULL, /* 130 = old ftruncate */ + AUE_FLOCK, /* 131 = flock */ + AUE_MKFIFO, /* 132 = mkfifo */ + AUE_SENDTO, /* 133 = sendto */ + AUE_SHUTDOWN, /* 134 = shutdown */ + AUE_SOCKETPAIR, /* 135 = socketpair */ + AUE_MKDIR, /* 136 = mkdir */ + AUE_RMDIR, /* 137 = rmdir */ + AUE_UTIMES, /* 138 = utimes */ + AUE_FUTIMES, /* 139 = futimes */ + AUE_ADJTIME, /* 140 = adjtime */ + AUE_NULL, /* 141 = getpeername */ + AUE_NULL, /* 142 = old gethostid */ + AUE_NULL, /* 143 = old sethostid */ + AUE_NULL, /* 144 = old getrlimit */ + AUE_NULL, /* 145 = old setrlimit */ + AUE_NULL, /* 146 = old killpg */ + AUE_NULL, /* 147 = setsid */ + AUE_NULL, /* 148 was setquota */ + AUE_NULL, /* 149 was qquota */ + AUE_NULL, /* 150 = getsockname */ + AUE_NULL, /* 151 = getpgid */ + AUE_NULL, /* 152 = setprivexec */ + AUE_NULL, /* 153 = pread */ + AUE_NULL, /* 154 = pwrite */ + AUE_NULL, /* 155 = nfs_svc */ + AUE_NULL, /* 156 = old getdirentries */ + AUE_STATFS, /* 157 = statfs */ + AUE_FSTATFS, /* 158 = fstatfs */ + 
AUE_UMOUNT, /* 159 = unmount */ + AUE_NULL, /* 160 was async_daemon */ + AUE_GETFH, /* 161 = get file handle */ + AUE_NULL, /* 162 = getdomainname */ + AUE_NULL, /* 163 = setdomainname */ + AUE_NULL, /* 164 */ +#if QUOTA + AUE_QUOTACTL, /* 165 = quotactl */ +#else /* QUOTA */ + AUE_NULL, /* 165 = not configured */ +#endif /* QUOTA */ + AUE_NULL, /* 166 was exportfs */ + AUE_MOUNT, /* 167 = mount */ + AUE_NULL, /* 168 was ustat */ + AUE_NULL, /* 169 = nosys */ + AUE_NULL, /* 170 was table */ + AUE_NULL, /* 171 = old wait3 */ + AUE_NULL, /* 172 was rpause */ + AUE_NULL, /* 173 = nosys */ + AUE_NULL, /* 174 was getdents */ + AUE_NULL, /* 175 was gc_control */ + AUE_NULL, /* 176 = add_profil */ + AUE_NULL, /* 177 */ + AUE_NULL, /* 178 */ + AUE_NULL, /* 179 */ + AUE_NULL, /* 180 */ + AUE_SETGID, /* 181 */ + AUE_SETEGID, /* 182 */ + AUE_SETEUID, /* 183 */ + AUE_NULL, /* 184 = nosys */ + AUE_NULL, /* 185 = nosys */ + AUE_NULL, /* 186 = nosys */ + AUE_NULL, /* 187 = nosys */ + AUE_STAT, /* 188 = stat */ + AUE_FSTAT, /* 189 = fstat */ + AUE_LSTAT, /* 190 = lstat */ + AUE_PATHCONF, /* 191 = pathconf */ + AUE_FPATHCONF, /* 192 = fpathconf */ + +#if COMPAT_GETFSSTAT + AUE_GETFSSTAT, /* 193 = getfsstat */ +#else + AUE_NULL, /* 193 is unused */ +#endif + AUE_NULL, /* 194 = getrlimit */ + AUE_SETRLIMIT, /* 195 = setrlimit */ + AUE_GETDIRENTRIES, /* 196 = getdirentries */ + AUE_NULL, /* 197 = mmap */ + AUE_NULL, /* 198 = __syscall */ + AUE_NULL, /* 199 = lseek */ + AUE_TRUNCATE, /* 200 = truncate */ + AUE_FTRUNCATE, /* 201 = ftruncate */ + AUE_NULL, /* 202 = __sysctl */ + AUE_NULL, /* 203 = mlock */ + AUE_NULL, /* 204 = munlock */ + AUE_UNDELETE, /* 205 = undelete */ + AUE_NULL, /* 206 = ATsocket */ + AUE_NULL, /* 207 = ATgetmsg*/ + AUE_NULL, /* 208 = ATputmsg*/ + AUE_NULL, /* 209 = ATPsndreq*/ + AUE_NULL, /* 210 = ATPsndrsp*/ + AUE_NULL, /* 211 = ATPgetreq*/ + AUE_NULL, /* 212 = ATPgetrsp*/ + AUE_NULL, /* 213 = Reserved for AppleTalk */ + AUE_NULL, /* 214 = Reserved for AppleTalk 
*/ + AUE_NULL, /* 215 = Reserved for AppleTalk */ + + AUE_NULL, /* 216 = HFS make complex file call (multiple forks) */ + AUE_NULL, /* 217 = HFS statv extended stat call for HFS */ + AUE_NULL, /* 218 = HFS lstatv extended lstat call for HFS */ + AUE_NULL, /* 219 = HFS fstatv extended fstat call for HFS */ + AUE_GETATTRLIST,/* 220 = HFS getattrlist get attribute list call */ + AUE_SETATTRLIST,/* 221 = HFS setattrlist set attribute list */ + AUE_GETDIRENTRIESATTR,/* 222 = HFS getdirentriesattr get directory attributes */ + AUE_EXCHANGEDATA,/* 223 = HFS exchangedata exchange file contents */ + AUE_NULL,/* 224 = HFS checkuseraccess check access to a file */ + AUE_SEARCHFS, /* 225 = HFS searchfs to implement catalog searching */ + AUE_NULL, /* 226 = private delete (Carbon semantics) */ + AUE_NULL, /* 227 = copyfile - originally for AFP */ + AUE_NULL, /* 228 */ + AUE_NULL, /* 229 */ + AUE_NULL, /* 230 */ + AUE_NULL, /* 231 */ + AUE_NULL, /* 232 */ + AUE_NULL, /* 233 */ + AUE_NULL, /* 234 */ + AUE_NULL, /* 235 */ + AUE_NULL, /* 236 */ + AUE_NULL, /* 237 */ + AUE_NULL, /* 238 */ + AUE_NULL, /* 239 */ + AUE_NULL, /* 240 */ + AUE_NULL, /* 241 */ + AUE_NULL, /* 242 = fsctl */ + AUE_NULL, /* 243 */ + AUE_NULL, /* 244 */ + AUE_NULL, /* 245 */ + AUE_NULL, /* 246 */ + AUE_NULL, /* 247 = nfsclnt*/ + AUE_NULL, /* 248 = fhopen */ + AUE_NULL, /* 249 */ + AUE_NULL, /* 250 = minherit */ + AUE_NULL, /* 251 = semsys */ + AUE_NULL, /* 252 = msgsys */ + AUE_NULL, /* 253 = shmsys */ + AUE_SEMCTL, /* 254 = semctl */ + AUE_SEMGET, /* 255 = semget */ + AUE_SEMOP, /* 256 = semop */ + AUE_NULL, /* 257 = semconfig */ + AUE_MSGCTL, /* 258 = msgctl */ + AUE_MSGGET, /* 259 = msgget */ + AUE_MSGSND, /* 260 = msgsnd */ + AUE_MSGRCV, /* 261 = msgrcv */ + AUE_SHMAT, /* 262 = shmat */ + AUE_SHMCTL, /* 263 = shmctl */ + AUE_SHMDT, /* 264 = shmdt */ + AUE_SHMGET, /* 265 = shmget */ + AUE_NULL, /* 266 = shm_open */ + AUE_NULL, /* 267 = shm_unlink */ + AUE_NULL, /* 268 = sem_open */ + AUE_NULL, /* 269 = 
sem_close */ + AUE_NULL, /* 270 = sem_unlink */ + AUE_NULL, /* 271 = sem_wait */ + AUE_NULL, /* 272 = sem_trywait */ + AUE_NULL, /* 273 = sem_post */ + AUE_NULL, /* 274 = sem_getvalue */ + AUE_NULL, /* 275 = sem_init */ + AUE_NULL, /* 276 = sem_destroy */ + AUE_NULL, /* 277 */ + AUE_NULL, /* 278 */ + AUE_NULL, /* 279 */ + AUE_NULL, /* 280 */ + AUE_NULL, /* 281 */ + AUE_NULL, /* 282 */ + AUE_NULL, /* 283 */ + AUE_NULL, /* 284 */ + AUE_NULL, /* 285 */ + AUE_NULL, /* 286 */ + AUE_NULL, /* 287 */ + AUE_NULL, /* 288 */ + AUE_NULL, /* 289 */ + AUE_NULL, /* 290 */ + AUE_NULL, /* 291 */ + AUE_NULL, /* 292 */ + AUE_NULL, /* 293 */ + AUE_NULL, /* 294 */ + AUE_NULL, /* 295 */ + AUE_NULL, /* 296 = load_shared_file */ + AUE_NULL, /* 297 = reset_shared_file */ + AUE_NULL, /* 298 = new_system_shared_regions */ + AUE_NULL, /* 299 */ + AUE_NULL, /* 300 */ + AUE_NULL, /* 301 */ + AUE_NULL, /* 302 */ + AUE_NULL, /* 303 */ + AUE_NULL, /* 304 */ + AUE_NULL, /* 305 */ + AUE_NULL, /* 306 */ + AUE_NULL, /* 307 */ + AUE_NULL, /* 308 */ + AUE_NULL, /* 309 */ + AUE_NULL, /* 310 = getsid */ + AUE_NULL, /* 311 */ + AUE_NULL, /* 312 */ + AUE_NULL, /* 313 */ + AUE_NULL, /* 314 */ + AUE_NULL, /* 315 */ + AUE_NULL, /* 316 */ + AUE_NULL, /* 317 */ + AUE_NULL, /* 318 */ + AUE_NULL, /* 319 */ + AUE_NULL, /* 320 */ + AUE_NULL, /* 321 */ + AUE_NULL, /* 322 */ + AUE_NULL, /* 323 */ + AUE_NULL, /* 324 = mlockall*/ + AUE_NULL, /* 325 = munlockall*/ + AUE_NULL, /* 326 */ + AUE_NULL, /* 327 = issetugid */ + AUE_NULL, /* 328 */ + AUE_NULL, /* 329 */ + AUE_NULL, /* 330 */ + AUE_NULL, /* 331 */ + AUE_NULL, /* 332 */ + AUE_NULL, /* 333 */ + AUE_NULL, /* 334 */ + AUE_NULL, /* 335 = utrace */ + AUE_NULL, /* 336 */ + AUE_NULL, /* 337 */ + AUE_NULL, /* 338 */ + AUE_NULL, /* 339 */ + AUE_NULL, /* 340 */ + AUE_NULL, /* 341 */ + AUE_NULL, /* 342 */ + AUE_NULL, /* 343 */ + AUE_NULL, /* 344 */ + AUE_NULL, /* 345 */ + AUE_NULL, /* 346 */ + AUE_NULL, /* 347 */ + AUE_NULL, /* 348 */ + AUE_NULL, /* 349 */ + AUE_AUDIT, /* 
350 */ + AUE_NULL, /* 351 */ + AUE_NULL, /* 352 */ + AUE_GETAUID, /* 353 */ + AUE_SETAUID, /* 354 */ + AUE_NULL, /* 355 */ + AUE_NULL, /* 356 */ + AUE_NULL, /* 357 */ + AUE_NULL, /* 358 */ + AUE_NULL, /* 359 */ + AUE_NULL, /* 360 */ + AUE_NULL, /* 361 */ + AUE_NULL, /* 362 = kqueue */ + AUE_NULL, /* 363 = kevent */ + AUE_NULL, /* 364 */ + AUE_NULL, /* 365 */ + AUE_NULL, /* 366 */ + AUE_NULL, /* 367 */ + AUE_NULL, /* 368 */ + AUE_NULL /* 369 */ +}; +int nsys_au_event = sizeof(sys_au_event) / sizeof(sys_au_event[0]); + +/* + * Check whether an event is auditable by comparing the mask of classes this + * event is part of against the kernel's preselection mask the given mask + * which will be the process event mask. + * + * XXX This needs to eventually implement the selection based on the + * event->class mapping that is controlled by a configuration file. + */ +int au_preselect(au_event_t event, au_mask_t *mask_p, int sorf) +{ + au_class_t ae_class; + au_class_t effmask = 0; + + if(mask_p == NULL) + return (-1); + + /* + * XXX Set the event class using a big ugly switch statement. This + * will change to use the mapping defined by a configuration file. + */ + switch (event) { + case AUE_MMAP: + case AUE_PIPE: + /* mmap() and pipe() are AU_NULL in some systems; we'll + * place them in AU_IPC for now.
+ */ + ae_class = AU_IPC; break; + case AUE_READLINK: + case AUE_GETDIRENTRIES: + ae_class = AU_FREAD; break; + case AUE_ACCESS: + case AUE_FSTAT: + case AUE_FSTATFS: + case AUE_GETFH: + case AUE_LSTAT: + case AUE_FPATHCONF: + case AUE_PATHCONF: + case AUE_STAT: + case AUE_STATFS: + case AUE_GETATTRLIST: + case AUE_GETFSSTAT: + case AUE_GETDIRENTRIESATTR: + case AUE_SEARCHFS: + ae_class = AU_FACCESS; break; + case AUE_CHMOD: + case AUE_CHOWN: + case AUE_FCHMOD: + case AUE_FCHOWN: + case AUE_FCNTL: + case AUE_FLOCK: + case AUE_UTIMES: + case AUE_CHFLAGS: + case AUE_FCHFLAGS: + case AUE_FUTIMES: + case AUE_SETATTRLIST: + case AUE_TRUNCATE: + case AUE_FTRUNCATE: + case AUE_UNDELETE: + case AUE_EXCHANGEDATA: + ae_class = AU_FMODIFY; break; + case AUE_LINK: + case AUE_MKDIR: + case AUE_MKNOD: + case AUE_SYMLINK: + case AUE_MKFIFO: + ae_class = AU_FCREATE; break; + case AUE_RMDIR: + case AUE_UNLINK: + ae_class = AU_FDELETE; break; + case AUE_CLOSE: + case AUE_MUNMAP: + case AUE_REVOKE: + ae_class = AU_CLOSE; break; + case AUE_CHDIR: + case AUE_CHROOT: + case AUE_EXIT: + case AUE_FCHDIR: + case AUE_FORK: + case AUE_KILL: + case AUE_SETEGID: + case AUE_SETEUID: + case AUE_SETGID: + case AUE_SETGROUPS: + case AUE_SETPGRP: + case AUE_SETUID: + case AUE_VFORK: + case AUE_UMASK: + ae_class = AU_PROCESS; break; + case AUE_ACCEPT: + case AUE_BIND: + case AUE_CONNECT: + case AUE_RECVFROM: + case AUE_RECVMSG: + case AUE_SENDMSG: + case AUE_SENDTO: + case AUE_SETSOCKOPT: + case AUE_SHUTDOWN: + case AUE_SOCKET: + case AUE_SOCKETPAIR: + ae_class = AU_NET; break; + case AUE_MSGCTL: + case AUE_MSGGET: + case AUE_MSGRCV: + case AUE_MSGSND: + case AUE_SEMCTL: + case AUE_SEMGET: + case AUE_SEMOP: + case AUE_SHMAT: + case AUE_SHMCTL: + case AUE_SHMDT: + case AUE_SHMGET: + ae_class = AU_IPC; break; + case AUE_ACCT: + case AUE_ADJTIME: + case AUE_GETAUID: + case AUE_MOUNT: + case AUE_SETAUID: + case AUE_SETRLIMIT: + case AUE_UMOUNT: + ae_class = AU_ADMIN; break; + case AUE_IOCTL: + ae_class 
= AU_IOCTL; break; + case AUE_EXECVE: + ae_class = AU_PROCESS|AU_EXEC; break; + case AUE_OPEN_R: + ae_class = AU_FREAD; break; + case AUE_OPEN_RC: + ae_class = AU_FREAD|AU_FCREATE; break; + case AUE_OPEN_RTC: + ae_class = AU_FREAD|AU_FCREATE|AU_FDELETE; break; + case AUE_OPEN_RT: + ae_class = AU_FREAD|AU_FDELETE; break; + case AUE_OPEN_RW: + ae_class = AU_FREAD|AU_FWRITE; break; + case AUE_OPEN_RWC: + ae_class = AU_FREAD|AU_FWRITE|AU_FCREATE; break; + case AUE_OPEN_RWTC: + ae_class = AU_FREAD|AU_FWRITE|AU_FCREATE|AU_FDELETE; break; + case AUE_OPEN_RWT: + ae_class = AU_FREAD|AU_FWRITE|AU_FDELETE; break; + case AUE_OPEN_W: + ae_class = AU_FWRITE; break; + case AUE_OPEN_WC: + ae_class = AU_FWRITE|AU_FCREATE; break; + case AUE_OPEN_WTC: + ae_class = AU_FWRITE|AU_FCREATE|AU_FDELETE; break; + case AUE_OPEN_WT: + ae_class = AU_FWRITE|AU_FDELETE; break; + case AUE_RENAME: + ae_class = AU_FCREATE|AU_FDELETE; break; + default: /* Assign the event to all classes */ + ae_class = AU_ALL; break; + } + + /* + * Perform the actual check of the masks against the event. + */ + /* + * XXX Need to compare against the kernel mask??? Or do we not do + * that by default and let the client code just call this function + * with the kernel preselection mask as the mask parameter? + */ + if(sorf & AU_PRS_SUCCESS) { + effmask |= (mask_p->am_success & ae_class); + } + + if(sorf & AU_PRS_FAILURE) { + effmask |= (mask_p->am_failure & ae_class); + } + + if(effmask) + return (1); + else + return (0); +} + +/* + * Convert an open flags specifier into a specific type of open event for + * auditing purposes. + */ +au_event_t flags_to_openevent(int oflags) { + + /* Need to check only those flags we care about. */ + oflags = oflags & (O_RDONLY | O_CREAT | O_TRUNC | O_RDWR | O_WRONLY); + + /* These checks determine what flags are on with the condition + * that ONLY that combination is on, and no other flags are on. 
+ */ + if (!(oflags ^ O_RDONLY)) + return AUE_OPEN_R; + if (!(oflags ^ (O_RDONLY | O_CREAT))) + return AUE_OPEN_RC; + if (!(oflags ^ (O_RDONLY | O_CREAT | O_TRUNC))) + return AUE_OPEN_RTC; + if (!(oflags ^ (O_RDONLY | O_TRUNC))) + return AUE_OPEN_RT; + if (!(oflags ^ O_RDWR)) + return AUE_OPEN_RW; + if (!(oflags ^ (O_RDWR | O_CREAT))) + return AUE_OPEN_RWC; + if (!(oflags ^ (O_RDWR | O_CREAT | O_TRUNC))) + return AUE_OPEN_RWTC; + if (!(oflags ^ (O_RDWR | O_TRUNC))) + return AUE_OPEN_RWT; + if (!(oflags ^ O_WRONLY)) + return AUE_OPEN_W; + if (!(oflags ^ (O_WRONLY | O_CREAT))) + return AUE_OPEN_WC; + if (!(oflags ^ (O_WRONLY | O_CREAT | O_TRUNC))) + return AUE_OPEN_WTC; + if (!(oflags ^ (O_WRONLY | O_TRUNC))) + return AUE_OPEN_WT; + + return AUE_OPEN_R; +} + +/* + * Fill in a vattr struct from kernel audit record fields. This function + * would be unnecessary if we store a vattr in the kernel audit record + * directly. +*/ +void fill_vattr(struct vattr *v, struct vnode_au_info *vn_info) +{ + v->va_mode = vn_info->vn_mode; + v->va_uid = vn_info->vn_uid; + v->va_gid = vn_info->vn_gid; + v->va_fsid = vn_info->vn_fsid; + v->va_fileid = vn_info->vn_fileid; + v->va_rdev = vn_info->vn_dev; +} + +/* Convert a MSGCTL command to a specific event. */ +int msgctl_to_event(int cmd) +{ + switch (cmd) { + case IPC_RMID: + return AUE_MSGCTL_RMID; + case IPC_SET: + return AUE_MSGCTL_SET; + case IPC_STAT: + return AUE_MSGCTL_STAT; + default: + return AUE_MSGCTL; + /* We will audit a bad command */ + } +} + +/* Convert a SEMCTL command to a specific event.
*/ +int semctl_to_event(int cmd) +{ + switch (cmd) { + case GETALL: + return AUE_SEMCTL_GETALL; + case GETNCNT: + return AUE_SEMCTL_GETNCNT; + case GETPID: + return AUE_SEMCTL_GETPID; + case GETVAL: + return AUE_SEMCTL_GETVAL; + case GETZCNT: + return AUE_SEMCTL_GETZCNT; + case IPC_RMID: + return AUE_SEMCTL_RMID; + case IPC_SET: + return AUE_SEMCTL_SET; + case SETALL: + return AUE_SEMCTL_SETALL; + case SETVAL: + return AUE_SEMCTL_SETVAL; + case IPC_STAT: + return AUE_SEMCTL_STAT; + default: + return AUE_SEMCTL; + /* We will audit a bad command */ + } +} + +/* + * Create a canonical path from given path by prefixing either the + * root directory, or the current working directory. + * If the process root directory is NULL, we could use 'rootvnode' + * to obtain the root directory, but this results in a volfs name + * written to the audit log. So we will leave the filename starting + * with '/' in the audit log in this case. + */ +void canon_path(struct proc *p, char *path, char *cpath) +{ + char *bufp; + int len; + struct vnode *vnp; + struct filedesc *fdp; + + fdp = p->p_fd; + bufp = path; + if (*(path) == '/') { + while (*(bufp) == '/') + bufp++; /* skip leading '/'s */ + /* If no process root, or it is the same as the system root, + * audit the path as passed in with a single '/'. + */ + if ((fdp->fd_rdir == NULL) || + (fdp->fd_rdir == rootvnode)) { + vnp = NULL; + bufp--; /* restore one '/' */ + } else { + vnp = fdp->fd_rdir; /* use process root */ + } + } else { + vnp = fdp->fd_cdir; /* prepend the current dir */ + bufp = path; + } + if (vnp != NULL) { + len = MAXPATHLEN; + vn_getpath(vnp, cpath, &len); + /* The length returned by vn_getpath() is two greater than the + * number of characters in the string.
+ */ + if (len < MAXPATHLEN) + cpath[len-2] = '/'; + strncpy(cpath + len-1, bufp, MAXPATHLEN - len); + } else { + strncpy(cpath, bufp, MAXPATHLEN); + } +} diff -urN xnu-344.49/bsd/kern/kern_bsm_token.c xnu-517/bsd/kern/kern_bsm_token.c --- xnu-344.49/bsd/kern/kern_bsm_token.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/kern/kern_bsm_token.c Sat Oct 25 00:25:25 2003 @@ -0,0 +1,1344 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include + +#define GET_TOKEN_AREA(tok, dptr, length) \ + do {\ + kmem_alloc(kernel_map, &tok, sizeof(*tok)); \ + if(tok != NULL)\ + {\ + tok->len = length;\ + kmem_alloc(kernel_map, &tok->t_data, \ + length * sizeof(u_char));\ + if((dptr = tok->t_data) == NULL)\ + {\ + kmem_free(kernel_map, tok, sizeof(*tok));\ + tok = NULL;\ + }\ + else\ + {\ + memset(dptr, 0, length);\ + }\ + }\ + }while(0) + + + +/* + * token ID 1 byte + * argument # 1 byte + * argument value 4 bytes/8 bytes (32-bit/64-bit value) + * text length 2 bytes + * text N bytes + 1 terminating NULL byte + */ +token_t *au_to_arg32(char n, char *text, u_int32_t v) +{ + token_t *t; + u_char *dptr; + u_int16_t textlen; + + if(text == NULL) { + return NULL; + } + + /* Make sure that text is null terminated */ + textlen = strlen(text); + if(text[textlen] != '\0') { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 9 + textlen); + if(t == NULL) { + return NULL; + } + + textlen += 1; + + ADD_U_CHAR(dptr, AU_ARG32_TOKEN); + ADD_U_CHAR(dptr, n); + ADD_U_INT32(dptr, v); + ADD_U_INT16(dptr, textlen); + ADD_STRING(dptr, text, textlen); + + return t; + +} + +token_t *au_to_arg64(char n, char *text, u_int64_t v) +{ + token_t *t; + u_char *dptr; + u_int16_t textlen; + + if(text == NULL) { + return NULL; + } + + /* Make sure that text is null terminated */ + textlen = strlen(text); + if(text[textlen] != '\0') { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 13 + textlen); + if(t == NULL) { + return NULL; + } + + textlen += 1; + + ADD_U_CHAR(dptr, AU_ARG64_TOKEN); + ADD_U_CHAR(dptr, n); + ADD_U_INT64(dptr, v); + ADD_U_INT16(dptr, textlen); + ADD_STRING(dptr, text, textlen); + + return t; + +} + +token_t *au_to_arg(char n, char *text, u_int32_t v) +{ + return au_to_arg32(n, text, v); +} + +/* + * token ID 1 byte + * file access mode 4 bytes + * owner user ID 4 bytes + * owner group ID 4 bytes + * file system ID 4 bytes + * node ID 8 bytes + * 
device 4 bytes/8 bytes (32-bit/64-bit) + */ +token_t *au_to_attr32(struct vattr *attr) +{ + token_t *t; + u_char *dptr; + + if(attr == NULL) { + return NULL; + } + + + GET_TOKEN_AREA(t, dptr, 29); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_ATTR32_TOKEN); + ADD_U_INT32(dptr, attr->va_mode); + ADD_U_INT32(dptr, attr->va_uid); + ADD_U_INT32(dptr, attr->va_gid); + ADD_U_INT32(dptr, attr->va_fsid); + ADD_U_INT64(dptr, attr->va_fileid); + ADD_U_INT32(dptr, attr->va_rdev); + + return t; +} + +token_t *au_to_attr64(struct vattr *attr) +{ + token_t *t; + u_char *dptr; + + if(attr == NULL) { + return NULL; + } + + + GET_TOKEN_AREA(t, dptr, 33); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_ATTR64_TOKEN); + ADD_U_INT32(dptr, attr->va_mode); + ADD_U_INT32(dptr, attr->va_uid); + ADD_U_INT32(dptr, attr->va_gid); + ADD_U_INT32(dptr, attr->va_fsid); + ADD_U_INT64(dptr, attr->va_fileid); + ADD_U_INT64(dptr, attr->va_rdev); + + return t; +} + +token_t *au_to_attr(struct vattr *attr) +{ + return au_to_attr32(attr); + +} + + +/* + * token ID 1 byte + * how to print 1 byte + * basic unit 1 byte + * unit count 1 byte + * data items (depends on basic unit) + */ +token_t *au_to_data(char unit_print, char unit_type, + char unit_count, char *p) +{ + token_t *t; + u_char *dptr; + size_t datasize, totdata; + + if(p == NULL) { + return NULL; + } + + /* Determine the size of the basic unit */ + switch(unit_type) { + case AUR_BYTE: datasize = AUR_BYTE_SIZE; + break; + + case AUR_SHORT: datasize = AUR_SHORT_SIZE; + break; + + case AUR_LONG: datasize = AUR_LONG_SIZE; + break; + + default: return NULL; + } + + totdata = datasize * unit_count; + + GET_TOKEN_AREA(t, dptr, totdata + 4); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_ARB_TOKEN); + ADD_U_CHAR(dptr, unit_print); + ADD_U_CHAR(dptr, unit_type); + ADD_U_CHAR(dptr, unit_count); + ADD_MEM(dptr, p, totdata); + + return t; +} + + +/* + * token ID 1 byte + * status 4 bytes + * return value 4 bytes + 
*/ +token_t *au_to_exit(int retval, int err) +{ + token_t *t; + u_char *dptr; + + GET_TOKEN_AREA(t, dptr, 9); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_EXIT_TOKEN); + ADD_U_INT32(dptr, err); + ADD_U_INT32(dptr, retval); + + return t; +} + +/* + */ +token_t *au_to_groups(int *groups) +{ + return au_to_newgroups(MAX_GROUPS, groups); +} + +/* + * token ID 1 byte + * number groups 2 bytes + * group list count * 4 bytes + */ +token_t *au_to_newgroups(u_int16_t n, gid_t *groups) +{ + token_t *t; + u_char *dptr; + int i; + + if(groups == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, n * 4 + 3); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_NEWGROUPS_TOKEN); + ADD_U_INT16(dptr, n); + for(i = 0; i < n; i++) { + ADD_U_INT32(dptr, groups[i]); + } + + return t; +} + + + + +/* + * token ID 1 byte + * internet address 4 bytes + */ +token_t *au_to_in_addr(struct in_addr *internet_addr) +{ + token_t *t; + u_char *dptr; + + if(internet_addr == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 5); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_IN_ADDR_TOKEN); + ADD_U_INT32(dptr, internet_addr->s_addr); + + return t; +} + +/* + * token ID 1 byte + * address type/length 4 bytes + * Address 16 bytes + */ +token_t *au_to_in_addr_ex(struct in6_addr *internet_addr) +{ + token_t *t; + u_char *dptr; + + if(internet_addr == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 21); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_IN_ADDR_EX_TOKEN); + ADD_U_INT32(dptr, internet_addr->__u6_addr.__u6_addr32[0]); + ADD_U_INT32(dptr, internet_addr->__u6_addr.__u6_addr32[1]); + ADD_U_INT32(dptr, internet_addr->__u6_addr.__u6_addr32[2]); + ADD_U_INT32(dptr, internet_addr->__u6_addr.__u6_addr32[3]); + + return t; +} + +/* + * token ID 1 byte + * ip header 20 bytes + */ +token_t *au_to_ip(struct ip *ip) +{ + token_t *t; + u_char *dptr; + + if(ip == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 21); + if(t == NULL) { + 
return NULL; + } + + ADD_U_CHAR(dptr, AU_IP_TOKEN); + ADD_MEM(dptr, ip, sizeof(struct ip)); + + return t; +} + +/* + * token ID 1 byte + * object ID type 1 byte + * object ID 4 bytes + */ +token_t *au_to_ipc(char type, int id) +{ + token_t *t; + u_char *dptr; + + + GET_TOKEN_AREA(t, dptr, 6); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_IPC_TOKEN); + ADD_U_CHAR(dptr, type); + ADD_U_INT32(dptr, id); + + return t; +} + +/* + * token ID 1 byte + * owner user ID 4 bytes + * owner group ID 4 bytes + * creator user ID 4 bytes + * creator group ID 4 bytes + * access mode 4 bytes + * slot sequence # 4 bytes + * key 4 bytes + */ +token_t *au_to_ipc_perm(struct ipc_perm *perm) +{ + token_t *t; + u_char *dptr; + + if(perm == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 29); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_IPCPERM_TOKEN); + ADD_U_INT32(dptr, perm->uid); + ADD_U_INT32(dptr, perm->gid); + ADD_U_INT32(dptr, perm->cuid); + ADD_U_INT32(dptr, perm->cgid); + ADD_U_INT32(dptr, perm->mode); + ADD_U_INT32(dptr, perm->seq); + ADD_U_INT32(dptr, perm->key); + + return t; +} + + +/* + * token ID 1 byte + * port IP address 2 bytes + */ +token_t *au_to_iport(u_int16_t iport) +{ + token_t *t; + u_char *dptr; + + + GET_TOKEN_AREA(t, dptr, 3); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_IPORT_TOKEN); + ADD_U_INT16(dptr, iport); + + return t; +} + + +/* + * token ID 1 byte + * size 2 bytes + * data size bytes + */ +token_t *au_to_opaque(char *data, u_int16_t bytes) +{ + token_t *t; + u_char *dptr; + + if((data == NULL) || (bytes <= 0)) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, bytes + 3); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_OPAQUE_TOKEN); + ADD_U_INT16(dptr, bytes); + ADD_MEM(dptr, data, bytes); + + return t; +} + +#ifdef KERNEL +/* + * Kernel version of the add file token function, where the time value + * is passed in as an additional parameter. 
+ * token ID 1 byte + * seconds of time 4 bytes + * milliseconds of time 4 bytes + * file name len 2 bytes + * file pathname N bytes + 1 terminating NULL byte + */ +token_t *kau_to_file(char *file, struct timeval *tv) +{ + token_t *t; + u_char *dptr; + u_int16_t filelen; + u_int32_t timems = tv->tv_usec/1000; /* We need time in ms */ + + if(file == NULL) { + return NULL; + } + /* Make sure that text is null terminated */ + filelen = strlen(file); + if(file[filelen] != '\0') { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, filelen + 12); + if(t == NULL) { + return NULL; + } + + filelen += 1; + + ADD_U_CHAR(dptr, AU_FILE_TOKEN); + + /* Add the timestamp */ + ADD_U_INT32(dptr, tv->tv_sec); + ADD_U_INT32(dptr, timems); + + ADD_U_INT16(dptr, filelen); + ADD_STRING(dptr, file, filelen); + + return t; + +} +#endif + +/* + * token ID 1 byte + * text length 2 bytes + * text N bytes + 1 terminating NULL byte + */ +token_t *au_to_text(char *text) +{ + token_t *t; + u_char *dptr; + u_int16_t textlen; + + if(text == NULL) { + return NULL; + } + /* Make sure that text is null terminated */ + textlen = strlen(text); + if(text[textlen] != '\0') { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, textlen + 4); + if(t == NULL) { + return NULL; + } + + textlen += 1; + + ADD_U_CHAR(dptr, AU_TEXT_TOKEN); + ADD_U_INT16(dptr, textlen); + ADD_STRING(dptr, text, textlen); + + return t; +} + +/* + * token ID 1 byte + * path length 2 bytes + * path N bytes + 1 terminating NULL byte + */ +token_t *au_to_path(char *text) +{ + token_t *t; + u_char *dptr; + u_int16_t textlen; + + if(text == NULL) { + return NULL; + } + /* Make sure that text is null terminated */ + textlen = strlen(text); + if(text[textlen] != '\0') { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, textlen + 4); + if(t == NULL) { + return NULL; + } + + textlen += 1; + + ADD_U_CHAR(dptr, AU_PATH_TOKEN); + ADD_U_INT16(dptr, textlen); + ADD_STRING(dptr, text, textlen); + + return t; +} + +/* + * token ID 1 byte + * audit ID 4 bytes + * 
effective user ID 4 bytes + * effective group ID 4 bytes + * real user ID 4 bytes + * real group ID 4 bytes + * process ID 4 bytes + * session ID 4 bytes + * terminal ID + * port ID 4 bytes/8 bytes (32-bit/64-bit value) + * machine address 4 bytes + */ +token_t *au_to_process32(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid) +{ + token_t *t; + u_char *dptr; + + if(tid == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 37); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_PROCESS_32_TOKEN); + ADD_U_INT32(dptr, auid); + ADD_U_INT32(dptr, euid); + ADD_U_INT32(dptr, egid); + ADD_U_INT32(dptr, ruid); + ADD_U_INT32(dptr, rgid); + ADD_U_INT32(dptr, pid); + ADD_U_INT32(dptr, sid); + ADD_U_INT32(dptr, tid->port); + ADD_U_INT32(dptr, tid->machine); + + return t; +} + +token_t *au_to_process64(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid) +{ + token_t *t; + u_char *dptr; + + if(tid == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 41); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_PROCESS_64_TOKEN); + ADD_U_INT32(dptr, auid); + ADD_U_INT32(dptr, euid); + ADD_U_INT32(dptr, egid); + ADD_U_INT32(dptr, ruid); + ADD_U_INT32(dptr, rgid); + ADD_U_INT32(dptr, pid); + ADD_U_INT32(dptr, sid); + ADD_U_INT64(dptr, tid->port); + ADD_U_INT32(dptr, tid->machine); + + return t; +} + +token_t *au_to_process(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid) +{ + return au_to_process32(auid, euid, egid, ruid, rgid, pid, + sid, tid); +} + + +/* + * token ID 1 byte + * audit ID 4 bytes + * effective user ID 4 bytes + * effective group ID 4 bytes + * real user ID 4 bytes + * real group ID 4 bytes + * process ID 4 bytes + * session ID 4 bytes + * terminal ID + * port ID 4 bytes/8 bytes (32-bit/64-bit value) + * address type-len 4 bytes + * machine address 16 bytes + */ +token_t 
*au_to_process32_ex(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid) +{ + token_t *t; + u_char *dptr; + + if(tid == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 53); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_PROCESS_32_EX_TOKEN); + ADD_U_INT32(dptr, auid); + ADD_U_INT32(dptr, euid); + ADD_U_INT32(dptr, egid); + ADD_U_INT32(dptr, ruid); + ADD_U_INT32(dptr, rgid); + ADD_U_INT32(dptr, pid); + ADD_U_INT32(dptr, sid); + ADD_U_INT32(dptr, tid->at_port); + ADD_U_INT32(dptr, tid->at_type); + ADD_U_INT32(dptr, tid->at_addr[0]); + ADD_U_INT32(dptr, tid->at_addr[1]); + ADD_U_INT32(dptr, tid->at_addr[2]); + ADD_U_INT32(dptr, tid->at_addr[3]); + + return t; +} + +token_t *au_to_process64_ex(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid) +{ + token_t *t; + u_char *dptr; + + if(tid == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 57); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_PROCESS_64_EX_TOKEN); + ADD_U_INT32(dptr, auid); + ADD_U_INT32(dptr, euid); + ADD_U_INT32(dptr, egid); + ADD_U_INT32(dptr, ruid); + ADD_U_INT32(dptr, rgid); + ADD_U_INT32(dptr, pid); + ADD_U_INT32(dptr, sid); + ADD_U_INT64(dptr, tid->at_port); + ADD_U_INT32(dptr, tid->at_type); + ADD_U_INT32(dptr, tid->at_addr[0]); + ADD_U_INT32(dptr, tid->at_addr[1]); + ADD_U_INT32(dptr, tid->at_addr[2]); + ADD_U_INT32(dptr, tid->at_addr[3]); + + return t; +} + +token_t *au_to_process_ex(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid) +{ + return au_to_process32_ex(auid, euid, egid, ruid, rgid, + pid, sid, tid); +} + +/* + * token ID 1 byte + * error status 1 byte + * return value 4 bytes/8 bytes (32-bit/64-bit value) + */ +token_t *au_to_return32(char status, u_int32_t ret) +{ + token_t *t; + u_char *dptr; + + + GET_TOKEN_AREA(t, dptr, 6); + if(t == NULL) { + return NULL; + } + 
+ ADD_U_CHAR(dptr, AU_RETURN_32_TOKEN); + ADD_U_CHAR(dptr, status); + ADD_U_INT32(dptr, ret); + + return t; +} + +token_t *au_to_return64(char status, u_int64_t ret) +{ + token_t *t; + u_char *dptr; + + + GET_TOKEN_AREA(t, dptr, 10); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_RETURN_64_TOKEN); + ADD_U_CHAR(dptr, status); + ADD_U_INT64(dptr, ret); + + return t; +} + +token_t *au_to_return(char status, u_int32_t ret) +{ + return au_to_return32(status, ret); +} + +/* + * token ID 1 byte + * sequence number 4 bytes + */ +token_t *au_to_seq(long audit_count) +{ + token_t *t; + u_char *dptr; + + + GET_TOKEN_AREA(t, dptr, 5); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_SEQ_TOKEN); + ADD_U_INT32(dptr, audit_count); + + return t; +} + +/* + * token ID 1 byte + * socket type 2 bytes + * remote port 2 bytes + * remote Internet address 4 bytes + */ +token_t *au_to_socket(struct socket *so) +{ + return au_to_socket_ex_32(so); +} + +/* + * token ID 1 byte + * socket type 2 bytes + * local port 2 bytes + * address type/length 4 bytes + * local Internet address 4 bytes/16 bytes (IPv4/IPv6 address) + * remote port 4 bytes + * address type/length 4 bytes + * remote Internet address 4 bytes/16 bytes (IPv4/IPv6 address) + */ +token_t *au_to_socket_ex_32(struct socket *so) +{ + return NULL; +} +token_t *au_to_socket_ex_128(struct socket *so) +{ + return NULL; +} + +/* + * token ID 1 byte + * socket family 2 bytes + * local port 2 bytes + * socket address 4 bytes + */ +token_t *au_to_sock_inet32(struct sockaddr_in *so) +{ + token_t *t; + u_char *dptr; + + if(so == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 9); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_SOCK_INET_32_TOKEN); + /* In Darwin, sin_family is one octet, but BSM defines the token + * to store two. So we copy in a 0 first. 
+ */ + ADD_U_CHAR(dptr, 0); + ADD_U_CHAR(dptr, so->sin_family); + ADD_U_INT16(dptr, so->sin_port); + ADD_U_INT32(dptr, so->sin_addr.s_addr); + + return t; + +} + +token_t *au_to_sock_inet128(struct sockaddr_in6 *so) +{ + token_t *t; + u_char *dptr; + + if(so == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 21); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_SOCK_INET_128_TOKEN); + /* In Darwin, sin_family is one octet, but BSM defines the token + * to store two. So we copy in a 0 first. + */ + ADD_U_CHAR(dptr, 0); + ADD_U_CHAR(dptr, so->sin6_family); + ADD_U_INT16(dptr, so->sin6_port); + ADD_U_INT32(dptr, so->sin6_addr.__u6_addr.__u6_addr32[0]); + ADD_U_INT32(dptr, so->sin6_addr.__u6_addr.__u6_addr32[1]); + ADD_U_INT32(dptr, so->sin6_addr.__u6_addr.__u6_addr32[2]); + ADD_U_INT32(dptr, so->sin6_addr.__u6_addr.__u6_addr32[3]); + + return t; + + + +} + +/* + * token ID 1 byte + * socket family 2 bytes + * path 104 bytes + */ +token_t *au_to_sock_unix(struct sockaddr_un *so) +{ + token_t *t; + u_char *dptr; + + if(so == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 107); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_SOCK_UNIX_TOKEN); + /* BSM token has two bytes for family */ + ADD_U_CHAR(dptr, 0); + ADD_U_CHAR(dptr, so->sun_family); + ADD_STRING(dptr, so->sun_path, strlen(so->sun_path)); + + return t; + +} + +token_t *au_to_sock_inet(struct sockaddr_in *so) +{ + return au_to_sock_inet32(so); +} + +/* + * token ID 1 byte + * audit ID 4 bytes + * effective user ID 4 bytes + * effective group ID 4 bytes + * real user ID 4 bytes + * real group ID 4 bytes + * process ID 4 bytes + * session ID 4 bytes + * terminal ID + * port ID 4 bytes/8 bytes (32-bit/64-bit value) + * machine address 4 bytes + */ +token_t *au_to_subject32(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid) +{ + token_t *t; + u_char *dptr; + + if(tid == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, 
dptr, 37); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_SUBJECT_32_TOKEN); + ADD_U_INT32(dptr, auid); + ADD_U_INT32(dptr, euid); + ADD_U_INT32(dptr, egid); + ADD_U_INT32(dptr, ruid); + ADD_U_INT32(dptr, rgid); + ADD_U_INT32(dptr, pid); + ADD_U_INT32(dptr, sid); + ADD_U_INT32(dptr, tid->port); + ADD_U_INT32(dptr, tid->machine); + + return t; +} + +token_t *au_to_subject64(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid) +{ + token_t *t; + u_char *dptr; + + if(tid == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 41); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_SUBJECT_64_TOKEN); + ADD_U_INT32(dptr, auid); + ADD_U_INT32(dptr, euid); + ADD_U_INT32(dptr, egid); + ADD_U_INT32(dptr, ruid); + ADD_U_INT32(dptr, rgid); + ADD_U_INT32(dptr, pid); + ADD_U_INT32(dptr, sid); + ADD_U_INT64(dptr, tid->port); + ADD_U_INT32(dptr, tid->machine); + + return t; +} + +token_t *au_to_subject(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid) +{ + return au_to_subject32(auid, euid, egid, ruid, rgid, + pid, sid, tid); + +} + +/* + * token ID 1 byte + * audit ID 4 bytes + * effective user ID 4 bytes + * effective group ID 4 bytes + * real user ID 4 bytes + * real group ID 4 bytes + * process ID 4 bytes + * session ID 4 bytes + * terminal ID + * port ID 4 bytes/8 bytes (32-bit/64-bit value) + * address type/length 4 bytes + * machine address 16 bytes + */ +token_t *au_to_subject32_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid) +{ + token_t *t; + u_char *dptr; + + if(tid == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 53); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_SUBJECT_32_EX_TOKEN); + ADD_U_INT32(dptr, auid); + ADD_U_INT32(dptr, euid); + ADD_U_INT32(dptr, egid); + ADD_U_INT32(dptr, ruid); + ADD_U_INT32(dptr, rgid); + ADD_U_INT32(dptr, pid); + 
ADD_U_INT32(dptr, sid); + ADD_U_INT32(dptr, tid->at_port); + ADD_U_INT32(dptr, tid->at_type); + ADD_U_INT32(dptr, tid->at_addr[0]); + ADD_U_INT32(dptr, tid->at_addr[1]); + ADD_U_INT32(dptr, tid->at_addr[2]); + ADD_U_INT32(dptr, tid->at_addr[3]); + + return t; +} + +token_t *au_to_subject64_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid) +{ + token_t *t; + u_char *dptr; + + if(tid == NULL) { + return NULL; + } + + GET_TOKEN_AREA(t, dptr, 57); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_SUBJECT_64_EX_TOKEN); + ADD_U_INT32(dptr, auid); + ADD_U_INT32(dptr, euid); + ADD_U_INT32(dptr, egid); + ADD_U_INT32(dptr, ruid); + ADD_U_INT32(dptr, rgid); + ADD_U_INT32(dptr, pid); + ADD_U_INT32(dptr, sid); + ADD_U_INT64(dptr, tid->at_port); + ADD_U_INT32(dptr, tid->at_type); + ADD_U_INT32(dptr, tid->at_addr[0]); + ADD_U_INT32(dptr, tid->at_addr[1]); + ADD_U_INT32(dptr, tid->at_addr[2]); + ADD_U_INT32(dptr, tid->at_addr[3]); + + return t; +} + +token_t *au_to_subject_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid) +{ + return au_to_subject32_ex(auid, euid, egid, ruid, rgid, + pid, sid, tid); + +} + +/* + * token ID 1 byte + * count 4 bytes + * text count null-terminated strings + */ +token_t *au_to_exec_args(const char **args) +{ + token_t *t; + u_char *dptr; + const char *nextarg; + int i, count = 0; + size_t totlen = 0; + + if(args == NULL) { + return NULL; + } + + nextarg = *args; + + while(nextarg != NULL) { + int nextlen; + + nextlen = strlen(nextarg); + if(nextarg[nextlen] != '\0') { + return NULL; + } + + totlen += nextlen + 1; + count++; + nextarg = *(args + count); + } + + + GET_TOKEN_AREA(t, dptr, 5 + totlen); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_EXEC_ARG_TOKEN); + ADD_U_INT32(dptr, count); + + for(i =0; i< count; i++) { + nextarg = *(args + i); + ADD_MEM(dptr, nextarg, strlen(nextarg) + 1); + } + + 
return t; +} + + +/* + * token ID 1 byte + * count 4 bytes + * text count null-terminated strings + */ +token_t *au_to_exec_env(const char **env) +{ + token_t *t; + u_char *dptr; + int i, count = 0; + size_t totlen = 0; + const char *nextenv; + + if(env == NULL) { + return NULL; + } + + nextenv = *env; + + while(nextenv != NULL) { + int nextlen; + + nextlen = strlen(nextenv); + if(nextenv[nextlen] != '\0') { + return NULL; + } + + totlen += nextlen + 1; + count++; + nextenv = *(env + count); + } + + + GET_TOKEN_AREA(t, dptr, 5 + totlen); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_EXEC_ENV_TOKEN); + ADD_U_INT32(dptr, count); + + for(i =0; i< count; i++) { + nextenv = *(env + i); + ADD_MEM(dptr, nextenv, strlen(nextenv) + 1); + } + + return t; +} + + +#ifdef KERNEL +/* + * Kernel version of the BSM header token functions. These versions take + * a timespec struct as an additional parameter in order to obtain the + * create time value for the BSM audit record. + * token ID 1 byte + * record byte count 4 bytes + * version # 1 byte [2] + * event type 2 bytes + * event modifier 2 bytes + * seconds of time 4 bytes/8 bytes (32-bit/64-bit value) + * milliseconds of time 4 bytes/8 bytes (32-bit/64-bit value) + */ +token_t *kau_to_header32(struct timespec *ctime, int rec_size, + au_event_t e_type, au_emod_t e_mod) +{ + token_t *t; + u_char *dptr; + u_int32_t timems = ctime->tv_nsec/1000000; /* We need time in ms */ + + GET_TOKEN_AREA(t, dptr, 18); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_HEADER_32_TOKEN); + ADD_U_INT32(dptr, rec_size); + ADD_U_CHAR(dptr, HEADER_VERSION); + ADD_U_INT16(dptr, e_type); + ADD_U_INT16(dptr, e_mod); + + /* Add the timestamp */ + ADD_U_INT32(dptr, ctime->tv_sec); + ADD_U_INT32(dptr, timems); + + return t; +} + +token_t *kau_to_header64(struct timespec *ctime, int rec_size, + au_event_t e_type, au_emod_t e_mod) +{ + token_t *t; + u_char *dptr; + u_int32_t timems = ctime->tv_nsec/1000000; /* We need time in ms */ 
+ + GET_TOKEN_AREA(t, dptr, 26); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_HEADER_64_TOKEN); + ADD_U_INT32(dptr, rec_size); + ADD_U_CHAR(dptr, HEADER_VERSION); + ADD_U_INT16(dptr, e_type); + ADD_U_INT16(dptr, e_mod); + + /* Add the timestamp. NOTE(review): 26 bytes are requested above, but if these macros write 1/4/1/2/2/4/4 bytes as their names suggest, only 18 are filled (the same as kau_to_header32) -- confirm whether 64-bit seconds/milliseconds were intended here */ + ADD_U_INT32(dptr, ctime->tv_sec); + ADD_U_INT32(dptr, timems); + + return t; +} + +token_t *kau_to_header(struct timespec *ctime, int rec_size, + au_event_t e_type, au_emod_t e_mod) +{ + return kau_to_header32(ctime, rec_size, e_type, e_mod); +} + +#endif + +/* + * token ID 1 byte + * trailer magic number 2 bytes + * record byte count 4 bytes + */ +token_t *au_to_trailer(int rec_size) +{ + token_t *t; + u_char *dptr; + u_int16_t magic = TRAILER_PAD_MAGIC; + + + GET_TOKEN_AREA(t, dptr, 7); + if(t == NULL) { + return NULL; + } + + ADD_U_CHAR(dptr, AU_TRAILER_TOKEN); + ADD_U_INT16(dptr, magic); + ADD_U_INT32(dptr, rec_size); + + return t; + +} + diff -urN xnu-344.49/bsd/kern/kern_clock.c xnu-517/bsd/kern/kern_clock.c --- xnu-344.49/bsd/kern/kern_clock.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_clock.c Sat Oct 25 00:25:25 2003 @@ -121,6 +121,7 @@ register struct proc *p; register thread_t thread; int nusecs = numticks * tick; + struct timeval tv; if (!bsd_hardclockinit) return; @@ -128,13 +129,14 @@ /* * Increment the time-of-day. */ - microtime(&time); + microtime(&tv); + time = tv; if (bsd_hardclockinit < 0) { return; } - thread = current_thread(); + thread = current_act(); /* * Charge the time out based on the mode the cpu is in. 
* Here again we fudge for the lack of proper interval timers @@ -160,7 +162,7 @@ extern void psignal_vtalarm(struct proc *); /* does psignal(p, SIGVTALRM) in a thread context */ - thread_call_func(psignal_vtalarm, p, FALSE); + thread_call_func((thread_call_func_t)psignal_vtalarm, p, FALSE); } } @@ -183,7 +185,7 @@ extern void psignal_xcpu(struct proc *); /* does psignal(p, SIGXCPU) in a thread context */ - thread_call_func(psignal_xcpu, p, FALSE); + thread_call_func((thread_call_func_t)psignal_xcpu, p, FALSE); if (p->p_limit->pl_rlimit[RLIMIT_CPU].rlim_cur < p->p_limit->pl_rlimit[RLIMIT_CPU].rlim_max) @@ -195,7 +197,7 @@ extern void psignal_sigprof(struct proc *); /* does psignal(p, SIGPROF) in a thread context */ - thread_call_func(psignal_sigprof, p, FALSE); + thread_call_func((thread_call_func_t)psignal_sigprof, p, FALSE); } } } diff -urN xnu-344.49/bsd/kern/kern_control.c xnu-517/bsd/kern/kern_control.c --- xnu-344.49/bsd/kern/kern_control.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_control.c Sat Oct 25 00:25:25 2003 @@ -168,6 +168,13 @@ if (ctl == NULL) return(EADDRNOTAVAIL); + if (ctl->flags & CTL_FLAG_PRIVILEGED) { + if (p == 0) + return(EINVAL); + if (error = suser(p->p_ucred, &p->p_acflag)) + return error; + } + if (ctl->skt != NULL) return(EBUSY); @@ -179,13 +186,6 @@ ctl->skt = so; - if (ctl->flags & CTL_FLAG_PRIVILEGED) { - if (p == 0) - return(EPERM); - if (error = suser(p->p_ucred, &p->p_acflag)) - return error; - } - if (ctl->connect) error = (*ctl->connect)(ctl, ctl->userdata); if (error) { @@ -284,7 +284,8 @@ } bcopy(data, mtod(m, void *), len); - + m->m_pkthdr.len = m->m_len = len; + sbappend(&so->so_rcv, m); if ((flags & CTL_DATA_NOWAKEUP) == 0) sorwakeup(so); diff -urN xnu-344.49/bsd/kern/kern_core.c xnu-517/bsd/kern/kern_core.c --- xnu-344.49/bsd/kern/kern_core.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_core.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -28,9 +28,6 @@ * * This file contains machine independent code for performing core dumps. * - * HISTORY - * 16-Feb-91 Mike DeMoney (mike@next.com) - * Massaged into MI form from m68k/core.c. */ #include @@ -68,9 +65,10 @@ mythread_state_flavor_t thread_flavor_array[]={ {PPC_THREAD_STATE , PPC_THREAD_STATE_COUNT}, {PPC_FLOAT_STATE, PPC_FLOAT_STATE_COUNT}, - {PPC_EXCEPTION_STATE, PPC_EXCEPTION_STATE_COUNT} + {PPC_EXCEPTION_STATE, PPC_EXCEPTION_STATE_COUNT}, + {PPC_VECTOR_STATE, PPC_VECTOR_STATE_COUNT} }; -int mynum_flavors=3; +int mynum_flavors=4; #elif defined (__i386__) mythread_state_flavor_t thread_flavor_array [] = { {i386_THREAD_STATE, i386_THREAD_STATE_COUNT}, @@ -97,6 +95,7 @@ int tstate_size; } tir_t; +void collectth_state(thread_act_t th_act, tir_t *t) { vm_offset_t header; @@ -172,6 +171,7 @@ tir_t tir1; struct vnode * vp; extern boolean_t coredumpok(vm_map_t map, vm_offset_t va); /* temp fix */ + extern task_t current_task(); /* XXX */ if (pcred->p_svuid != pcred->p_ruid || pcred->p_svgid != pcred->p_rgid) return (EFAULT); @@ -185,8 +185,8 @@ (void) task_suspend(task); sprintf(core_name, "/cores/core.%d", p->p_pid); - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, core_name, p); - if(error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR )) + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, core_name, p); + if(error = vn_open(&nd, O_CREAT | FWRITE | O_NOFOLLOW, S_IRUSR )) return (error); vp = nd.ni_vp; @@ -215,18 +215,8 @@ * nflavors here is really the number of ints in flavors * to meet the thread_getstatus() calling convention */ -#if 0 - nflavors = sizeof(flavors)/sizeof(int); - if (thread_getstatus(current_thread(), THREAD_STATE_FLAVOR_LIST, - (thread_state_t)(flavors), - &nflavors) != KERN_SUCCESS) - panic("core flavor list"); - /* now convert to number of flavors */ - nflavors /= sizeof(mythread_state_flavor_t)/sizeof(int); -#else nflavors = mynum_flavors; 
bcopy(thread_flavor_array,flavors,sizeof(thread_flavor_array)); -#endif tstate_size = 0; for (i = 0; i < nflavors; i++) tstate_size += sizeof(mythread_state_flavor_t) + @@ -255,9 +245,10 @@ mh->sizeofcmds = command_size; hoffset = sizeof(struct mach_header); /* offset into header */ - foffset = round_page(header_size); /* offset into file */ + foffset = round_page_32(header_size); /* offset into file */ vmoffset = VM_MIN_ADDRESS; /* offset into VM */ - /* We use to check for an error, here, now we try and get + /* + * We use to check for an error, here, now we try and get * as much as we can */ while (segment_count > 0){ @@ -314,7 +305,9 @@ * Note: if we can't read, then we end up with * a hole in the file. */ - if ((maxprot & VM_PROT_READ) == VM_PROT_READ && vbr.user_tag != VM_MEMORY_IOKIT && coredumpok(map,vmoffset)) { + if ((maxprot & VM_PROT_READ) == VM_PROT_READ + && vbr.user_tag != VM_MEMORY_IOKIT + && coredumpok(map,vmoffset)) { error = vn_rdwr(UIO_WRITE, vp, (caddr_t)vmoffset, size, foffset, UIO_USERSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, p); } @@ -325,44 +318,12 @@ segment_count--; } -#if 0 /* [ */ - task_lock(task); - thread = (thread_t) queue_first(&task->thread_list); - while (thread_count > 0) { - /* - * Fill in thread command structure. - */ - tc = (struct thread_command *) (header + hoffset); - tc->cmd = LC_THREAD; - tc->cmdsize = sizeof(struct thread_command) - + tstate_size; - hoffset += sizeof(struct thread_command); - /* - * Follow with a struct thread_state_flavor and - * the appropriate thread state struct for each - * thread state flavor. 
- */ - for (i = 0; i < nflavors; i++) { - *(mythread_state_flavor_t *)(header+hoffset) = - flavors[i]; - hoffset += sizeof(mythread_state_flavor_t); - thread_getstatus(thread, flavors[i].flavor, - (thread_state_t *)(header+hoffset), - &flavors[i].count); - hoffset += flavors[i].count*sizeof(int); - } - thread = (thread_t) queue_next(&thread->thread_list); - thread_count--; - } - task_unlock(task); -#else /* /* 0 ][ */ tir1.header = header; tir1.hoffset = hoffset; tir1.flavors = flavors; tir1.tstate_size = tstate_size; task_act_iterate_wth_args(task, collectth_state,&tir1); -#endif /* 0 ] */ /* * Write out the Mach header at the beginning of the * file. @@ -375,4 +336,5 @@ error1 = vn_close(vp, FWRITE, cred, p); if (error == 0) error = error1; + return (error); } diff -urN xnu-344.49/bsd/kern/kern_descrip.c xnu-517/bsd/kern/kern_descrip.c --- xnu-344.49/bsd/kern/kern_descrip.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_descrip.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -79,6 +79,8 @@ #include #include #include +#include +#include #include @@ -247,11 +249,14 @@ daddr_t lbn, bn; int devBlockSize = 0; + AUDIT_ARG(fd, uap->fd); + AUDIT_ARG(cmd, uap->cmd); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) return (EBADF); pop = &fdp->fd_ofileflags[fd]; + switch (uap->cmd) { case F_DUPFD: @@ -325,6 +330,7 @@ if (fp->f_type != DTYPE_VNODE) return (EBADF); vp = (struct vnode *)fp->f_data; + AUDIT_ARG(vnpath, vp, ARG_VNODE1); /* Copy in the lock structure */ error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); @@ -358,6 +364,7 @@ if (fp->f_type != DTYPE_VNODE) return (EBADF); vp = (struct vnode *)fp->f_data; + AUDIT_ARG(vnpath, vp, ARG_VNODE1); /* Copy in the lock structure */ error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); @@ -510,6 +517,18 @@ return(error); return (VOP_IOCTL(vp, 1, (caddr_t)&ra_struct, 0, fp->f_cred, p)); + case F_CHKCLEAN: + /* + * used by regression test to determine if + * all the dirty pages (via write) have been cleaned + * after a call to 'fsync'. 
+ */ + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + + return (VOP_IOCTL(vp, 5, 0, 0, fp->f_cred, p)); + case F_READBOOTSTRAP: case F_WRITEBOOTSTRAP: if (fp->f_type != DTYPE_VNODE) @@ -550,10 +569,12 @@ error = vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, p); if (error) return (error); - if (VOP_OFFTOBLK(vp, fp->f_offset, &lbn)) - panic("fcntl LOG2PHYS OFFTOBLK"); - if (VOP_BLKTOOFF(vp, lbn, &offset)) - panic("fcntl LOG2PHYS BLKTOOFF1"); + error = VOP_OFFTOBLK(vp, fp->f_offset, &lbn); + if (error) + return (error); + error = VOP_BLKTOOFF(vp, lbn, &offset); + if (error) + return (error); error = VOP_BMAP(vp, lbn, &devvp, &bn, 0); VOP_DEVBLOCKSIZE(devvp, &devBlockSize); VOP_UNLOCK(vp, 0, p); @@ -568,6 +589,32 @@ } return (error); + case F_GETPATH: { + char *pathbuf; + int len; + extern int vn_getpath(struct vnode *vp, char *pathbuf, int *len); + + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + + len = MAXPATHLEN; + MALLOC(pathbuf, char *, len, M_TEMP, M_WAITOK); + error = vn_getpath(vp, pathbuf, &len); + if (error == 0) + error = copyout((caddr_t)pathbuf, (caddr_t)uap->arg, len); + FREE(pathbuf, M_TEMP); + return error; + } + + case F_FULLFSYNC: { + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + + return (VOP_IOCTL(vp, 6, (caddr_t)NULL, 0, fp->f_cred, p)); + } + default: return (EINVAL); } @@ -620,6 +667,16 @@ (fp = fdp->fd_ofiles[fd]) == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) return (EBADF); + + /* Keep people from using the filedesc while we are closing it */ + fdp->fd_ofileflags[fd] |= UF_RESERVED; + + /* cancel all async IO requests that can be cancelled. 
*/ + _aio_close( p, fd ); + + if (fd < fdp->fd_knlistsize) + knote_fdclose(p, fd); + _fdrelse(fdp, fd); return (closef(fp, p)); } @@ -644,6 +701,7 @@ struct stat ub; int error; + AUDIT_ARG(fd, uap->fd); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) @@ -652,6 +710,9 @@ case DTYPE_VNODE: error = vn_stat((struct vnode *)fp->f_data, &ub, p); + if (error == 0) { + AUDIT_ARG(vnpath, (struct vnode *)fp->f_data, ARG_VNODE1); + } break; case DTYPE_SOCKET: @@ -661,6 +722,11 @@ case DTYPE_PSXSHM: error = pshm_stat((void *)fp->f_data, &ub); break; + + case DTYPE_KQUEUE: + error = kqueue_stat(fp, &ub, p); + break; + default: panic("fstat"); /*NOTREACHED*/ @@ -736,6 +802,7 @@ struct file *fp; struct vnode *vp; + AUDIT_ARG(fd, uap->fd); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) @@ -750,6 +817,8 @@ case DTYPE_VNODE: vp = (struct vnode *)fp->f_data; + AUDIT_ARG(vnpath, vp, ARG_VNODE1); + return (VOP_PATHCONF(vp, uap->name, retval)); default: @@ -923,11 +992,6 @@ nfiles++; MALLOC_ZONE(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK); bzero(fp, sizeof(struct file)); - if (fq = p->p_fd->fd_ofiles[0]) { - LIST_INSERT_AFTER(fq, fp, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fp, f_list); - } p->p_fd->fd_ofiles[i] = fp; fp->f_count = 1; fp->f_cred = p->p_ucred; @@ -936,6 +1000,11 @@ *resultfp = fp; if (resultfd) *resultfd = i; + if (fq = p->p_fd->fd_ofiles[0]) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } return (0); } @@ -976,6 +1045,9 @@ if ((*flags & (UF_RESERVED|UF_EXCLOSE)) == UF_EXCLOSE) { register struct file *fp = *fpp; + if (i < fdp->fd_knlistsize) + knote_fdclose(p, i); + *fpp = NULL; *flags = 0; if (i == fdp->fd_lastfile && i > 0) fdp->fd_lastfile--; @@ -1037,6 +1109,26 @@ (void) memcpy(newfdp->fd_ofileflags, fdp->fd_ofileflags, i * sizeof *fdp->fd_ofileflags); + /* + * kq 
descriptors cannot be copied. + */ + if (newfdp->fd_knlistsize != -1) { + fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile]; + for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) { + if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) { + *fpp = NULL; + if (i < newfdp->fd_freefile) + newfdp->fd_freefile = i; + } + if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0) + newfdp->fd_lastfile--; + } + newfdp->fd_knlist = NULL; + newfdp->fd_knlistsize = -1; + newfdp->fd_knhash = NULL; + newfdp->fd_knhashmask = 0; + } + fpp = newfdp->fd_ofiles; flags = newfdp->fd_ofileflags; for (i = newfdp->fd_lastfile; i-- >= 0; fpp++, flags++) @@ -1060,31 +1152,69 @@ struct proc *p; { struct filedesc *fdp; - struct file **fpp; + struct file *fp; int i; struct vnode *tvp; + /* Certain daemons might not have file descriptors */ if ((fdp = p->p_fd) == NULL) return; + if (--fdp->fd_refcnt > 0) return; - p->p_fd = NULL; + + /* Last reference: the structure can't change out from under us */ if (fdp->fd_nfiles > 0) { - fpp = fdp->fd_ofiles; - for (i = fdp->fd_lastfile; i-- >= 0; fpp++) - if (*fpp) - (void) closef(*fpp, p); + for (i = fdp->fd_lastfile; i >= 0; i--) +#if 1 /* WORKAROUND */ + /* + * Merlot: need to remove the bogus f_data check + * from the following "if" statement. It's there + * because of the network/kernel funnel race on a + * close of a socket vs. fdfree on exit. See + * Radar rdar://problem/3365650 for details, but + * the short version is that the comment before the "if" + * above is wrong under certain circumstances. + * + * We have to do this twice, in case knote_fdclose() + * results in a block. + * + * This works because an fdfree() will set all fields + * in the struct file to -1. 
+ */ + if ((fp = fdp->fd_ofiles[i]) != NULL && + fp->f_data != (caddr_t)-1) { + if (i < fdp->fd_knlistsize) + knote_fdclose(p, i); + if (fp->f_data != (caddr_t)-1) + (void) closef(fp, p); + } +#else /* !WORKAROUND */ + if ((fp = fdp->fd_ofiles[i]) != NULL) { + if (i < fdp->fd_knlistsize) + knote_fdclose(p, i); + (void) closef(fp, p); + } +#endif /* !WORKAROUND */ FREE_ZONE(fdp->fd_ofiles, fdp->fd_nfiles * OFILESIZE, M_OFILETABL); } + tvp = fdp->fd_cdir; fdp->fd_cdir = NULL; vrele(tvp); + if (fdp->fd_rdir) { tvp = fdp->fd_rdir; fdp->fd_rdir = NULL; vrele(tvp); } + + if (fdp->fd_knlist) + FREE(fdp->fd_knlist, M_KQUEUE); + if (fdp->fd_knhash) + FREE(fdp->fd_knhash, M_KQUEUE); + FREE_ZONE(fdp, sizeof *fdp, M_FILEDESC); } @@ -1175,6 +1305,7 @@ struct vnode *vp; struct flock lf; + AUDIT_ARG(fd, uap->fd); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) @@ -1182,6 +1313,7 @@ if (fp->f_type != DTYPE_VNODE) return (EOPNOTSUPP); vp = (struct vnode *)fp->f_data; + AUDIT_ARG(vnpath, vp, ARG_VNODE1); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; diff -urN xnu-344.49/bsd/kern/kern_event.c xnu-517/bsd/kern/kern_event.c --- xnu-344.49/bsd/kern/kern_event.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_event.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -23,11 +23,1092 @@ * @APPLE_LICENSE_HEADER_END@ * */ +/*- + * Copyright (c) 1999,2000,2001 Jonathan Lemon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ /* * @(#)kern_event.c 1.0 (3/31/2000) */ #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); + +static int kqueue_scan(struct file *fp, int maxevents, + struct kevent *ulistp, const struct timespec *timeout, + register_t *retval, struct proc *p); +static void kqueue_wakeup(struct kqueue *kq); + +static int kqueue_read __P((struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct proc *p)); +static int kqueue_write __P((struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct proc *p)); +static int kqueue_ioctl __P((struct file *fp, u_long com, caddr_t data, + struct proc *p)); +static int kqueue_select __P((struct file *fp, int which, void *wql, + struct proc *p)); +static int kqueue_close __P((struct file *fp, struct proc *p)); +static 
int kqueue_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); + +static struct fileops kqueueops = { + kqueue_read, + kqueue_write, + kqueue_ioctl, + kqueue_select, + kqueue_close, + kqueue_kqfilter +}; + +static void knote_fdpattach(struct knote *kn, struct filedesc *fdp); +static void knote_drop(struct knote *kn, struct proc *p); +static void knote_enqueue(struct knote *kn); +static void knote_dequeue(struct knote *kn); +static struct knote *knote_alloc(void); +static void knote_free(struct knote *kn); + +static int filt_fileattach(struct knote *kn); +static struct filterops file_filtops = + { 1, filt_fileattach, NULL, NULL }; + +static void filt_kqdetach(struct knote *kn); +static int filt_kqueue(struct knote *kn, long hint); +static struct filterops kqread_filtops = + { 1, NULL, filt_kqdetach, filt_kqueue }; + +/* + * JMM - placeholder for not-yet-implemented filters + */ +static int filt_badattach(struct knote *kn); +static struct filterops bad_filtops = + { 0, filt_badattach, 0 , 0 }; + +static int filt_procattach(struct knote *kn); +static void filt_procdetach(struct knote *kn); +static int filt_proc(struct knote *kn, long hint); + +static struct filterops proc_filtops = + { 0, filt_procattach, filt_procdetach, filt_proc }; + +extern struct filterops fs_filtops; + +extern struct filterops sig_filtops; + +#if 0 +/* JMM - We don't implement these now */ +static void filt_timerexpire(void *knx); +static int filt_timerattach(struct knote *kn); +static void filt_timerdetach(struct knote *kn); +static int filt_timer(struct knote *kn, long hint); + +static struct filterops timer_filtops = + { 0, filt_timerattach, filt_timerdetach, filt_timer }; + +static int kq_ncallouts = 0; +static int kq_calloutmax = (4 * 1024); + +SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, + &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); +#endif /* 0 */ + +static zone_t knote_zone; + +#define KNOTE_ACTIVATE(kn) do { \ + kn->kn_status |= 
KN_ACTIVE; \ + if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ + knote_enqueue(kn); \ +} while(0) + +#define KN_HASHSIZE 64 /* XXX should be tunable */ +#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) + +#if 0 +extern struct filterops aio_filtops; +#endif + +/* + * Table for all system-defined filters; kqueue_register() selects an entry with sysfilt_ops[~kev->filter]. + */ +static struct filterops *sysfilt_ops[] = { + &file_filtops, /* EVFILT_READ */ + &file_filtops, /* EVFILT_WRITE */ +#if 0 + &aio_filtops, /* EVFILT_AIO */ +#else + &bad_filtops, /* EVFILT_AIO */ +#endif + &file_filtops, /* EVFILT_VNODE */ + &proc_filtops, /* EVFILT_PROC */ + &sig_filtops, /* EVFILT_SIGNAL */ +#if 0 + &timer_filtops, /* EVFILT_TIMER */ +#else + &bad_filtops, /* EVFILT_TIMER */ +#endif + &bad_filtops, /* EVFILT_MACHPORT */ + &fs_filtops /* EVFILT_FS */ +}; + +static int +filt_fileattach(struct knote *kn) +{ + + return (fo_kqfilter(kn->kn_fp, kn, current_proc())); +} + +static void +filt_kqdetach(struct knote *kn) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + if (kq->kq_state & KQ_SEL) + return; + + KNOTE_DETACH(&kq->kq_sel.si_note, kn); +} + +/*ARGSUSED*/ +static int +filt_kqueue(struct knote *kn, long hint) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + kn->kn_data = kq->kq_count; + return (kn->kn_data > 0); +} + +static int +filt_procattach(struct knote *kn) +{ + struct proc *p; + + p = pfind(kn->kn_id); + if (p == NULL) + return (ESRCH); + if (! PRISON_CHECK(current_proc(), p)) + return (EACCES); + + kn->kn_ptr.p_proc = p; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + + /* + * internal flag indicating registration done by kernel + */ + if (kn->kn_flags & EV_FLAG1) { + kn->kn_data = kn->kn_sdata; /* ppid */ + kn->kn_fflags = NOTE_CHILD; + kn->kn_flags &= ~EV_FLAG1; + } + + /* XXX lock the proc here while adding to the list? 
*/ + KNOTE_ATTACH(&p->p_klist, kn); + + return (0); +} + +/* + * The knote may be attached to a different process, which may exit, + * leaving nothing for the knote to be attached to. So when the process + * exits, the knote is marked as DETACHED and also flagged as ONESHOT so + * it will be deleted when read out. However, as part of the knote deletion, + * this routine is called, so a check is needed to avoid actually performing + * a detach, because the original process does not exist any more. + */ +static void +filt_procdetach(struct knote *kn) +{ + struct proc *p = kn->kn_ptr.p_proc; + + if (kn->kn_status & KN_DETACHED) + return; + + /* XXX locking? this might modify another process. */ + KNOTE_DETACH(&p->p_klist, kn); +} + +static int +filt_proc(struct knote *kn, long hint) +{ + u_int event; + + /* + * mask off extra data + */ + event = (u_int)hint & NOTE_PCTRLMASK; + + /* + * if the user is interested in this event, record it. + */ + if (kn->kn_sfflags & event) + kn->kn_fflags |= event; + + /* + * process is gone, so flag the event as finished. + */ + if (event == NOTE_EXIT) { + kn->kn_status |= KN_DETACHED; + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return (1); + } + + /* + * process forked, and user wants to track the new process, + * so attach a new knote to it, and immediately report an + * event with the parent's pid. + */ + if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { + struct kevent kev; + int error; + + /* + * register knote with new process. 
+ */ + kev.ident = hint & NOTE_PDATAMASK; /* pid */ + kev.filter = kn->kn_filter; + kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; + kev.fflags = kn->kn_sfflags; + kev.data = kn->kn_id; /* parent */ + kev.udata = kn->kn_kevent.udata; /* preserve udata */ + error = kqueue_register(kn->kn_kq, &kev, NULL); + if (error) + kn->kn_fflags |= NOTE_TRACKERR; + } + + return (kn->kn_fflags != 0); +} + +#if 0 +static void +filt_timerexpire(void *knx) +{ + struct knote *kn = knx; + struct callout *calloutp; + struct timeval tv; + int tticks; + + kn->kn_data++; + KNOTE_ACTIVATE(kn); + + if ((kn->kn_flags & EV_ONESHOT) == 0) { + tv.tv_sec = kn->kn_sdata / 1000; + tv.tv_usec = (kn->kn_sdata % 1000) * 1000; + tticks = tvtohz(&tv); + calloutp = (struct callout *)kn->kn_hook; + callout_reset(calloutp, tticks, filt_timerexpire, kn); + } +} + +/* + * data contains amount of time to sleep, in milliseconds + */ +static int +filt_timerattach(struct knote *kn) +{ + struct callout *calloutp; + struct timeval tv; + int tticks; + + if (kq_ncallouts >= kq_calloutmax) + return (ENOMEM); + kq_ncallouts++; + + tv.tv_sec = kn->kn_sdata / 1000; + tv.tv_usec = (kn->kn_sdata % 1000) * 1000; + tticks = tvtohz(&tv); + + kn->kn_flags |= EV_CLEAR; /* automatically set */ + MALLOC(calloutp, struct callout *, sizeof(*calloutp), + M_KQUEUE, M_WAITOK); + callout_init(calloutp); + callout_reset(calloutp, tticks, filt_timerexpire, kn); + kn->kn_hook = (caddr_t)calloutp; + + return (0); +} + +static void +filt_timerdetach(struct knote *kn) +{ + struct callout *calloutp; + + calloutp = (struct callout *)kn->kn_hook; + callout_stop(calloutp); + FREE(calloutp, M_KQUEUE); + kq_ncallouts--; +} + +static int +filt_timer(struct knote *kn, long hint) +{ + + return (kn->kn_data != 0); +} +#endif /* 0 */ + +/* + * JMM - placeholder for not-yet-implemented filters + */ +static int +filt_badattach(struct knote *kn) +{ + return(EOPNOTSUPP); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kqueue_args { + int dummy; +}; 
+#endif + +int +kqueue(struct proc *p, struct kqueue_args *uap, register_t *retval) +{ + struct filedesc *fdp = p->p_fd; + struct kqueue *kq; + struct file *fp; + int fd, error; + + error = falloc(p, &fp, &fd); + if (error) + return (error); + fp->f_flag = FREAD | FWRITE; + fp->f_type = DTYPE_KQUEUE; + fp->f_ops = &kqueueops; + kq = (struct kqueue *)_MALLOC(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); + TAILQ_INIT(&kq->kq_head); + fp->f_data = (caddr_t)kq; + *retval = fd; + if (fdp->fd_knlistsize < 0) + fdp->fd_knlistsize = 0; /* this process has a kq */ + kq->kq_fdp = fdp; + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kqueue_portset_np_args { + int fd; +}; +#endif +int +kqueue_portset_np(struct proc *p, struct kqueue_portset_np_args *uap, register_t *retval) +{ + /* JMM - Placeholder for now */ + return (EOPNOTSUPP); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kqueue_from_portset_np_args { + int fd; +}; +#endif +int +kqueue_from_portset_np(struct proc *p, struct kqueue_from_portset_np_args *uap, register_t *retval) +{ + /* JMM - Placeholder for now */ + return (EOPNOTSUPP); +} + +#if !0 +/* JMM - We don't implement this yet */ +#define fhold(fp) +#define fdrop(fp, p) +#endif /* !0 */ + +#ifndef _SYS_SYSPROTO_H_ +struct kevent_args { + int fd; + const struct kevent *changelist; + int nchanges; + struct kevent *eventlist; + int nevents; + const struct timespec *timeout; +}; +#endif +int +kevent(struct proc *p, struct kevent_args *uap, register_t *retval) +{ + struct filedesc* fdp = p->p_fd; + struct kqueue *kq; + struct file *fp = NULL; + struct timespec ts; + int i, nerrors, error; + + if (uap->timeout != NULL) { + error = copyin((caddr_t)uap->timeout, (caddr_t)&ts, sizeof(ts)); + if (error) + goto done; + uap->timeout = &ts; + } + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_type != DTYPE_KQUEUE)) + return (EBADF); + + fhold(fp); + + kq = (struct kqueue *)fp->f_data; + nerrors = 0; + + while 
(uap->nchanges > 0) { + int i; + int n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges; + struct kevent kq_kev[n]; + + error = copyin((caddr_t)uap->changelist, (caddr_t)kq_kev, + n * sizeof(struct kevent)); + if (error) + goto done; + for (i = 0; i < n; i++) { + struct kevent *kevp = &kq_kev[i]; + + kevp->flags &= ~EV_SYSFLAGS; + error = kqueue_register(kq, kevp, p); + if (error) { + if (uap->nevents != 0) { + kevp->flags = EV_ERROR; + kevp->data = error; + (void) copyout((caddr_t)kevp, + (caddr_t)uap->eventlist, + sizeof(*kevp)); + uap->eventlist++; + uap->nevents--; + nerrors++; + } else { + goto done; + } + } + } + uap->nchanges -= n; + uap->changelist += n; + } + if (nerrors) { + *retval = nerrors; + error = 0; + goto done; + } + + error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, retval, p); +done: + if (fp != NULL) + fdrop(fp, p); + return (error); +} + +int +kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p) +{ + struct filedesc *fdp = kq->kq_fdp; + struct filterops *fops; + struct file *fp = NULL; + struct knote *kn = NULL; + int s, error = 0; + + if (kev->filter < 0) { + if (kev->filter + EVFILT_SYSCOUNT < 0) + return (EINVAL); + fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ + } else { + /* + * XXX + * filter attach routine is responsible for insuring that + * the identifier can be attached to it. 
+ */ + printf("unknown filter: %d\n", kev->filter); + return (EINVAL); + } + + if (fops->f_isfd) { + /* validate descriptor */ + if ((u_int)kev->ident >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[kev->ident]) == NULL) + return (EBADF); + fhold(fp); + + if (kev->ident < fdp->fd_knlistsize) { + SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link) + if (kq == kn->kn_kq && + kev->filter == kn->kn_filter) + break; + } + } else { + if (fdp->fd_knhashmask != 0) { + struct klist *list; + + list = &fdp->fd_knhash[ + KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; + SLIST_FOREACH(kn, list, kn_link) + if (kev->ident == kn->kn_id && + kq == kn->kn_kq && + kev->filter == kn->kn_filter) + break; + } + } + + if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { + error = ENOENT; + goto done; + } + + /* + * kn now contains the matching knote, or NULL if no match + */ + if (kev->flags & EV_ADD) { + + if (kn == NULL) { + kn = knote_alloc(); + if (kn == NULL) { + error = ENOMEM; + goto done; + } + kn->kn_fp = fp; + kn->kn_kq = kq; + kn->kn_fop = fops; + + /* + * apply reference count to knote structure, and + * do not release it at the end of this routine. + */ + fp = NULL; + + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kev->fflags = 0; + kev->data = 0; + kn->kn_kevent = *kev; + + knote_fdpattach(kn, fdp); + if ((error = fops->f_attach(kn)) != 0) { + knote_drop(kn, p); + goto done; + } + } else { + /* + * The user may change some filter values after the + * initial EV_ADD, but doing so will not reset any + * filter which have already been triggered. 
+ */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kn->kn_kevent.udata = kev->udata; + } + + s = splhigh(); + if (kn->kn_fop->f_event(kn, 0)) + KNOTE_ACTIVATE(kn); + splx(s); + + } else if (kev->flags & EV_DELETE) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, p); + goto done; + } + + if ((kev->flags & EV_DISABLE) && + ((kn->kn_status & KN_DISABLED) == 0)) { + s = splhigh(); + kn->kn_status |= KN_DISABLED; + splx(s); + } + + if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { + s = splhigh(); + kn->kn_status &= ~KN_DISABLED; + if ((kn->kn_status & KN_ACTIVE) && + ((kn->kn_status & KN_QUEUED) == 0)) + knote_enqueue(kn); + splx(s); + } + +done: + if (fp != NULL) + fdrop(fp, p); + return (error); +} + +static int +kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, + const struct timespec *tsp, register_t *retval, struct proc *p) +{ + struct kqueue *kq = (struct kqueue *)fp->f_data; + struct timeval atv, rtv, ttv; + int s, count, timeout, error = 0; + struct knote marker; + + count = maxevents; + if (count == 0) + goto done; + + if (tsp != NULL) { + TIMESPEC_TO_TIMEVAL(&atv, tsp); + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) + timeout = -1; + else + timeout = atv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&atv); + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + timeout = 0; + } + goto start; + +retry: + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timeout = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + +start: + s = splhigh(); + if (kq->kq_count == 0) { + if (timeout < 0) { + error = EWOULDBLOCK; + } else { + kq->kq_state |= KQ_SLEEP; + error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); + } + splx(s); + if (error == 0) + goto retry; + /* don't restart after signals... 
*/ + if (error == ERESTART) + error = EINTR; + else if (error == EWOULDBLOCK) + error = 0; + goto done; + } + + /* JMM - This marker trick doesn't work with multiple threads */ + TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); + while (count) { + int maxkev = (count > KQ_NEVENTS) ? KQ_NEVENTS : count; + struct kevent kq_kev[maxkev]; + struct kevent *kevp = kq_kev; + struct knote *kn; + int nkev = 0; + + while (nkev < maxkev) { + kn = TAILQ_FIRST(&kq->kq_head); + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + if (kn == &marker) { + if (count == maxevents) + goto retry; + break; + } else if (kn->kn_status & KN_DISABLED) { + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + continue; + } else if ((kn->kn_flags & EV_ONESHOT) == 0 && + kn->kn_fop->f_event(kn, 0) == 0) { + kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); + kq->kq_count--; + continue; + } + + *kevp = kn->kn_kevent; + kevp++; + nkev++; + count--; + + if (kn->kn_flags & EV_ONESHOT) { + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + splx(s); + kn->kn_fop->f_detach(kn); + knote_drop(kn, p); + s = splhigh(); + } else if (kn->kn_flags & EV_CLEAR) { + kn->kn_data = 0; + kn->kn_fflags = 0; + kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); + kq->kq_count--; + } else { + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); + } + } + splx(s); + error = copyout((caddr_t)kq_kev, (caddr_t)ulistp, + sizeof(struct kevent) * nkev); + if (kn == &marker) + goto done; + ulistp += nkev; + s = splhigh(); + if (error) + break; + } + TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); + splx(s); +done: + *retval = maxevents - count; + return (error); +} + +/* + * XXX + * This could be expanded to call kqueue_scan, if desired. 
+ */ +/*ARGSUSED*/ +static int +kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, + int flags, struct proc *p) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +static int +kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, + int flags, struct proc *p) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +static int +kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p) +{ + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +kqueue_select(struct file *fp, int which, void *wql, struct proc *p) +{ + struct kqueue *kq = (struct kqueue *)fp->f_data; + int retnum = 0; + int s = splnet(); + + if (which == FREAD) { + if (kq->kq_count) { + retnum = 1; + } else { + selrecord(p, &kq->kq_sel, wql); + kq->kq_state |= KQ_SEL; + } + } + splx(s); + return (retnum); +} + +/*ARGSUSED*/ +static int +kqueue_close(struct file *fp, struct proc *p) +{ + struct kqueue *kq = (struct kqueue *)fp->f_data; + struct filedesc *fdp = p->p_fd; + struct knote **knp, *kn, *kn0; + int i; + + for (i = 0; i < fdp->fd_knlistsize; i++) { + knp = &SLIST_FIRST(&fdp->fd_knlist[i]); + kn = *knp; + while (kn != NULL) { + kn0 = SLIST_NEXT(kn, kn_link); + if (kq == kn->kn_kq) { + kn->kn_fop->f_detach(kn); + fdrop(kn->kn_fp, p); + knote_free(kn); + *knp = kn0; + } else { + knp = &SLIST_NEXT(kn, kn_link); + } + kn = kn0; + } + } + if (fdp->fd_knhashmask != 0) { + for (i = 0; i < fdp->fd_knhashmask + 1; i++) { + knp = &SLIST_FIRST(&fdp->fd_knhash[i]); + kn = *knp; + while (kn != NULL) { + kn0 = SLIST_NEXT(kn, kn_link); + if (kq == kn->kn_kq) { + kn->kn_fop->f_detach(kn); + /* XXX non-fd release of kn->kn_ptr */ + knote_free(kn); + *knp = kn0; + } else { + knp = &SLIST_NEXT(kn, kn_link); + } + kn = kn0; + } + } + } + _FREE(kq, M_KQUEUE); + fp->f_data = NULL; + + return (0); +} + +/*ARGSUSED*/ +static int +kqueue_kqfilter(struct file *fp, struct knote *kn, struct proc *p) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + if (kn->kn_filter != EVFILT_READ || (kq->kq_state & 
KQ_SEL)) + return (1); + + kn->kn_fop = &kqread_filtops; + KNOTE_ATTACH(&kq->kq_sel.si_note, kn); + return (0); +} + +/*ARGSUSED*/ +int +kqueue_stat(struct file *fp, struct stat *st, struct proc *p) +{ + struct kqueue *kq = (struct kqueue *)fp->f_data; + + bzero((void *)st, sizeof(*st)); + st->st_size = kq->kq_count; + st->st_blksize = sizeof(struct kevent); + st->st_mode = S_IFIFO; + return (0); +} + +static void +kqueue_wakeup(struct kqueue *kq) +{ + + if (kq->kq_state & KQ_SLEEP) { + kq->kq_state &= ~KQ_SLEEP; + wakeup(kq); + } + if (kq->kq_state & KQ_SEL) { + // kq->kq_state &= ~KQ_SEL; /* remove for now */ + selwakeup(&kq->kq_sel); + } else + KNOTE(&kq->kq_sel.si_note, 0); +} + +void +klist_init(struct klist *list) +{ + SLIST_INIT(list); +} + +/* + * walk down a list of knotes, activating them if their event has triggered. + */ +void +knote(struct klist *list, long hint) +{ + struct knote *kn; + + SLIST_FOREACH(kn, list, kn_selnext) + if (kn->kn_fop->f_event(kn, hint)) + KNOTE_ACTIVATE(kn); +} + +/* + * attach a knote to the specified list. Return true if this is the first entry. + */ +int +knote_attach(struct klist *list, struct knote *kn) +{ + int ret = SLIST_EMPTY(list); + SLIST_INSERT_HEAD(list, kn, kn_selnext); + return ret; +} + +/* + * detach a knote from the specified list. Return true if that was the last entry. 
+ */ +int +knote_detach(struct klist *list, struct knote *kn) +{ + SLIST_REMOVE(list, kn, knote, kn_selnext); + return SLIST_EMPTY(list); +} + +/* + * remove all knotes from a specified klist + */ +void +knote_remove(struct proc *p, struct klist *list) +{ + struct knote *kn; + + while ((kn = SLIST_FIRST(list)) != NULL) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, p); + } +} + +/* + * remove all knotes referencing a specified fd + */ +void +knote_fdclose(struct proc *p, int fd) +{ + struct filedesc *fdp = p->p_fd; + struct klist *list = &fdp->fd_knlist[fd]; + + knote_remove(p, list); +} + +static void +knote_fdpattach(struct knote *kn, struct filedesc *fdp) +{ + struct klist *list; + int size; + + if (! kn->kn_fop->f_isfd) { + if (fdp->fd_knhashmask == 0) + fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, + &fdp->fd_knhashmask); + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + goto done; + } + + if (fdp->fd_knlistsize <= kn->kn_id) { + size = fdp->fd_knlistsize; + while (size <= kn->kn_id) + size += KQEXTENT; + MALLOC(list, struct klist *, + size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list, + fdp->fd_knlistsize * sizeof(struct klist *)); + bzero((caddr_t)list + + fdp->fd_knlistsize * sizeof(struct klist *), + (size - fdp->fd_knlistsize) * sizeof(struct klist *)); + if (fdp->fd_knlist != NULL) + FREE(fdp->fd_knlist, M_KQUEUE); + fdp->fd_knlistsize = size; + fdp->fd_knlist = list; + } + list = &fdp->fd_knlist[kn->kn_id]; +done: + SLIST_INSERT_HEAD(list, kn, kn_link); + kn->kn_status = 0; +} + +/* + * should be called at spl == 0, since we don't want to hold spl + * while calling fdrop and free. 
+ */ +static void +knote_drop(struct knote *kn, struct proc *p) +{ + struct filedesc *fdp = p->p_fd; + struct klist *list; + + if (kn->kn_fop->f_isfd) + list = &fdp->fd_knlist[kn->kn_id]; + else + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + + SLIST_REMOVE(list, kn, knote, kn_link); + if (kn->kn_status & KN_QUEUED) + knote_dequeue(kn); + if (kn->kn_fop->f_isfd) + fdrop(kn->kn_fp, p); + knote_free(kn); +} + + +static void +knote_enqueue(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + int s = splhigh(); + + KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); + + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); + kn->kn_status |= KN_QUEUED; + kq->kq_count++; + splx(s); + kqueue_wakeup(kq); +} + +static void +knote_dequeue(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + int s = splhigh(); + + KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); + + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + splx(s); +} + +void +knote_init(void) +{ + knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote), 8192, "knote zone"); +} +SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) + +static struct knote * +knote_alloc(void) +{ + return ((struct knote *)zalloc(knote_zone)); +} + +static void +knote_free(struct knote *kn) +{ + zfree(knote_zone, (vm_offset_t)kn); +} + +#include #include #include #include @@ -77,6 +1158,10 @@ int error; struct kern_event_pcb *ev_pcb; + error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE); + if (error) + return error; + ev_pcb = _MALLOC(sizeof(struct kern_event_pcb), M_PCB, M_WAITOK); if (ev_pcb == 0) return ENOBUFS; @@ -86,9 +1171,6 @@ so->so_pcb = (caddr_t) ev_pcb; LIST_INSERT_HEAD(&kern_event_head, ev_pcb, ev_link); - error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE); - if (error) - return error; return 0; } @@ -98,9 +1180,11 @@ { struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb; - LIST_REMOVE(ev_pcb, ev_link); - 
if (ev_pcb) - FREE(ev_pcb, M_PCB); + if (ev_pcb != 0) { + LIST_REMOVE(ev_pcb, ev_link); + FREE(ev_pcb, M_PCB); + so->so_pcb = 0; + } return 0; } diff -urN xnu-344.49/bsd/kern/kern_exec.c xnu-517/bsd/kern/kern_exec.c --- xnu-344.49/bsd/kern/kern_exec.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_exec.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -89,13 +89,18 @@ #include #include #include +#include #include #include #include +#include #include #include + +extern vm_map_t vm_map_switch(vm_map_t map); /* XXX */ + #include #include @@ -134,6 +139,45 @@ return (execve(p, args, retval)); } +extern char classichandler[32]; +extern long classichandler_fsid; +extern long classichandler_fileid; + +/* + * Helper routine to get rid of a loop in execve. Given a pointer to + * something for the arg list (which might be in kernel space or in user + * space), copy it into the kernel buffer at the currentWritePt. This code + * does the proper thing to get the data transferred. + * bytesWritten, currentWritePt, and bytesLeft are kept up-to-date. + */ + +static int copyArgument(char *argument, int pointerInKernel, + int *bytesWritten,char **currentWritePt, + int *bytesLeft){ + int error = 0; + do { + size_t len = 0; + if (*bytesLeft <= 0) { + error = E2BIG; + break; + } + if (pointerInKernel == UIO_SYSSPACE) { + error = copystr(argument, *currentWritePt, (unsigned)*bytesLeft, &len); + } else { + /* + * pointer in kernel == UIO_USERSPACE + * Copy in from user space. 
+ */ + error = copyinstr((caddr_t)argument, *currentWritePt, (unsigned)*bytesLeft, + &len); + } + *currentWritePt += len; + *bytesWritten += len; + *bytesLeft -= len; + } while (error == ENAMETOOLONG); + return error; +} + /* ARGSUSED */ int execve(p, uap, retval) @@ -143,12 +187,14 @@ { register struct ucred *cred = p->p_ucred; register struct filedesc *fdp = p->p_fd; - register nc; - register char *cp; + int nc; + char *cp; int na, ne, ucp, ap, cc; unsigned len; - int indir; - char *sharg; + int executingInterpreter=0; + + int executingClassic=0; + char binaryWithClassicName[sizeof(p->p_comm)] = {0}; char *execnamep; struct vnode *vp; struct vattr vattr; @@ -157,6 +203,10 @@ struct nameidata nd; struct ps_strings ps; #define SHSIZE 512 + /* Argument(s) to an interpreter. If we're executing a shell + * script, the name (#!/bin/csh) is allowed to be followed by + * arguments. cfarg holds these arguments. + */ char cfarg[SHSIZE]; boolean_t is_fat; kern_return_t ret; @@ -169,7 +219,10 @@ vm_map_t old_map; vm_map_t map; int i; - boolean_t new_shared_regions = FALSE; + boolean_t clean_regions = FALSE; + shared_region_mapping_t shared_region = NULL; + shared_region_mapping_t initial_region = NULL; + union { /* #! and name of interpreter */ char ex_shell[SHSIZE]; @@ -193,6 +246,12 @@ unsigned long arch_size = 0; char *ws_cache_name = NULL; /* used for pre-heat */ + /* + * XXXAUDIT: Currently, we only audit the pathname of the binary. + * There may also be poor interaction with dyld. + */ + + cfarg[0] = '\0'; /* initialize to null value. */ task = current_task(); thr_act = current_act(); uthread = get_bsdthread_info(thr_act); @@ -214,7 +273,7 @@ if (error) return(error); - savedpath = execargs; + savedpath = (char *)execargs; /* * To support new app package launching for Mac OS X, the dyld @@ -229,16 +288,26 @@ * absolute pathname. This might be unacceptable for dyld. 
*/ /* XXX We could optimize to avoid copyinstr in the namei() */ + + /* + * XXXAUDIT: Note: the double copyin introduces an audit + * race. To correct this race, we must use a single + * copyin(). + */ - error = copyinstr(uap->fname, savedpath, MAXPATHLEN, &savedpathlen); - if (error) - return (error); + error = copyinstr(uap->fname, savedpath, + MAXPATHLEN, (size_t *)&savedpathlen); + if (error) { + execargs_free(execargs); + return(error); + } /* * copyinstr will put in savedpathlen, the count of * characters (including NULL) in the path. + * No app profiles under chroot */ - if(app_profile != 0) { + if((fdp->fd_rdir == NULLVP) && (app_profile != 0)) { /* grab the name of the file out of its path */ /* we will need this for lookup within the */ @@ -253,13 +322,14 @@ } ws_cache_name++; } - + /* Save the name aside for future use */ execargsp = (vm_offset_t *)((char *)(execargs) + savedpathlen); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | SAVENAME, + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | SAVENAME | AUDITVNPATH1, UIO_USERSPACE, uap->fname, p); - if ((error = namei(&nd))) + error = namei(&nd); + if (error) goto bad1; vp = nd.ni_vp; VOP_LEASE(vp, p, p->p_ucred, LEASE_READ); @@ -273,7 +343,6 @@ goto bad; } - indir = 0; if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED)) origvattr.va_mode &= ~(VSUID | VSGID); @@ -317,27 +386,46 @@ #endif /* lint */ mach_header = &exdata.mach_header; fat_header = &exdata.fat_header; - if (mach_header->magic == MH_MAGIC) + if ((mach_header->magic == MH_CIGAM) && + (classichandler[0] == 0)) { + error = EBADARCH; + goto bad; + } else if ((mach_header->magic == MH_MAGIC) || + (mach_header->magic == MH_CIGAM)) { is_fat = FALSE; - else if (fat_header->magic == FAT_MAGIC || - fat_header->magic == FAT_CIGAM) + } else if ((fat_header->magic == FAT_MAGIC) || + (fat_header->magic == FAT_CIGAM)) { is_fat = TRUE; - else if (mach_header->magic == MH_CIGAM) { - error = EBADARCH; - goto bad; } else { + /* If we've already redirected once 
from an interpreted file + * to an interpreter, don't permit the second time. + */ if (exdata.ex_shell[0] != '#' || exdata.ex_shell[1] != '!' || - indir) { + executingInterpreter) { error = ENOEXEC; goto bad; } + if (executingClassic == 1) { + error = EBADARCH; + goto bad; + } cp = &exdata.ex_shell[2]; /* skip "#!" */ while (cp < &exdata.ex_shell[SHSIZE]) { - if (*cp == '\t') + if (*cp == '\t') /* convert all tabs to spaces */ *cp = ' '; - else if (*cp == '\n') { - *cp = '\0'; + else if (*cp == '\n' || *cp == '#') { + *cp = '\0'; /* trunc the line at nl or comment */ + + /* go back and remove the spaces before the /n or # */ + /* todo: do we have to do this if we fix the passing of args to shells ? */ + if ( cp != &exdata.ex_shell[2] ) { + do { + if ( *(cp-1) != ' ') + break; + *(--cp) = '\0'; + } while ( cp != &exdata.ex_shell[2] ); + } break; } cp++; @@ -369,14 +457,15 @@ * savedpathlen. +1 for NULL. */ savedpathlen = (cpnospace - execnamep + 1); - error = copystr(execnamep, savedpath, savedpathlen, &savedpathlen); + error = copystr(execnamep, savedpath, + savedpathlen, (size_t *)&savedpathlen); if (error) goto bad; /* Save the name aside for future use */ execargsp = (vm_offset_t *)((char *)(execargs) + savedpathlen); - indir = 1; + executingInterpreter= 1; vput(vp); nd.ni_cnd.cn_nameiop = LOOKUP; nd.ni_cnd.cn_flags = (nd.ni_cnd.cn_flags & HASBUF) | @@ -413,56 +502,7 @@ /* * Copy arguments into file in argdev area. 
*/ - if (uap->argp) for (;;) { - ap = NULL; - sharg = NULL; - if (indir && na == 0) { - sharg = nd.ni_cnd.cn_nameptr; - ap = (int)sharg; - uap->argp++; /* ignore argv[0] */ - } else if (indir && (na == 1 && cfarg[0])) { - sharg = cfarg; - ap = (int)sharg; - } else if (indir && (na == 1 || (na == 2 && cfarg[0]))) - ap = (int)uap->fname; - else if (uap->argp) { - ap = fuword((caddr_t)uap->argp); - uap->argp++; - } - if (ap == NULL && uap->envp) { - uap->argp = NULL; - if ((ap = fuword((caddr_t)uap->envp)) != NULL) - uap->envp++, ne++; - } - if (ap == NULL) - break; - na++; - if (ap == -1) { - error = EFAULT; - break; - } - do { - if (nc >= (NCARGS - savedpathlen - 2*NBPW -1)) { - error = E2BIG; - break; - } - if (sharg) { - error = copystr(sharg, cp, (unsigned)cc, &len); - sharg += len; - } else { - error = copyinstr((caddr_t)ap, cp, (unsigned)cc, - &len); - ap += len; - } - cp += len; - nc += len; - cc -= len; - } while (error == ENAMETOOLONG); - if (error) { - goto bad; - } - } - nc = (nc + NBPW-1) & ~(NBPW-1); + /* * If we have a fat file, find "our" executable. @@ -471,7 +511,8 @@ /* * Look up our architecture in the fat file. */ - lret = fatfile_getarch(vp, (vm_offset_t)fat_header, &fat_arch); + lret = fatfile_getarch_affinity(vp,(vm_offset_t)fat_header, &fat_arch, + (p->p_flag & P_AFFINITY)); if (lret != LOAD_SUCCESS) { error = load_return_to_errno(lret); goto bad; @@ -493,7 +534,8 @@ } /* Is what we found a Mach-O executable */ - if (mach_header->magic != MH_MAGIC) { + if ((mach_header->magic != MH_MAGIC) && + (mach_header->magic != MH_CIGAM)) { error = ENOEXEC; goto bad; } @@ -508,10 +550,168 @@ arch_size = (u_long)vattr.va_size; } + if ( ! 
check_cpu_subtype(mach_header->cpusubtype) ) { + error = EBADARCH; + goto bad; + } + + if (mach_header->magic == MH_CIGAM) { + + int classicBinaryLen = nd.ni_cnd.cn_namelen; + if (classicBinaryLen > MAXCOMLEN) + classicBinaryLen = MAXCOMLEN; + bcopy((caddr_t)nd.ni_cnd.cn_nameptr, + (caddr_t)binaryWithClassicName, + (unsigned)classicBinaryLen); + binaryWithClassicName[classicBinaryLen] = '\0'; + executingClassic = 1; + + vput(vp); /* cleanup? */ + nd.ni_cnd.cn_nameiop = LOOKUP; + + nd.ni_cnd.cn_flags = (nd.ni_cnd.cn_flags & HASBUF) | + /* (FOLLOW | LOCKLEAF | SAVENAME) */ + (LOCKLEAF | SAVENAME); + nd.ni_segflg = UIO_SYSSPACE; + + nd.ni_dirp = classichandler; + if ((error = namei(&nd)) != 0) { + error = EBADARCH; + goto bad1; + } + vp = nd.ni_vp; + + VOP_LEASE(vp,p,cred,LEASE_READ); + if ((error = VOP_GETATTR(vp,&vattr,p->p_ucred,p))) { + goto bad; + } + goto again; + } + + if (uap->argp != NULL) { + /* geez -- why would argp ever be NULL, and why would we proceed? */ + + /* First, handle any argument massaging */ + if (executingInterpreter && executingClassic) { + error = copyArgument(classichandler,UIO_SYSSPACE,&nc,&cp,&cc); + na++; + if (error) goto bad; + + /* Now name the interpreter. */ + error = copyArgument(savedpath,UIO_SYSSPACE,&nc,&cp,&cc); + na++; + if (error) goto bad; + + /* + * if we're running an interpreter, as we'd be passing the + * command line executable as an argument to the interpreter already. + * Doing "execve("myShellScript","bogusName",arg1,arg2,...) + * probably shouldn't ever let bogusName be seen by the shell + * script. + */ + + if (cfarg[0]) { + error = copyArgument(cfarg,UIO_SYSSPACE,&nc,&cp,&cc); + na++; + if (error) goto bad; + } + + char* originalExecutable = uap->fname; + error = copyArgument(originalExecutable,UIO_USERSPACE,&nc,&cp,&cc); + na++; + /* remove argv[0] b/c we've already placed it at */ + /* this point */ + uap->argp++; + if (error) goto bad; + + /* and continue with rest of the arguments. 
*/ + } else if (executingClassic) { + error = copyArgument(classichandler,UIO_SYSSPACE,&nc,&cp,&cc); + na++; + if (error) goto bad; + + char* originalExecutable = uap->fname; + error = copyArgument(originalExecutable,UIO_USERSPACE,&nc,&cp,&cc); + if (error) goto bad; + uap->argp++; + na++; + + /* and rest of arguments continue as before. */ + } else if (executingInterpreter) { + char *actualExecutable = nd.ni_cnd.cn_nameptr; + error = copyArgument(actualExecutable,UIO_SYSSPACE,&nc,&cp,&cc); + na++; + /* remove argv[0] b/c we just placed it in the arg list. */ + uap->argp++; + if (error) goto bad; + /* Copy the argument in the interpreter first line if there + * was one. + */ + if (cfarg[0]) { + error = copyArgument(cfarg,UIO_SYSSPACE,&nc,&cp,&cc); + na++; + if (error) goto bad; + } + + /* copy the name of the file being interpreted, gotten from + * the structures passed in to execve. + */ + error = copyArgument(uap->fname,UIO_USERSPACE,&nc,&cp,&cc); + na++; + } + /* Now, get rest of arguments */ + while (uap->argp != NULL) { + char* userArgument = (char*)fuword((caddr_t) uap->argp); + uap->argp++; + if (userArgument == NULL) { + break; + } else if ((int)userArgument == -1) { + /* Um... why would it be -1? */ + error = EFAULT; + goto bad; + } + error = copyArgument(userArgument, UIO_USERSPACE,&nc,&cp,&cc); + if (error) goto bad; + na++; + } + /* Now, get the environment */ + while (uap->envp != NULL) { + char *userEnv = (char*) fuword((caddr_t) uap->envp); + uap->envp++; + if (userEnv == NULL) { + break; + } else if ((int)userEnv == -1) { + error = EFAULT; + goto bad; + } + error = copyArgument(userEnv,UIO_USERSPACE,&nc,&cp,&cc); + if (error) goto bad; + na++; + ne++; + } + } + + /* make sure there are nulls are the end!! */ + { + int cnt = 3; + char *mp = cp; + + while ( cnt-- ) + *mp++ = '\0'; + } + + /* and round up count of bytes written to next word. 
*/ + nc = (nc + NBPW-1) & ~(NBPW-1); + + if (vattr.va_fsid == classichandler_fsid && + vattr.va_fileid == classichandler_fileid) { + executingClassic = 1; + } + if (vfexec) { kern_return_t result; - result = task_create_local(task, FALSE, FALSE, &new_task); + result = task_create_internal(task, FALSE, &new_task); if (result != KERN_SUCCESS) printf("execve: task_create failed. Code: 0x%x\n", result); p->task = new_task; @@ -526,35 +726,58 @@ uthread = get_bsdthread_info(thr_act); } else { map = VM_MAP_NULL; - } /* * Load the Mach-O file. */ - VOP_UNLOCK(vp, 0, p); + VOP_UNLOCK(vp, 0, p); /* XXX */ if(ws_cache_name) { tws_handle_startup_file(task, cred->cr_uid, - ws_cache_name, vp, &new_shared_regions); + ws_cache_name, vp, &clean_regions); } - if (new_shared_regions) { - shared_region_mapping_t new_shared_region; - shared_region_mapping_t old_shared_region; - - if (shared_file_create_system_region(&new_shared_region)) - panic("couldn't create system_shared_region\n"); - - vm_get_shared_region(task, &old_shared_region); - vm_set_shared_region(task, new_shared_region); - shared_region_mapping_dealloc(old_shared_region); + vm_get_shared_region(task, &initial_region); + int parentIsClassic = (p->p_flag & P_CLASSIC); + struct vnode *rootDir = p->p_fd->fd_rdir; + + if ((parentIsClassic && !executingClassic) || + (!parentIsClassic && executingClassic)) { + shared_region = lookup_default_shared_region( + (int)rootDir, + (executingClassic ? + CPU_TYPE_POWERPC : + machine_slot[cpu_number()].cpu_type)); + if (shared_region == NULL) { + shared_region_mapping_t old_region; + shared_region_mapping_t new_region; + vm_get_shared_region(current_task(), &old_region); + /* grrrr... this sets current_task(), not task + * -- they're different (usually) + */ + shared_file_boot_time_init( + (int)rootDir, + (executingClassic ? 
+ CPU_TYPE_POWERPC : + machine_slot[cpu_number()].cpu_type)); + if ( current_task() != task ) { + vm_get_shared_region(current_task(),&new_region); + vm_set_shared_region(task,new_region); + vm_set_shared_region(current_task(),old_region); + } + } else { + vm_set_shared_region(task, shared_region); + } + shared_region_mapping_dealloc(initial_region); } - + lret = load_machfile(vp, mach_header, arch_offset, - arch_size, &load_result, thr_act, map); + arch_size, &load_result, thr_act, map, clean_regions); if (lret != LOAD_SUCCESS) { error = load_return_to_errno(lret); + vrele(vp); + vp = NULL; goto badtoolate; } @@ -587,6 +810,14 @@ if (origvattr.va_mode & VSGID) p->p_ucred->cr_gid = origvattr.va_gid; + /* + * Have mach reset the task port. We don't want + * anyone who had the task port before a setuid + * exec to be able to access/control the task + * after. + */ + ipc_task_reset(task); + set_security_token(p); p->p_flag |= P_SUGID; @@ -626,13 +857,17 @@ p->p_cred->p_svuid = p->p_ucred->cr_uid; p->p_cred->p_svgid = p->p_ucred->cr_gid; + KNOTE(&p->p_klist, NOTE_EXEC); + if (!vfexec && (p->p_flag & P_TRACED)) psignal(p, SIGTRAP); if (error) { + vrele(vp); + vp = NULL; goto badtoolate; } - VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY, p); + VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ vput(vp); vp = NULL; @@ -652,7 +887,7 @@ */ - ucp = p->user_stack; + ucp = (int)p->user_stack; if (vfexec) { old_map = vm_map_switch(get_task_map(task)); } @@ -666,17 +901,26 @@ * the "path" at the begining of the execargs buffer. * copy it just before the string area. 
*/ - savedpathlen = (savedpathlen + NBPW-1) & ~(NBPW-1); len = 0; - pathptr = ucp - savedpathlen; + pathptr = ucp - ((savedpathlen + NBPW-1) & ~(NBPW-1)); error = copyoutstr(savedpath, (caddr_t)pathptr, - (unsigned)savedpathlen, &len); + (unsigned)savedpathlen, (size_t *)&len); + savedpathlen = (savedpathlen + NBPW-1) & ~(NBPW-1); + if (error) { if (vfexec) vm_map_switch(old_map); goto badtoolate; } - + + /* + * Record the size of the arguments area so that + * sysctl_procargs() can return the argument area without having + * to parse the arguments. + */ + p->p_argslen = (int)p->user_stack - pathptr; + p->p_argc = na - ne; /* save argc for sysctl_procargs() */ + /* Save a NULL pointer below it */ (void) suword((caddr_t)(pathptr - NBPW), 0); @@ -717,7 +961,7 @@ (void) suword((caddr_t)ap, ucp); do { error = copyoutstr(cp, (caddr_t)ucp, - (unsigned)cc, &len); + (unsigned)cc, (size_t *)&len); ucp += len; cp += len; nc += len; @@ -762,9 +1006,16 @@ * which specify close-on-exec. */ fdexec(p); + + /* + * need to cancel async IO requests that can be cancelled and wait for those + * already active. MAY BLOCK! + */ + _aio_exec( p ); + /* FIXME: Till vmspace inherit is fixed: */ if (!vfexec && p->vm_shm) - shmexit(p); + shmexec(p); /* Clean up the semaphores */ semexit(p); @@ -772,11 +1023,20 @@ * Remember file name for accounting. */ p->p_acflag &= ~AFORK; - if (nd.ni_cnd.cn_namelen > MAXCOMLEN) - nd.ni_cnd.cn_namelen = MAXCOMLEN; - bcopy((caddr_t)nd.ni_cnd.cn_nameptr, (caddr_t)p->p_comm, - (unsigned)nd.ni_cnd.cn_namelen); - p->p_comm[nd.ni_cnd.cn_namelen] = '\0'; + /* If the translated name isn't NULL, then we want to use + * that translated name as the name we show as the "real" name. + * Otherwise, use the name passed into exec. 
+ */ + if (0 != binaryWithClassicName[0]) { + bcopy((caddr_t)binaryWithClassicName, (caddr_t)p->p_comm, + sizeof(binaryWithClassicName)); + } else { + if (nd.ni_cnd.cn_namelen > MAXCOMLEN) + nd.ni_cnd.cn_namelen = MAXCOMLEN; + bcopy((caddr_t)nd.ni_cnd.cn_nameptr, (caddr_t)p->p_comm, + (unsigned)nd.ni_cnd.cn_namelen); + p->p_comm[nd.ni_cnd.cn_namelen] = '\0'; + } { /* This is for kdebug */ @@ -785,14 +1045,29 @@ /* Collect the pathname for tracing */ kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4); + + if (vfexec) + { + KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE, + p->p_pid ,0,0,0, (unsigned int)thr_act); KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE, - dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, getshuttle_thread(thr_act)); + dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (unsigned int)thr_act); + } else + { + KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE, + p->p_pid ,0,0,0,0); KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE, dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0); + } } + if (executingClassic) + p->p_flag |= P_CLASSIC | P_AFFINITY; + else + p->p_flag &= ~P_CLASSIC; + /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has it's own resources back @@ -842,11 +1117,12 @@ vm_size_t size; vm_offset_t addr; - p->user_stack = user_stack; + p->user_stack = (caddr_t)user_stack; if (!customstack) { - size = round_page(unix_stack_size(p)); - addr = trunc_page(user_stack - size); - return (vm_allocate(map,&addr, size, FALSE)); + size = round_page_64(unix_stack_size(p)); + addr = trunc_page_32(user_stack - size); + return (vm_allocate(map, &addr, size, + VM_MAKE_TAG(VM_MEMORY_STACK) | FALSE)); } else return(KERN_SUCCESS); } @@ -974,7 +1250,7 @@ { switch (lrtn) { case LOAD_SUCCESS: - return 0; + return 0; case LOAD_BADARCH: return EBADARCH; case LOAD_BADMACHO: @@ -982,10 +1258,14 @@ case LOAD_SHLIB: return ESHLIBVERS; 
case LOAD_NOSPACE: + case LOAD_RESOURCE: return ENOMEM; case LOAD_PROTECT: return EACCES; - case LOAD_RESOURCE: + case LOAD_ENOENT: + return ENOENT; + case LOAD_IOERROR: + return EIO; case LOAD_FAILURE: default: return EBADEXEC; @@ -1046,9 +1326,10 @@ } kret = kmem_alloc_pageable(bsd_pageable_map, addrp, NCARGS); - if (kret != KERN_SUCCESS) + if (kret != KERN_SUCCESS) { + semaphore_signal(execve_semaphore); return (ENOMEM); - + } return (0); } @@ -1074,4 +1355,3 @@ return (EINVAL); } } - diff -urN xnu-344.49/bsd/kern/kern_exit.c xnu-517/bsd/kern/kern_exit.c --- xnu-344.49/bsd/kern/kern_exit.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_exit.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -85,6 +85,8 @@ #include #include #include +#include +#include #include #include @@ -97,6 +99,9 @@ extern char init_task_failure_data[]; int exit1 __P((struct proc *, int, int *)); +void proc_prepareexit(struct proc *p); +void vfork_exit(struct proc *p, int rv); +void vproc_exit(struct proc *p); /* * exit -- @@ -134,8 +139,7 @@ int * retval; { register struct proc *q, *nq; - thread_t self = current_thread(); - thread_act_t th_act_self = current_act(); + thread_act_t self = current_act(); struct task *task = p->task; register int i,s; struct uthread *ut; @@ -146,22 +150,23 @@ * right here. 
*/ - ut = get_bsdthread_info(th_act_self); + ut = get_bsdthread_info(self); if (ut->uu_flag & P_VFORK) { - (void)vfork_exit(p, rv); - vfork_return(th_act_self, p->p_pptr, p , retval); + vfork_exit(p, rv); + vfork_return(self, p->p_pptr, p , retval); unix_syscall_return(0); /* NOT REACHED */ } + audit_syscall_exit(0, p, ut); /* Exit is always successfull */ signal_lock(p); while (p->exit_thread != self) { if (sig_try_locked(p) <= 0) { - if (get_threadtask(th_act_self) != task) { + if (get_threadtask(self) != task) { signal_unlock(p); return(0); } signal_unlock(p); - thread_terminate(th_act_self); + thread_terminate(self); thread_funnel_set(kernel_flock, FALSE); thread_exception_return(); /* NOTREACHED */ @@ -179,27 +184,12 @@ s = splsched(); p->p_flag |= P_WEXIT; splx(s); - (void)proc_prepareexit(p); + proc_prepareexit(p); p->p_xstat = rv; /* task terminate will call proc_terminate and that cleans it up */ task_terminate_internal(task); - /* - * we come back and returns to AST which - * should cleanup the rest - */ -#if 0 - if (task == current_task()) { - thread_exception_return(); - /*NOTREACHED*/ - } - - while (task == current_task()) { - thread_terminate_self(); - /*NOTREACHED*/ - } -#endif return(0); } @@ -208,8 +198,12 @@ { int s; struct uthread *ut; - thread_t self = current_thread(); - thread_act_t th_act_self = current_act(); + exception_data_t code[EXCEPTION_CODE_MAX]; + thread_act_t self = current_act(); + + code[0] = 0xFF000001; /* Set terminate code */ + code[1] = p->p_pid; /* Pass out the pid */ + (void)sys_perf_notify(p->task, &code, 2); /* Notify the perf server */ /* * Remove proc from allproc queue and from pidhash chain. @@ -218,6 +212,7 @@ * in partially cleaned state. */ LIST_REMOVE(p, p_list); + LIST_INSERT_HEAD(&zombproc, p, p_list); /* Place onto zombproc. 
*/ LIST_REMOVE(p, p_hash); #ifdef PGINPROF @@ -230,7 +225,7 @@ p->p_flag &= ~(P_TRACED | P_PPWAIT); p->p_sigignore = ~0; p->p_siglist = 0; - ut = get_bsdthread_info(th_act_self); + ut = get_bsdthread_info(self); ut->uu_siglist = 0; untimeout(realitexpire, (caddr_t)p->p_pid); } @@ -239,8 +234,6 @@ proc_exit(struct proc *p) { register struct proc *q, *nq, *pp; - thread_t self = current_thread(); - thread_act_t th_act_self = current_act(); struct task *task = p->task; register int i,s; boolean_t funnel_state; @@ -261,6 +254,12 @@ sizeof (*p->p_ru), M_ZOMBIE, M_WAITOK); /* + * need to cancel async IO requests that can be cancelled and wait for those + * already active. MAY BLOCK! + */ + _aio_exit( p ); + + /* * Close open files and release open-file table. * This may block! */ @@ -337,9 +336,6 @@ if (q->p_flag & P_TRACED) { q->p_flag &= ~P_TRACED; if (q->sigwait_thread) { - thread_t sig_shuttle; - - sig_shuttle = (thread_t)getshuttle_thread((thread_act_t)q->sigwait_thread); /* * The sigwait_thread could be stopped at a * breakpoint. Wake it up to kill. @@ -348,7 +344,7 @@ * the process would result into a deadlock on q->sigwait. */ thread_resume((thread_act_t)q->sigwait_thread); - clear_wait(sig_shuttle, THREAD_INTERRUPTED); + clear_wait(q->sigwait_thread, THREAD_INTERRUPTED); threadsignal((thread_act_t)q->sigwait_thread, SIGKILL, 0); } psignal(q, SIGKILL); @@ -421,6 +417,9 @@ FREE_ZONE(p->p_limit, sizeof *p->p_limit, M_SUBPROC); p->p_limit = NULL; + /* Free the auditing info */ + audit_proc_free(p); + /* * Finish up by terminating the task * and halt this thread (only if a @@ -430,12 +429,20 @@ //task->proc = NULL; set_bsdtask_info(task, NULL); + KNOTE(&p->p_klist, NOTE_EXIT); + /* * Notify parent that we're gone. 
*/ if (p->p_pptr->p_flag & P_NOCLDWAIT) { struct proc * pp = p->p_pptr; + /* + * Add child resource usage to parent before giving + * zombie to init + */ + ruadd(&p->p_pptr->p_stats->p_cru, p->p_ru); + proc_reparent(p, initproc); /* If there are no more children wakeup parent */ if (LIST_EMPTY(&pp->p_children)) @@ -452,8 +459,7 @@ psignal(pp, SIGCHLD); - /* Place onto zombproc. */ - LIST_INSERT_HEAD(&zombproc, p, p_list); + /* mark as a zombie */ p->p_stat = SZOMB; /* and now wakeup the parent */ @@ -540,7 +546,7 @@ thread = current_act(); vt = (void *)get_bsduthreadarg(thread); retval = (int *)get_bsduthreadrval(thread); - wait1((struct proc *)p, (struct wait4_args *)vt, retval, 0); + return(wait1((struct proc *)p, (struct wait4_args *)vt, retval, 0)); } int @@ -777,11 +783,11 @@ int rv; { register struct proc *q, *nq; - thread_t self = current_thread(); - thread_act_t th_act_self = current_act(); + thread_act_t self = current_act(); struct task *task = p->task; register int i,s; struct uthread *ut; + exception_data_t code[EXCEPTION_CODE_MAX]; /* * If a thread in this task has already @@ -789,17 +795,17 @@ * right here. */ - ut = get_bsdthread_info(th_act_self); + ut = get_bsdthread_info(self); #ifdef FIXME signal_lock(p); while (p->exit_thread != self) { if (sig_try_locked(p) <= 0) { - if (get_threadtask(th_act_self) != task) { + if (get_threadtask(self) != task) { signal_unlock(p); return; } signal_unlock(p); - thread_terminate(th_act_self); + thread_terminate(self); thread_funnel_set(kernel_flock, FALSE); thread_exception_return(); /* NOTREACHED */ @@ -817,6 +823,11 @@ s = splsched(); p->p_flag |= P_WEXIT; splx(s); + + code[0] = 0xFF000001; /* Set terminate code */ + code[1] = p->p_pid; /* Pass out the pid */ + (void)sys_perf_notify(p->task, &code, 2); /* Notify the perf server */ + /* * Remove proc from allproc queue and from pidhash chain. * Need to do this before we do anything that can block. @@ -824,6 +835,7 @@ * in partially cleaned state. 
*/ LIST_REMOVE(p, p_list); + LIST_INSERT_HEAD(&zombproc, p, p_list); /* Place onto zombproc. */ LIST_REMOVE(p, p_hash); /* * If parent is waiting for us to exit or exec, @@ -838,15 +850,13 @@ p->p_xstat = rv; - (void)vproc_exit(p); + vproc_exit(p); } void vproc_exit(struct proc *p) { register struct proc *q, *nq, *pp; - thread_t self = current_thread(); - thread_act_t th_act_self = current_act(); struct task *task = p->task; register int i,s; boolean_t funnel_state; @@ -924,9 +934,6 @@ if (q->p_flag & P_TRACED) { q->p_flag &= ~P_TRACED; if (q->sigwait_thread) { - thread_t sig_shuttle; - - sig_shuttle = (thread_t) getshuttle_thread((thread_act_t)q->sigwait_thread); /* * The sigwait_thread could be stopped at a * breakpoint. Wake it up to kill. @@ -935,7 +942,7 @@ * the process would result into a deadlock on q->sigwait. */ thread_resume((thread_act_t)q->sigwait_thread); - clear_wait(sig_shuttle, THREAD_INTERRUPTED); + clear_wait(q->sigwait_thread, THREAD_INTERRUPTED); threadsignal((thread_act_t)q->sigwait_thread, SIGKILL, 0); } psignal(q, SIGKILL); @@ -1029,8 +1036,7 @@ } psignal(p->p_pptr, SIGCHLD); - /* Place onto zombproc. */ - LIST_INSERT_HEAD(&zombproc, p, p_list); + /* mark as a zombie */ p->p_stat = SZOMB; /* and now wakeup the parent */ diff -urN xnu-344.49/bsd/kern/kern_fork.c xnu-517/bsd/kern/kern_fork.c --- xnu-344.49/bsd/kern/kern_fork.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_fork.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -63,6 +63,7 @@ * @(#)kern_fork.c 8.8 (Berkeley) 2/14/95 */ +#include #include #include #include @@ -74,6 +75,7 @@ #include #include #include +#include #if KTRACE #include #endif @@ -146,6 +148,7 @@ ut = (struct uthread *)get_bsdthread_info(cur_act); if (ut->uu_flag & P_VFORK) { printf("vfork called recursively by %s\n", p->p_comm); + (void)chgproccnt(uid, -1); return (EINVAL); } p->p_flag |= P_VFORK; @@ -204,7 +207,6 @@ { long flags; register uid_t uid; - thread_t newth, self = current_thread(); thread_act_t cur_act = (thread_act_t)current_act(); int s, count; task_t t; @@ -245,12 +247,13 @@ thread_act_t thread; task_t task; kern_return_t result; + pmap_t pmap; extern task_t kernel_task; if (parent->task == kernel_task) - result = task_create_local(TASK_NULL, FALSE, FALSE, &task); + result = task_create_internal(TASK_NULL, FALSE, &task); else - result = task_create_local(parent->task, TRUE, FALSE, &task); + result = task_create_internal(parent->task, TRUE, &task); if (result != KERN_SUCCESS) printf("fork/procdup: task_create failed. Code: 0x%x\n", result); child->task = task; @@ -258,6 +261,7 @@ set_bsdtask_info(task, child); if (child->p_nice != 0) resetpriority(child); + result = thread_create(task, &thread); if (result != KERN_SUCCESS) printf("fork/procdup: thread_create failed. Code: 0x%x\n", result); @@ -333,6 +337,8 @@ } act_deallocate(newth); + KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); + while (p2->p_flag & P_PPWAIT) tsleep(p1, PWAIT, "ppwait", 0); @@ -465,17 +471,25 @@ p2->vm_shm = (void *)NULL; /* Make sure it is zero */ /* + * Copy the audit info. + */ + audit_proc_fork(p1, p2); + + /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_fork. 
*/ p2->p_flag = P_INMEM; + p2->p_flag |= (p1->p_flag & P_CLASSIC); // copy from parent + p2->p_flag |= (p1->p_flag & P_AFFINITY); // copy from parent if (p1->p_flag & P_PROFIL) startprofclock(p2); bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred)); p2->p_cred->p_refcnt = 1; crhold(p1->p_ucred); lockinit(&p2->p_cred->pc_lock, PLOCK, "proc cred", 0, 0); + klist_init(&p2->p_klist); /* bump references to the text vnode */ p2->p_textvp = p1->p_textvp; @@ -515,6 +529,8 @@ if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; + p2->p_argslen = p1->p_argslen; + p2->p_argc = p1->p_argc; p2->p_xstat = 0; p2->p_ru = NULL; @@ -527,10 +543,13 @@ p2->sigwait_thread = NULL; p2->exit_thread = NULL; p2->user_stack = p1->user_stack; - p2->p_xxxsigpending = 0; p2->p_vforkcnt = 0; p2->p_vforkact = 0; TAILQ_INIT(&p2->p_uthlist); + TAILQ_INIT(&p2->aio_activeq); + TAILQ_INIT(&p2->aio_doneq); + p2->aio_active_count = 0; + p2->aio_done_count = 0; #if KTRACE /* @@ -581,7 +600,7 @@ if (task != kernel_task) { uth = (struct uthread *)ut; - p = get_bsdtask_info(task); + p = (struct proc *) get_bsdtask_info(task); funnel_state = thread_funnel_set(kernel_flock, TRUE); uth_parent = (struct uthread *)get_bsdthread_info(current_act()); @@ -612,6 +631,15 @@ extern task_t kernel_task; int size; boolean_t funnel_state; + struct nlminfo *nlmp; + + /* + * Per-thread audit state should never last beyond system + * call return. Since we don't audit the thread creation/ + * removal, the thread state pointer should never be + * non-NULL when we get here. 
+ */ + assert(uth->uu_ar == NULL); sel = &uth->uu_state.ss_select; /* cleanup the select bit space */ @@ -626,6 +654,11 @@ sel->allocsize = 0; uth->uu_wqsub = 0; sel->wql = 0; + } + + if ((nlmp = uth->uu_nlminfo)) { + uth->uu_nlminfo = 0; + FREE(nlmp, M_LOCKF); } if ((task != kernel_task) && p) { diff -urN xnu-344.49/bsd/kern/kern_ktrace.c xnu-517/bsd/kern/kern_ktrace.c --- xnu-344.49/bsd/kern/kern_ktrace.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_ktrace.c Sat Oct 25 00:25:25 2003 @@ -419,7 +419,7 @@ p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_USER); MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK); - if (!copyin(uap->addr, cp, uap->len)) { + if (!copyin((caddr_t)uap->addr, cp, uap->len)) { kth->ktr_buf = cp; kth->ktr_len = uap->len; ktrwrite(p->p_tracep, kth, NULL, KERNEL_FUNNEL); @@ -641,7 +641,8 @@ target->p_ruid == target->p_svuid && caller->p_rgid == target->p_rgid && /* XXX */ target->p_rgid == target->p_svgid && - (targetp->p_traceflag & KTRFAC_ROOT) == 0) || + (targetp->p_traceflag & KTRFAC_ROOT) == 0 && + (targetp->p_flag & P_SUGID) == 0) || caller->pc_ucred->cr_uid == 0) return (1); diff -urN xnu-344.49/bsd/kern/kern_lock.c xnu-517/bsd/kern/kern_lock.c --- xnu-344.49/bsd/kern/kern_lock.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_lock.c Sat Oct 25 00:25:25 2003 @@ -192,7 +192,7 @@ int extflags; void *self; - error = 0; self = current_thread(); + error = 0; self = current_act(); if (p) pid = p->p_pid; else diff -urN xnu-344.49/bsd/kern/kern_malloc.c xnu-517/bsd/kern/kern_malloc.c --- xnu-344.49/bsd/kern/kern_malloc.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_malloc.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -71,6 +71,9 @@ #include #include +#include +#include + #include #include #include @@ -86,6 +89,7 @@ #include #include +#include #include #include @@ -210,12 +214,15 @@ 0, KMZ_MALLOC, /* 88 M_IP6MISC */ 0, KMZ_MALLOC, /* 89 M_TSEGQ */ 0, KMZ_MALLOC, /* 90 M_IGMP */ - SOS(journal), KMZ_CREATEZONE, /* 91 M_JNL_JNL */ + SOS(journal), KMZ_CREATEZONE, /* 91 M_JNL_JNL */ SOS(transaction), KMZ_CREATEZONE, /* 92 M_JNL_TR */ + SOS(specinfo), KMZ_CREATEZONE, /* 93 M_SPECINFO */ + SOS(kqueue), KMZ_CREATEZONE, /* 94 M_KQUEUE */ #undef SOS #undef SOX }; +extern zone_t kalloc_zone(vm_size_t); /* XXX */ /* * Initialize the kernel memory allocator @@ -277,7 +284,7 @@ char dat[0]; }; -#define ZEROSIZETOKEN 0xFADEDFAD +#define ZEROSIZETOKEN (void *)0xFADEDFAD void *_MALLOC( size_t size, @@ -306,6 +313,9 @@ return (0); mem->hdr.mlen = memsize; + + if (flags & M_ZERO) + bzero(mem->hdr.dat, size); return (mem->hdr.dat); } diff -urN xnu-344.49/bsd/kern/kern_mib.c xnu-517/bsd/kern/kern_mib.c --- xnu-344.49/bsd/kern/kern_mib.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_mib.c Sat Oct 25 00:25:25 2003 @@ -304,12 +304,6 @@ SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN, 0, 1, ""); /* always set */ /* - * Export of _cpu_capabilities to userspace, consumed by the pthread code - * only. - */ -SYSCTL_INT(_hw, OID_AUTO, _cpu_capabilities, CTLFLAG_RD, &_cpu_capabilities, 0, ""); - -/* * Deprecated variables. These are supported for backwards compatibility * purposes only. The MASKED flag requests that the variables not be * printed by sysctl(8) and similar utilities. 
@@ -332,7 +326,7 @@ SYSCTL_INT (_hw, HW_TB_FREQ, tbfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED, &gPEClockFrequencyInfo.timebase_frequency_hz, 0, ""); SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_MACHINE, sysctl_hw_generic, "A", ""); SYSCTL_PROC(_hw, HW_MODEL, model, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_MODEL, sysctl_hw_generic, "A", ""); -SYSCTL_INT (_hw, HW_PHYSMEM, physmem, CTLFLAG_RD | CTLFLAG_MASKED, &mem_size, 0, ""); +SYSCTL_UINT(_hw, HW_PHYSMEM, physmem, CTLFLAG_RD | CTLFLAG_MASKED, &mem_size, 0, ""); SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_USERMEM, sysctl_hw_generic, "I", ""); SYSCTL_PROC(_hw, HW_EPOCH, epoch, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_EPOCH, sysctl_hw_generic, "I", ""); SYSCTL_PROC(_hw, HW_VECTORUNIT, vectorunit, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_VECTORUNIT, sysctl_hw_generic, "I", ""); diff -urN xnu-344.49/bsd/kern/kern_mman.c xnu-517/bsd/kern/kern_mman.c --- xnu-344.49/bsd/kern/kern_mman.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_mman.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -151,6 +151,7 @@ long pos; }; +int osmmap(curp, uap, retval) struct proc *curp; register struct osmmap_args *uap; @@ -246,7 +247,7 @@ /* Adjust size for rounding (on both ends). */ user_size += pageoff; /* low end... */ - user_size = (vm_size_t) round_page(user_size); /* hi end */ + user_size = (vm_size_t) round_page_32(user_size); /* hi end */ /* @@ -280,8 +281,8 @@ * There should really be a pmap call to determine a reasonable * location. 
*/ - else if (addr < round_page(p->p_vmspace->vm_daddr + MAXDSIZ)) - addr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); + else if (addr < round_page_32(p->p_vmspace->vm_daddr + MAXDSIZ)) + addr = round_page_32(p->p_vmspace->vm_daddr + MAXDSIZ); #endif @@ -303,7 +304,7 @@ if (err) return(err); if(fp->f_type == DTYPE_PSXSHM) { - uap->addr = user_addr; + uap->addr = (caddr_t)user_addr; uap->len = user_size; uap->prot = prot; uap->flags = flags; @@ -322,7 +323,7 @@ * SunOS). */ if (vp->v_type == VCHR || vp->v_type == VSTR) { - return(EOPNOTSUPP); + return(ENODEV); } else { /* * Ensure that file and memory protections are @@ -374,7 +375,7 @@ * We bend a little - round the start and end addresses * to the nearest page boundary. */ - user_size = round_page(user_size); + user_size = round_page_32(user_size); if (file_pos & PAGE_MASK_64) return (EINVAL); @@ -383,9 +384,9 @@ if ((flags & MAP_FIXED) == 0) { find_space = TRUE; - user_addr = round_page(user_addr); + user_addr = round_page_32(user_addr); } else { - if (user_addr != trunc_page(user_addr)) + if (user_addr != trunc_page_32(user_addr)) return (EINVAL); find_space = FALSE; (void) vm_deallocate(user_map, user_addr, user_size); @@ -419,9 +420,16 @@ if (result != KERN_SUCCESS) goto out; + result = vm_protect(user_map, user_addr, user_size, TRUE, maxprot); + if (result != KERN_SUCCESS) + goto out; + result = vm_protect(user_map, user_addr, user_size, FALSE, prot); + if (result != KERN_SUCCESS) + goto out; + } else { UBCINFOCHECK("mmap", vp); - pager = ubc_getpager(vp); + pager = (vm_pager_t)ubc_getpager(vp); if (pager == NULL) return (ENOMEM); @@ -461,7 +469,7 @@ ubc_map(vp); } - if (flags & (MAP_SHARED|MAP_INHERIT)) { + if (flags & MAP_SHARED) { result = vm_inherit(user_map, user_addr, user_size, VM_INHERIT_SHARE); if (result != KERN_SUCCESS) { @@ -510,7 +518,7 @@ pageoff = (addr & PAGE_MASK); addr -= pageoff; size = uap->len; - size = (vm_size_t) round_page(size); + size = (vm_size_t) round_page_32(size); flags = 
uap->flags; if (addr + size < addr) @@ -518,6 +526,9 @@ user_map = current_map(); + if ((flags & (MS_ASYNC|MS_SYNC)) == (MS_ASYNC|MS_SYNC)) + return (EINVAL); + if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); @@ -529,7 +540,7 @@ * inaccurate results, lets just return error as invalid size * specified */ - return(EINVAL); + return (EINVAL); /* XXX breaks posix apps */ } if (flags & MS_KILLPAGES) @@ -559,10 +570,10 @@ } return (0); - } +int mremap() { /* Not yet implemented */ @@ -573,6 +584,7 @@ caddr_t addr; int len; }; +int munmap(p, uap, retval) struct proc *p; struct munmap_args *uap; @@ -590,7 +602,7 @@ user_addr -= pageoff; user_size += pageoff; - user_size = round_page(user_size); + user_size = round_page_32(user_size); if (user_addr + user_size < user_addr) return(EINVAL); @@ -654,7 +666,7 @@ pageoff = (user_addr & PAGE_MASK); user_addr -= pageoff; user_size += pageoff; - user_size = round_page(user_size); + user_size = round_page_32(user_size); if (user_addr + user_size < user_addr) return(EINVAL); @@ -697,7 +709,7 @@ pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; - size = (vm_size_t) round_page(size); + size = (vm_size_t) round_page_32(size); if (addr + size < addr) return(EINVAL); @@ -747,8 +759,8 @@ * Since this routine is only advisory, we default to conservative * behavior. */ - start = trunc_page((vm_offset_t) uap->addr); - end = round_page((vm_offset_t) uap->addr + uap->len); + start = trunc_page_32((vm_offset_t) uap->addr); + end = round_page_32((vm_offset_t) uap->addr + uap->len); user_map = current_map(); @@ -812,8 +824,8 @@ * Make sure that the addresses presented are valid for user * mode. 
*/ - first_addr = addr = trunc_page((vm_offset_t) uap->addr); - end = addr + (vm_size_t)round_page(uap->len); + first_addr = addr = trunc_page_32((vm_offset_t) uap->addr); + end = addr + (vm_size_t)round_page_32(uap->len); if (VM_MAX_ADDRESS > 0 && end > VM_MAX_ADDRESS) return (EINVAL); @@ -913,7 +925,7 @@ pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; - size = (vm_size_t) round_page(size); + size = (vm_size_t) round_page_32(size); /* disable wrap around */ if (addr + size < addr) @@ -962,7 +974,7 @@ pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; - size = (vm_size_t) round_page(size); + size = (vm_size_t) round_page_32(size); /* disable wrap around */ if (addr + size < addr) @@ -1014,6 +1026,7 @@ struct obreak_args { char *nsiz; }; +int obreak(p, uap, retval) struct proc *p; struct obreak_args *uap; @@ -1025,6 +1038,7 @@ int both; +int ovadvise() { @@ -1033,12 +1047,11 @@ #endif } /* END DEFUNCT */ -#if 1 -int print_map_addr=0; -#endif /* 1 */ /* CDY need to fix interface to allow user to map above 32 bits */ -kern_return_t map_fd( +/* USV: No! need to obsolete map_fd()! mmap() already supports 64 bits */ +kern_return_t +map_fd( int fd, vm_offset_t offset, vm_offset_t *va, @@ -1058,7 +1071,8 @@ return ret; } -kern_return_t map_fd_funneled( +kern_return_t +map_fd_funneled( int fd, vm_object_offset_t offset, vm_offset_t *va, @@ -1075,9 +1089,6 @@ int err=0; vm_map_t my_map; struct proc *p =(struct proc *)current_proc(); -#if 0 - extern int print_map_addr; -#endif /* 0 */ /* * Find the inode; verify that it's a regular file. @@ -1102,7 +1113,7 @@ printf("map_fd: file offset not page aligned(%d : %s)\n",p->p_pid, p->p_comm); return (KERN_INVALID_ARGUMENT); } - map_size = round_page(size); + map_size = round_page_32(size); /* * Allow user to map in a zero length file. 
@@ -1135,7 +1146,7 @@ vm_map_copy_t tmp; if (copyin(va, &dst_addr, sizeof (dst_addr)) || - trunc_page(dst_addr) != dst_addr) { + trunc_page_32(dst_addr) != dst_addr) { (void) vm_map_remove( my_map, map_addr, map_addr + map_size, diff -urN xnu-344.49/bsd/kern/kern_newsysctl.c xnu-517/bsd/kern/kern_newsysctl.c --- xnu-344.49/bsd/kern/kern_newsysctl.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_newsysctl.c Sat Oct 25 00:25:25 2003 @@ -80,6 +80,7 @@ */ extern struct sysctl_oid *newsysctl_list[]; +extern struct sysctl_oid *machdep_sysctl_list[]; static void @@ -211,12 +212,13 @@ void sysctl_register_fixed() { - int i = 0; + int i; - - while (newsysctl_list[i]) { -/* printf("Registering %d\n", i); */ - sysctl_register_oid(newsysctl_list[i++]); + for (i=0; newsysctl_list[i]; i++) { + sysctl_register_oid(newsysctl_list[i]); + } + for (i=0; machdep_sysctl_list[i]; i++) { + sysctl_register_oid(machdep_sysctl_list[i]); } } @@ -1052,6 +1054,9 @@ } return (error); } + +/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ +#define KINFO_BSDI_SYSINFO (101<<8) /* * Kernel versions of the userland sysctl helper functions. 
diff -urN xnu-344.49/bsd/kern/kern_panicinfo.c xnu-517/bsd/kern/kern_panicinfo.c --- xnu-344.49/bsd/kern/kern_panicinfo.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_panicinfo.c Sat Oct 25 00:25:25 2003 @@ -192,8 +192,8 @@ off_t filesize = 0; size_t len; vm_offset_t image; - vm_offset_t oimage; - vm_size_t osize; + vm_offset_t oimage = NULL; + vm_size_t osize = 0; /* covariable: quiet compiler */ len = strlen(imname); oldstr = image_pathname; diff -urN xnu-344.49/bsd/kern/kern_pcsamples.c xnu-517/bsd/kern/kern_pcsamples.c --- xnu-344.49/bsd/kern/kern_pcsamples.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_pcsamples.c Sat Oct 25 00:25:25 2003 @@ -184,7 +184,7 @@ pcsample_enable = 0; if (pc_bufsize && pc_buffer) - kmem_free(kernel_map,pc_buffer,pc_bufsize); + kmem_free(kernel_map, (vm_offset_t)pc_buffer, pc_bufsize); ret= pcsamples_bootstrap(); return(ret); @@ -196,7 +196,7 @@ global_state_pid = -1; pcsample_enable = 0; if(pc_bufsize && pc_buffer) - kmem_free(kernel_map,pc_buffer,pc_bufsize); + kmem_free(kernel_map, (vm_offset_t)pc_buffer, pc_bufsize); pc_buffer = (u_long *)0; pc_bufptr = (u_long *)0; pc_buflast = (u_long *)0; diff -urN xnu-344.49/bsd/kern/kern_proc.c xnu-517/bsd/kern/kern_proc.c --- xnu-344.49/bsd/kern/kern_proc.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_proc.c Sat Oct 25 00:25:25 2003 @@ -207,6 +207,21 @@ } /* + * Locate a zombie by PID + */ +__private_extern__ struct proc * +pzfind(pid) + register pid_t pid; +{ + register struct proc *p; + + for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_pid == pid) + return (p); + return (NULL); +} + +/* * Locate a process group by number */ struct pgrp * @@ -440,6 +455,12 @@ } } #endif /* DEBUG */ + +int +proc_is_classic(struct proc *p) +{ + return (p->p_flag & P_CLASSIC) ? 
1 : 0; +} struct proc * current_proc_EXTERNAL() { diff -urN xnu-344.49/bsd/kern/kern_prot.c xnu-517/bsd/kern/kern_prot.c --- xnu-344.49/bsd/kern/kern_prot.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_prot.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -279,7 +280,7 @@ register_t *retval; { - if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) { + if (p->p_pgid == p->p_pid || pgfind(p->p_pid) || p->p_flag & P_INVFORK) { return (EPERM); } else { (void)enterpgrp(p, p->p_pid, 1); @@ -329,7 +330,7 @@ uap->pgid = targp->p_pid; else if (uap->pgid != targp->p_pid) if ((pgrp = pgfind(uap->pgid)) == 0 || - pgrp->pg_session != curp->p_session) + pgrp->pg_session != curp->p_session) return (EPERM); return (enterpgrp(targp, uap->pgid, 0)); } @@ -369,6 +370,7 @@ int error; uid = uap->uid; + AUDIT_ARG(uid, uid, 0, 0, 0); if (uid != pc->p_ruid && (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); @@ -407,6 +409,7 @@ int error; euid = uap->euid; + AUDIT_ARG(uid, 0, euid, 0, 0); if (euid != pc->p_ruid && euid != pc->p_svuid && (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); @@ -437,6 +440,7 @@ int error; gid = uap->gid; + AUDIT_ARG(gid, gid, 0, 0, 0); if (gid != pc->p_rgid && (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); pcred_writelock(p); @@ -464,6 +468,7 @@ int error; egid = uap->egid; + AUDIT_ARG(gid, 0, egid, 0, 0); if (egid != pc->p_rgid && egid != pc->p_svgid && (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); @@ -495,16 +500,23 @@ if (error = suser(pc->pc_ucred, &p->p_acflag)) return (error); ngrp = uap->gidsetsize; - if (ngrp < 1 || ngrp > NGROUPS) + if (ngrp > NGROUPS) return (EINVAL); new = crget(); - error = copyin((caddr_t)uap->gidset, - (caddr_t)new->cr_groups, ngrp * 
sizeof(gid_t)); - if (error) { - crfree(new); - return (error); + + if ( ngrp < 1 ) { + ngrp = 1; + } + else { + error = copyin((caddr_t)uap->gidset, + (caddr_t)new->cr_groups, ngrp * sizeof(gid_t)); + if (error) { + crfree(new); + return (error); + } } new->cr_ngroups = ngrp; + AUDIT_ARG(groupset, new->cr_groups, ngrp); pcred_writelock(p); old = pc->pc_ucred; new->cr_uid = old->cr_uid; @@ -723,6 +735,32 @@ } /* + * compare two cred structs + */ +int +crcmp(cr1, cr2) + struct ucred *cr1; + struct ucred *cr2; +{ + int i; + + if (cr1 == cr2) + return 0; + if (cr1 == NOCRED || cr1 == FSCRED || + cr2 == NOCRED || cr2 == FSCRED) + return 1; + if (cr1->cr_uid != cr2->cr_uid) + return 1; + if (cr1->cr_ngroups != cr2->cr_ngroups) + return 1; + // XXX assumes groups will always be listed in some order + for (i=0; i < cr1->cr_ngroups; i++) + if (cr1->cr_groups[i] != cr2->cr_groups[i]) + return 1; + return (0); +} + +/* * Get login name, if available. */ struct getlogin_args { @@ -774,13 +812,40 @@ set_security_token(struct proc * p) { security_token_t sec_token; + audit_token_t audit_token; sec_token.val[0] = p->p_ucred->cr_uid; sec_token.val[1] = p->p_ucred->cr_gid; + audit_token.val[0] = p->p_au->ai_auid; + audit_token.val[1] = p->p_au->ai_asid; + /* use au_tid for now, until au_tid_addr is put to use */ + audit_token.val[2] = p->p_au->ai_termid.port; + audit_token.val[3] = p->p_au->ai_termid.machine; + audit_token.val[4] = 0; + audit_token.val[5] = 0; + audit_token.val[6] = 0; + audit_token.val[7] = 0; return host_security_set_task_token(host_security_self(), p->task, sec_token, + audit_token, (sec_token.val[0]) ? - HOST_PRIV_NULL : + HOST_PRIV_NULL : host_priv_self()); +} + + +/* + * Fill in a struct xucred based on a struct ucred. 
+ */ +__private_extern__ +void +cru2x(struct ucred *cr, struct xucred *xcr) +{ + + bzero(xcr, sizeof(*xcr)); + xcr->cr_version = XUCRED_VERSION; + xcr->cr_uid = cr->cr_uid; + xcr->cr_ngroups = cr->cr_ngroups; + bcopy(cr->cr_groups, xcr->cr_groups, sizeof(xcr->cr_groups)); } diff -urN xnu-344.49/bsd/kern/kern_resource.c xnu-517/bsd/kern/kern_resource.c --- xnu-344.49/bsd/kern/kern_resource.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_resource.c Sat Oct 25 00:25:25 2003 @@ -65,6 +65,7 @@ #include #include +#include #include #include #include @@ -89,6 +90,22 @@ rlim_t maxsmap = MAXSSIZ; /* XXX */ /* + * Limits on the number of open files per process, and the number + * of child processes per process. + * + * Note: would be in kern/subr_param.c in FreeBSD. + */ +int maxprocperuid = CHILD_MAX; /* max # of procs per user */ +int maxfilesperproc = OPEN_MAX; /* per-proc open files limit */ + +SYSCTL_INT( _kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, + &maxprocperuid, 0, "Maximum processes allowed per userid" ); + +SYSCTL_INT( _kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, + &maxfilesperproc, 0, "Maximum files allowed open per process" ); + + +/* * Resource controls and accounting. 
*/ struct getpriority_args { @@ -353,14 +370,14 @@ if (limp->rlim_cur > alimp->rlim_cur) { /* grow stack */ - size = round_page(limp->rlim_cur); - size -= round_page(alimp->rlim_cur); + size = round_page_64(limp->rlim_cur); + size -= round_page_64(alimp->rlim_cur); #if STACK_GROWTH_UP /* go to top of current stack */ - addr = trunc_page(p->user_stack + alimp->rlim_cur); + addr = trunc_page((unsigned int)(p->user_stack + alimp->rlim_cur)); #else STACK_GROWTH_UP - addr = trunc_page(p->user_stack - alimp->rlim_cur); + addr = trunc_page_32((unsigned int)(p->user_stack - alimp->rlim_cur)); addr -= size; #endif /* STACK_GROWTH_UP */ if (vm_allocate(current_map(), @@ -373,39 +390,44 @@ break; case RLIMIT_NOFILE: - /* - * Only root can get the maxfiles limits, as it is systemwide resource - */ - if (is_suser()) { + /* + * Only root can set the maxfiles limits, as it is systemwide resource + */ + if ( is_suser() ) { if (limp->rlim_cur > maxfiles) limp->rlim_cur = maxfiles; if (limp->rlim_max > maxfiles) limp->rlim_max = maxfiles; - } else { - if (limp->rlim_cur > OPEN_MAX) - limp->rlim_cur = OPEN_MAX; - if (limp->rlim_max > OPEN_MAX) - limp->rlim_max = OPEN_MAX; + } + else { + if (limp->rlim_cur > maxfilesperproc) + limp->rlim_cur = maxfilesperproc; + if (limp->rlim_max > maxfilesperproc) + limp->rlim_max = maxfilesperproc; } break; case RLIMIT_NPROC: - /* - * Only root can get the maxproc limits, as it is systemwide resource - */ - if (is_suser()) { + /* + * Only root can set to the maxproc limits, as it is + * systemwide resource; all others are limited to + * maxprocperuid (presumably less than maxproc). 
+ */ + if ( is_suser() ) { if (limp->rlim_cur > maxproc) limp->rlim_cur = maxproc; if (limp->rlim_max > maxproc) limp->rlim_max = maxproc; - } else { - if (limp->rlim_cur > CHILD_MAX) - limp->rlim_cur = CHILD_MAX; - if (limp->rlim_max > CHILD_MAX) - limp->rlim_max = CHILD_MAX; + } + else { + if (limp->rlim_cur > maxprocperuid) + limp->rlim_cur = maxprocperuid; + if (limp->rlim_max > maxprocperuid) + limp->rlim_max = maxprocperuid; } break; - } + + } /* switch... */ *alimp = *limp; return (0); } @@ -460,8 +482,8 @@ ut.tv_usec = tinfo.user_time.microseconds; st.tv_sec = tinfo.system_time.seconds; st.tv_usec = tinfo.system_time.microseconds; - timeradd(&ut,up,up); - timeradd(&st,up,up); + timeradd(&ut, up, up); + timeradd(&st, sp, sp); task_ttimes_stuff = TASK_THREAD_TIMES_INFO_COUNT; task_info(task, TASK_THREAD_TIMES_INFO, @@ -471,8 +493,8 @@ ut.tv_usec = ttimesinfo.user_time.microseconds; st.tv_sec = ttimesinfo.system_time.seconds; st.tv_usec = ttimesinfo.system_time.microseconds; - timeradd(&ut,up,up); - timeradd(&st,up,up); + timeradd(&ut, up, up); + timeradd(&st, sp, sp); } } diff -urN xnu-344.49/bsd/kern/kern_shutdown.c xnu-517/bsd/kern/kern_shutdown.c --- xnu-344.49/bsd/kern/kern_shutdown.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_shutdown.c Sat Oct 25 00:25:25 2003 @@ -59,6 +59,7 @@ #include #include #include +#include int waittime = -1; @@ -94,6 +95,8 @@ /* handle live procs (deallocate their root and current directories). 
*/ proc_shutdown(); + audit_shutdown(); + sync(p, (void *)NULL, (int *)NULL); /* Release vnodes from the VM object cache */ @@ -208,6 +211,19 @@ if (TERM_catch == 0) break; } + if (TERM_catch) { + /* + * log the names of the unresponsive tasks + */ + + for (p = allproc.lh_first; p; p = p->p_list.le_next) { + if (((p->p_flag&P_SYSTEM) == 0) && (p->p_pptr->p_pid != 0) && (p != self)) { + if (p->p_sigcatch & sigmask(SIGTERM)) + printf("%s[%d]: didn't act on SIGTERM\n", p->p_comm, p->p_pid); + } + } + IOSleep(1000 * 5); + } /* * send a SIGKILL to all the procs still hanging around @@ -251,7 +267,7 @@ thread_block(THREAD_CONTINUE_NULL); } else { - p->exit_thread = current_thread(); + p->exit_thread = current_act(); printf("."); exit1(p, 1, (int *)NULL); } diff -urN xnu-344.49/bsd/kern/kern_sig.c xnu-517/bsd/kern/kern_sig.c --- xnu-344.49/bsd/kern/kern_sig.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_sig.c Sat Oct 25 00:25:25 2003 @@ -113,6 +113,13 @@ void psignal_uthread __P((thread_act_t, int)); kern_return_t do_bsdexception(int, int, int); +static int filt_sigattach(struct knote *kn); +static void filt_sigdetach(struct knote *kn); +static int filt_signal(struct knote *kn, long hint); + +struct filterops sig_filtops = + { 0, filt_sigattach, filt_sigdetach, filt_signal }; + #if SIGNAL_DEBUG void ram_printf __P((int)); int ram_debug=0; @@ -290,6 +297,8 @@ sa->sa_flags |= SA_SIGINFO; if (ps->ps_signodefer & bit) sa->sa_flags |= SA_NODEFER; + if (ps->ps_64regset & bit) + sa->sa_flags |= SA_64REGSET; if ((signum == SIGCHLD) && (p->p_flag & P_NOCLDSTOP)) sa->sa_flags |= SA_NOCLDSTOP; if ((signum == SIGCHLD) && (p->p_flag & P_NOCLDWAIT)) @@ -427,12 +436,16 @@ * Change setting atomically. 
*/ ps->ps_sigact[signum] = sa->sa_handler; - ps->ps_trampact[signum] = sa->sa_tramp; + ps->ps_trampact[signum] = (sig_t) sa->sa_tramp; ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask; if (sa->sa_flags & SA_SIGINFO) ps->ps_siginfo |= bit; else ps->ps_siginfo &= ~bit; + if (sa->sa_flags & SA_64REGSET) + ps->ps_64regset |= bit; + else + ps->ps_64regset &= ~bit; if ((sa->sa_flags & SA_RESTART) == 0) ps->ps_sigintr |= bit; else @@ -655,7 +668,6 @@ register int signum; int bit, error=0; - panic("osigvec: notsupp"); #if 0 signum = uap->signum; if (signum <= 0 || signum >= NSIG || @@ -684,6 +696,8 @@ sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ error = setsigvec(p, signum, (struct sigaction *)sv); } +#else +error = ENOSYS; #endif return (error); } @@ -814,8 +828,7 @@ } uth = (struct uthread *)get_bsdthread_info(target_act); - { void *tht = getshuttle_thread(target_act); -} + if (uth->uu_flag & UNO_SIGMASK) { error = ESRCH; goto out; @@ -1048,7 +1061,9 @@ psp->ps_sigstk.ss_flags = ss.ss_flags; return (0); } - if (ss.ss_size < MINSIGSTKSZ) +/* The older stacksize was 8K, enforce that one so no compat problems */ +#define OLDMINSIGSTKSZ 8*1024 + if (ss.ss_size < OLDMINSIGSTKSZ) return (ENOMEM); psp->ps_flags |= SAS_ALTSTACK; psp->ps_sigstk= ss; @@ -1073,8 +1088,16 @@ return (EINVAL); if (uap->pid > 0) { /* kill single process */ - if ((p = pfind(uap->pid)) == NULL) + if ((p = pfind(uap->pid)) == NULL) { + if ((p = pzfind(uap->pid)) != NULL) { + /* + * IEEE Std 1003.1-2001: return success + * when killing a zombie. 
+ */ + return (0); + } return (ESRCH); + } if (!cansignal(cp, pc, p, uap->signum)) return (EPERM); if (uap->signum) @@ -1376,12 +1399,11 @@ sigset_t mask = sigmask(signum); thread_act_t sig_thread_act; struct task * sig_task = p->task; - thread_t sig_thread; kern_return_t kret; if ((p->p_flag & P_INVFORK) && p->p_vforkact) { sig_thread_act = p->p_vforkact; - kret = check_actforsig(sig_task, sig_thread_act, &sig_thread, 1); + kret = check_actforsig(sig_task, sig_thread_act, 1); if (kret == KERN_SUCCESS) return(sig_thread_act); else @@ -1391,11 +1413,11 @@ TAILQ_FOREACH(uth, &p->p_uthlist, uu_list) { if(((uth->uu_flag & UNO_SIGMASK)== 0) && (((uth->uu_sigmask & mask) == 0) || (uth->uu_sigwait & mask))) { - if (check_actforsig(p->task, uth->uu_act, NULL, 1) == KERN_SUCCESS) + if (check_actforsig(p->task, uth->uu_act, 1) == KERN_SUCCESS) return(uth->uu_act); } } - if (get_signalact(p->task, &thr_act, NULL, 1) == KERN_SUCCESS) { + if (get_signalact(p->task, &thr_act, 1) == KERN_SUCCESS) { return(thr_act); } @@ -1424,10 +1446,7 @@ register int s, prop; register sig_t action; thread_act_t sig_thread_act; - thread_t sig_thread; register task_t sig_task; - register thread_t cur_thread; - thread_act_t cur_act; int mask; struct uthread *uth; kern_return_t kret; @@ -1459,6 +1478,10 @@ return; } + s = splhigh(); + KNOTE(&p->p_klist, NOTE_SIGNAL | signum); + splx(s); + /* * do not send signals to the process that has the thread * doing a reboot(). Not doing so will mark that thread aborted @@ -1477,7 +1500,7 @@ * Deliver the signal to the first thread in the task. This * allows single threaded applications which use signals to * be able to be linked with multithreaded libraries. We have - * an implicit reference to the current_thread, but need + * an implicit reference to the current thread, but need * an explicit one otherwise. The thread reference keeps * the corresponding task data structures around too. This * reference is released by thread_deallocate. 
@@ -1486,9 +1509,6 @@ if (((p->p_flag & P_TRACED) == 0) && (p->p_sigignore & mask)) goto psigout; - cur_thread = current_thread(); /* this is a shuttle */ - cur_act = current_act(); - /* If successful return with ast set */ sig_thread_act = get_signalthread(p, signum); @@ -1602,8 +1622,15 @@ * Wake up the thread, but don't un-suspend it * (except for SIGCONT). */ - if (prop & SA_CONT) - (void) task_resume(sig_task); + if (prop & SA_CONT) { + if (p->p_flag & P_TTYSLEEP) { + p->p_flag &= ~P_TTYSLEEP; + wakeup(&p->p_siglist); + } else { + (void) task_resume(sig_task); + } + p->p_stat = SRUN; + } goto run; } else { /* Default action - varies */ @@ -1726,10 +1753,7 @@ register int s, prop; register sig_t action; thread_act_t sig_thread_act; - thread_t sig_thread; register task_t sig_task; - register thread_t cur_thread; - thread_act_t cur_act; int mask; struct uthread *uth; kern_return_t kret; @@ -1772,7 +1796,7 @@ * Deliver the signal to the first thread in the task. This * allows single threaded applications which use signals to * be able to be linked with multithreaded libraries. We have - * an implicit reference to the current_thread, but need + * an implicit reference to the current thread, but need * an explicit one otherwise. The thread reference keeps * the corresponding task data structures around too. This * reference is released by thread_deallocate. 
@@ -1781,10 +1805,7 @@ if (((p->p_flag & P_TRACED) == 0) && (p->p_sigignore & mask)) goto puthout; - cur_thread = current_thread(); /* this is a shuttle */ - cur_act = current_act(); - - kret = check_actforsig(sig_task, sig_thread_act, &sig_thread, 1); + kret = check_actforsig(sig_task, sig_thread_act, 1); if (kret != KERN_SUCCESS) { error = EINVAL; @@ -2007,7 +2028,7 @@ sig_lock_to_exit( struct proc *p) { - thread_t self = current_thread(); + thread_t self = current_act(); p->exit_thread = self; (void) task_suspend(p->task); @@ -2017,7 +2038,7 @@ sig_try_locked( struct proc *p) { - thread_t self = current_thread(); + thread_t self = current_act(); while (p->sigwait || p->exit_thread) { if (p->exit_thread) { @@ -2025,7 +2046,7 @@ /* * Already exiting - no signals. */ - thread_abort(current_act()); + thread_abort(self); } return(0); } @@ -2064,14 +2085,12 @@ { register int signum, mask, prop, sigbits; task_t task = p->task; - thread_t cur_thread; thread_act_t cur_act; int s; struct uthread * ut; kern_return_t kret; struct proc *pp; - cur_thread = current_thread(); cur_act = current_act(); #if SIGNAL_DEBUG @@ -2133,6 +2152,7 @@ do_bsdexception(EXC_SOFTWARE, EXC_SOFT_SIGNAL, signum); signal_lock(p); } else { +// panic("Unsupportef gdb option \n");; pp->si_pid = p->p_pid; pp->si_status = p->p_xstat; pp->si_code = CLD_TRAPPED; @@ -2177,7 +2197,7 @@ * clear it, since sig_lock_to_exit will * wait. 
*/ - clear_wait(current_thread(), THREAD_INTERRUPTED); + clear_wait(current_act(), THREAD_INTERRUPTED); sig_lock_to_exit(p); /* * Since this thread will be resumed @@ -2194,7 +2214,7 @@ /* * We may have to quit */ - if (thread_should_abort(current_thread())) { + if (thread_should_abort(current_act())) { signal_unlock(p); return(0); } @@ -2314,14 +2334,12 @@ { register int signum, mask, prop, sigbits; task_t task = p->task; - thread_t cur_thread; thread_act_t cur_act; int s; struct uthread * ut; int retnum = 0; - cur_thread = current_thread(); cur_act = current_act(); ut = get_bsdthread_info(cur_act); @@ -2584,6 +2602,48 @@ /* NOTREACHED */ } + +static int +filt_sigattach(struct knote *kn) +{ + struct proc *p = current_proc(); + + kn->kn_ptr.p_proc = p; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + + /* XXX lock the proc here while adding to the list? */ + KNOTE_ATTACH(&p->p_klist, kn); + + return (0); +} + +static void +filt_sigdetach(struct knote *kn) +{ + struct proc *p = kn->kn_ptr.p_proc; + + KNOTE_DETACH(&p->p_klist, kn); +} + +/* + * signal knotes are shared with proc knotes, so we apply a mask to + * the hint in order to differentiate them from process hints. This + * could be avoided by using a signal-specific knote list, but probably + * isn't worth the trouble. 
+ */ +static int +filt_signal(struct knote *kn, long hint) +{ + + if (hint & NOTE_SIGNAL) { + hint &= ~NOTE_SIGNAL; + + if (kn->kn_id == hint) + kn->kn_data++; + } + return (kn->kn_data != 0); +} + void bsd_ast(thread_act_t thr_act) { @@ -2605,7 +2665,7 @@ p->p_flag &= ~P_OWEUPC; } - if (CHECK_SIGNALS(p, current_thread(), ut)) { + if (CHECK_SIGNALS(p, current_act(), ut)) { while (signum = issignal(p)) postsig(signum); } diff -urN xnu-344.49/bsd/kern/kern_subr.c xnu-517/bsd/kern/kern_subr.c --- xnu-344.49/bsd/kern/kern_subr.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_subr.c Sat Oct 25 00:25:25 2003 @@ -68,6 +68,7 @@ #include #include #include +#include #include @@ -82,6 +83,12 @@ register int n; register struct uio *uio; { + return uiomove64((addr64_t)((unsigned int)cp), n, uio); +} + +int +uiomove64(addr64_t cp, int n, struct uio *uio) +{ register struct iovec *iov; u_int cnt; int error = 0; @@ -110,22 +117,22 @@ if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, - cp, iov->iov_base, cnt, 0,0); + (int)cp, (int)iov->iov_base, cnt, 0,0); - error = copyout(cp, iov->iov_base, cnt); + error = copyout( CAST_DOWN(caddr_t, cp), iov->iov_base, cnt ); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, - cp, iov->iov_base, cnt, 0,0); + (int)cp, (int)iov->iov_base, cnt, 0,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, - iov->iov_base, cp, cnt, 0,0); + (int)iov->iov_base, (int)cp, cnt, 0,0); - error = copyin(iov->iov_base, cp, cnt); + error = copyin(iov->iov_base, CAST_DOWN(caddr_t, cp), cnt); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, - iov->iov_base, cp, cnt, 0,0); + (int)iov->iov_base, (int)cp, cnt, 0,0); } if (error) return (error); @@ -133,10 +140,10 @@ case UIO_SYSSPACE: if (uio->uio_rw == UIO_READ) - error = copywithin((caddr_t)cp, iov->iov_base, + error = copywithin(CAST_DOWN(caddr_t, cp), iov->iov_base, cnt); else - error = 
copywithin(iov->iov_base, (caddr_t)cp, + error = copywithin(iov->iov_base, CAST_DOWN(caddr_t, cp), cnt); break; @@ -144,23 +151,51 @@ if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, - cp, iov->iov_base, cnt, 1,0); + (int)cp, (int)iov->iov_base, cnt, 1,0); + + if (error = copypv((addr64_t)cp, (addr64_t)((unsigned int)iov->iov_base), cnt, cppvPsrc | cppvNoRefSrc)) /* Copy physical to virtual */ + error = EFAULT; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, + (int)cp, (int)iov->iov_base, cnt, 1,0); + } + else + { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, + (int)iov->iov_base, (int)cp, cnt, 1,0); - error = copyp2v(cp, iov->iov_base, cnt); + if (error = copypv((addr64_t)((unsigned int)iov->iov_base), (addr64_t)cp, cnt, cppvPsnk | cppvNoRefSrc | cppvNoModSnk)) /* Copy virtual to physical */ + error = EFAULT; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, + (int)iov->iov_base, (int)cp, cnt, 1,0); + } + if (error) + return (error); + break; + + case UIO_PHYS_SYSSPACE: + if (uio->uio_rw == UIO_READ) + { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, + (int)cp, (int)iov->iov_base, cnt, 2,0); + if (error = copypv((addr64_t)cp, (addr64_t)((unsigned int)iov->iov_base), cnt, cppvKmap | cppvPsrc | cppvNoRefSrc)) /* Copy physical to virtual */ + error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, - cp, iov->iov_base, cnt, 1,0); + (int)cp, (int)iov->iov_base, cnt, 2,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, - iov->iov_base, cp, cnt, 1,0); + (int)iov->iov_base, (int)cp, cnt, 2,0); - panic("copyv2p not implemented yet\n"); + if (error = copypv((addr64_t)((unsigned int)iov->iov_base), (addr64_t)cp, cnt, cppvKmap | cppvPsnk | cppvNoRefSrc | cppvNoModSnk)) /* Copy virtual to physical */ + error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 
DBG_UIO_COPYIN)) | DBG_FUNC_END, - iov->iov_base, cp, cnt, 1,0); + (int)iov->iov_base, (int)cp, cnt, 2,0); } if (error) return (error); diff -urN xnu-344.49/bsd/kern/kern_symfile.c xnu-517/bsd/kern/kern_symfile.c --- xnu-344.49/bsd/kern/kern_symfile.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_symfile.c Sat Oct 25 00:25:25 2003 @@ -110,15 +110,15 @@ // Dispose of unnecessary gumf, the booter doesn't need to load these rc_mh = IODTGetLoaderInfo("Kernel-__HEADER", (void **)&orig_mh, &orig_mhsize); - if (rc_mh && orig_mh) + if (rc_mh == 0 && orig_mh) IODTFreeLoaderInfo("Kernel-__HEADER", - (void *)orig_mh, round_page(orig_mhsize)); + (void *)orig_mh, round_page_32(orig_mhsize)); rc_sc = IODTGetLoaderInfo("Kernel-__SYMTAB", (void **) &orig_st, &orig_st_size); - if (rc_sc && orig_st) + if (rc_sc == 0 && orig_st) IODTFreeLoaderInfo("Kernel-__SYMTAB", - (void *)orig_st, round_page(orig_st_size)); + (void *)orig_st, round_page_32(orig_st_size)); if (pcred->p_svuid != pcred->p_ruid || pcred->p_svgid != pcred->p_rgid) goto out; @@ -207,7 +207,7 @@ mh->flags = orig_mh->flags; // Initialise the current file offset and addr - offset = round_page(header_size); + offset = round_page_32(header_size); addr = (caddr_t) const_text->addr; // Load address of __TEXT,__const /* @@ -220,7 +220,7 @@ sg->vmaddr = (unsigned long) addr; sg->vmsize = const_text->size; sg->fileoff = 0; - sg->filesize = const_text->size + round_page(header_size); + sg->filesize = const_text->size + round_page_32(header_size); sg->maxprot = 0; sg->initprot = 0; sg->flags = 0; @@ -237,7 +237,7 @@ const_text = se; } } - offset = round_page((vm_address_t) offset); + offset = round_page_32((vm_address_t) offset); // Now copy of the __DATA segment load command, the image need // not be stored to disk nobody needs it, yet! 
@@ -258,7 +258,7 @@ se->offset = offset; se->nreloc = 0; } - offset = round_page(offset); + offset = round_page_32(offset); /* diff -urN xnu-344.49/bsd/kern/kern_synch.c xnu-517/bsd/kern/kern_synch.c --- xnu-344.49/bsd/kern/kern_synch.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_synch.c Sat Oct 25 00:25:25 2003 @@ -61,21 +61,16 @@ _sleep_continue(void) { register struct proc *p; - register thread_t thread = current_thread(); - thread_act_t th_act; + register thread_t self = current_act(); struct uthread * ut; int sig, catch; int error = 0; - th_act = current_act(); - ut = get_bsdthread_info(th_act); + ut = get_bsdthread_info(self); catch = ut->uu_pri & PCATCH; p = current_proc(); -#if FIXME /* [ */ - thread->wait_mesg = NULL; -#endif /* FIXME ] */ - switch (get_thread_waitresult(thread)) { + switch (get_thread_waitresult(self)) { case THREAD_TIMED_OUT: error = EWOULDBLOCK; break; @@ -90,7 +85,7 @@ /* else fall through */ case THREAD_INTERRUPTED: if (catch) { - if (thread_should_abort(current_thread())) { + if (thread_should_abort(self)) { error = EINTR; } else if (SHOULDissignal(p,ut)) { if (sig = CURSIG(p)) { @@ -99,7 +94,7 @@ else error = ERESTART; } - if (thread_should_abort(current_thread())) { + if (thread_should_abort(self)) { error = EINTR; } } @@ -109,7 +104,7 @@ } if (error == EINTR || error == ERESTART) - act_set_astbsd(th_act); + act_set_astbsd(self); if (ut->uu_timo) thread_cancel_timer(); @@ -145,8 +140,7 @@ int (*continuation)(int)) { register struct proc *p; - register thread_t thread = current_thread(); - thread_act_t th_act; + register thread_t self = current_act(); struct uthread * ut; int sig, catch = pri & PCATCH; int sigttblock = pri & PTTYBLOCK; @@ -156,8 +150,7 @@ s = splhigh(); - th_act = current_act(); - ut = get_bsdthread_info(th_act); + ut = get_bsdthread_info(self); p = current_proc(); #if KTRACE @@ -166,11 +159,11 @@ #endif p->p_priority = pri & PRIMASK; - if (chan) - wait_result = assert_wait(chan, - (catch) ? 
THREAD_ABORTSAFE : THREAD_UNINT); - - if (abstime) + if (chan != NULL) + assert_wait_prim(chan, NULL, abstime, + (catch) ? THREAD_ABORTSAFE : THREAD_UNINT); + else + if (abstime != 0) thread_set_timer_deadline(abstime); /* @@ -185,7 +178,8 @@ if (catch) { if (SHOULDissignal(p,ut)) { if (sig = CURSIG(p)) { - clear_wait(thread, THREAD_INTERRUPTED); + if (clear_wait(self, THREAD_INTERRUPTED) == KERN_FAILURE) + goto block; /* if SIGTTOU or SIGTTIN then block till SIGCONT */ if (sigttblock && ((sig == SIGTTOU) || (sig == SIGTTIN))) { p->p_flag |= P_TTYSLEEP; @@ -206,24 +200,24 @@ goto out; } } - if (thread_should_abort(current_thread())) { - clear_wait(thread, THREAD_INTERRUPTED); + if (thread_should_abort(self)) { + if (clear_wait(self, THREAD_INTERRUPTED) == KERN_FAILURE) + goto block; error = EINTR; goto out; } - if (get_thread_waitresult(thread) != THREAD_WAITING) { + if (get_thread_waitresult(self) != THREAD_WAITING) { /*already happened */ goto out; } } -#if FIXME /* [ */ - thread->wait_mesg = wmsg; -#endif /* FIXME ] */ +block: + splx(s); p->p_stats->p_ru.ru_nvcsw++; - if (continuation != THREAD_CONTINUE_NULL ) { + if ((thread_continue_t)continuation != THREAD_CONTINUE_NULL ) { ut->uu_continuation = continuation; ut->uu_pri = pri; ut->uu_timo = abstime? 
1: 0; @@ -233,9 +227,6 @@ wait_result = thread_block(THREAD_CONTINUE_NULL); -#if FIXME /* [ */ - thread->wait_mesg = NULL; -#endif /* FIXME ] */ switch (wait_result) { case THREAD_TIMED_OUT: error = EWOULDBLOCK; @@ -251,7 +242,7 @@ /* else fall through */ case THREAD_INTERRUPTED: if (catch) { - if (thread_should_abort(current_thread())) { + if (thread_should_abort(self)) { error = EINTR; } else if (SHOULDissignal(p,ut)) { if (sig = CURSIG(p)) { @@ -260,7 +251,7 @@ else error = ERESTART; } - if (thread_should_abort(current_thread())) { + if (thread_should_abort(self)) { error = EINTR; } } @@ -270,7 +261,7 @@ } out: if (error == EINTR || error == ERESTART) - act_set_astbsd(th_act); + act_set_astbsd(self); if (abstime) thread_cancel_timer(); (void) splx(s); diff -urN xnu-344.49/bsd/kern/kern_sysctl.c xnu-517/bsd/kern/kern_sysctl.c --- xnu-344.49/bsd/kern/kern_sysctl.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_sysctl.c Sat Oct 25 00:25:25 2003 @@ -75,11 +75,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -95,9 +97,7 @@ #include #include -#if __ppc__ -#include -#endif +#include sysctlfn kern_sysctl; #ifdef DEBUG @@ -107,18 +107,35 @@ extern sysctlfn vfs_sysctl; extern sysctlfn net_sysctl; extern sysctlfn cpu_sysctl; +extern int aio_max_requests; +extern int aio_max_requests_per_process; +extern int aio_worker_threads; +extern int maxprocperuid; +extern int maxfilesperproc; int userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval); -void -fill_proc(struct proc *p,struct kinfo_proc *kp, int doingzomb); - -void -fill_externproc(struct proc *p, struct extern_proc *exp); - +static int +sysctl_aiomax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +static int +sysctl_aioprocmax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +static int +sysctl_aiothreads( void *oldp, size_t 
*oldlenp, void *newp, size_t newlen ); +static void +fill_proc(struct proc *p, struct kinfo_proc *kp); +static int +sysctl_maxfilesperproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +static int +sysctl_maxprocperuid( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +static int +sysctl_maxproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen ); +static int +sysctl_procargs2( int *name, u_int namelen, char *where, size_t *sizep, struct proc *cur_proc); +static int +sysctl_procargsx( int *name, u_int namelen, char *where, size_t *sizep, struct proc *cur_proc, int argc_yes); /* @@ -308,6 +325,10 @@ extern int hostnamelen; extern char domainname[MAXHOSTNAMELEN]; extern int domainnamelen; +extern char classichandler[32]; +extern long classichandler_fsid; +extern long classichandler_fileid; + extern long hostid; #ifdef INSECURE int securelevel = -1; @@ -315,6 +336,124 @@ int securelevel; #endif +static int +sysctl_affinity(name, namelen, oldBuf, oldSize, newBuf, newSize, cur_proc) + int *name; + u_int namelen; + char *oldBuf; + size_t *oldSize; + char *newBuf; + size_t newSize; + struct proc *cur_proc; +{ + if (namelen < 1) + return (EOPNOTSUPP); + + if (name[0] == 0 && 1 == namelen) { + return sysctl_rdint(oldBuf, oldSize, newBuf, + (cur_proc->p_flag & P_AFFINITY) ? 
1 : 0); + } else if (name[0] == 1 && 2 == namelen) { + if (name[1] == 0) { + cur_proc->p_flag &= ~P_AFFINITY; + } else { + cur_proc->p_flag |= P_AFFINITY; + } + return 0; + } + return (EOPNOTSUPP); +} + +static int +sysctl_classic(name, namelen, oldBuf, oldSize, newBuf, newSize, cur_proc) + int *name; + u_int namelen; + char *oldBuf; + size_t *oldSize; + char *newBuf; + size_t newSize; + struct proc *cur_proc; +{ + int newVal; + int err; + struct proc *p; + + if (namelen != 1) + return (EOPNOTSUPP); + + p = pfind(name[0]); + if (p == NULL) + return (EINVAL); + + if ((p->p_ucred->cr_uid != cur_proc->p_ucred->cr_uid) + && suser(cur_proc->p_ucred, &cur_proc->p_acflag)) + return (EPERM); + + return sysctl_rdint(oldBuf, oldSize, newBuf, + (p->p_flag & P_CLASSIC) ? 1 : 0); +} + +static int +sysctl_classichandler(name, namelen, oldBuf, oldSize, newBuf, newSize, p) + int *name; + u_int namelen; + char *oldBuf; + size_t *oldSize; + char *newBuf; + size_t newSize; + struct proc *p; +{ + int error; + int len; + struct nameidata nd; + struct vattr vattr; + char handler[sizeof(classichandler)]; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + len = strlen(classichandler) + 1; + if (oldBuf && *oldSize < len) + return (ENOMEM); + if (newBuf && newSize >= sizeof(classichandler)) + return (ENAMETOOLONG); + *oldSize = len - 1; + if (newBuf) { + error = copyin(newBuf, handler, newSize); + if (error) + return (error); + handler[newSize] = 0; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + handler, p); + error = namei(&nd); + if (error) + return (error); + /* Check mount point */ + if ((nd.ni_vp->v_mount->mnt_flag & MNT_NOEXEC) || + (nd.ni_vp->v_type != VREG)) { + vput(nd.ni_vp); + return (EACCES); + } + error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p); + if (error) { + vput(nd.ni_vp); + return (error); + } + classichandler_fsid = vattr.va_fsid; + classichandler_fileid = vattr.va_fileid; + vput(nd.ni_vp); + } + if (oldBuf) { + error = 
copyout(classichandler, oldBuf, len); + if (error) + return (error); + } + if (newBuf) { + strcpy(classichandler, handler); + } + return (error); +} + + extern int get_kernel_symfile( struct proc *, char **); extern int sysctl_dopanicinfo(int *, u_int, void *, size_t *, void *, size_t, struct proc *); @@ -344,9 +483,12 @@ || name[0] == KERN_PROF || name[0] == KERN_KDEBUG || name[0] == KERN_PROCARGS + || name[0] == KERN_PROCARGS2 || name[0] == KERN_PCSAMPLES || name[0] == KERN_IPC || name[0] == KERN_SYSV + || name[0] == KERN_AFFINITY + || name[0] == KERN_CLASSIC || name[0] == KERN_PANICINFO) ) return (ENOTDIR); /* overloaded */ @@ -365,11 +507,16 @@ error = sysctl_int(oldp, oldlenp, newp, newlen, &desiredvnodes); reset_vmobjectcache(oldval, desiredvnodes); + resize_namecache(desiredvnodes); return(error); case KERN_MAXPROC: - return (sysctl_int(oldp, oldlenp, newp, newlen, &maxproc)); + return (sysctl_maxproc(oldp, oldlenp, newp, newlen)); case KERN_MAXFILES: return (sysctl_int(oldp, oldlenp, newp, newlen, &maxfiles)); + case KERN_MAXPROCPERUID: + return( sysctl_maxprocperuid( oldp, oldlenp, newp, newlen ) ); + case KERN_MAXFILESPERPROC: + return( sysctl_maxfilesperproc( oldp, oldlenp, newp, newlen ) ); case KERN_ARGMAX: return (sysctl_rdint(oldp, oldlenp, newp, ARG_MAX)); case KERN_SECURELVL: @@ -433,6 +580,9 @@ case KERN_PROCARGS: /* new one as it does not use kinfo_proc */ return (sysctl_procargs(name + 1, namelen - 1, oldp, oldlenp, p)); + case KERN_PROCARGS2: + /* new one as it does not use kinfo_proc */ + return (sysctl_procargs2(name + 1, namelen - 1, oldp, oldlenp, p)); case KERN_SYMFILE: error = get_kernel_symfile( p, &str ); if ( error ) @@ -443,6 +593,21 @@ case KERN_PANICINFO: return(sysctl_dopanicinfo(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p)); + case KERN_AFFINITY: + return sysctl_affinity(name+1, namelen-1, oldp, oldlenp, + newp, newlen, p); + case KERN_CLASSIC: + return sysctl_classic(name+1, namelen-1, oldp, oldlenp, + newp, newlen, p); 
+ case KERN_CLASSICHANDLER: + return sysctl_classichandler(name+1, namelen-1, oldp, oldlenp, + newp, newlen, p); + case KERN_AIOMAX: + return( sysctl_aiomax( oldp, oldlenp, newp, newlen ) ); + case KERN_AIOPROCMAX: + return( sysctl_aioprocmax( oldp, oldlenp, newp, newlen ) ); + case KERN_AIOTHREADS: + return( sysctl_aiothreads( oldp, oldlenp, newp, newlen ) ); default: return (EOPNOTSUPP); } @@ -798,25 +963,28 @@ break; case KERN_PROC_TTY: - if ( doingzomb || (p->p_flag & P_CONTROLT) == 0 || + if ((p->p_flag & P_CONTROLT) == 0 || + (p->p_session == NULL) || p->p_session->s_ttyp == NULL || p->p_session->s_ttyp->t_dev != (dev_t)name[1]) continue; break; case KERN_PROC_UID: - if (doingzomb || (p->p_ucred->cr_uid != (uid_t)name[1])) + if ((p->p_ucred == NULL) || + (p->p_ucred->cr_uid != (uid_t)name[1])) continue; break; case KERN_PROC_RUID: - if ( doingzomb || (p->p_cred->p_ruid != (uid_t)name[1])) + if ((p->p_ucred == NULL) || + (p->p_cred->p_ruid != (uid_t)name[1])) continue; break; } if (buflen >= sizeof(struct kinfo_proc)) { bzero(&kproc, sizeof(struct kinfo_proc)); - fill_proc(p, &kproc, doingzomb); + fill_proc(p, &kproc); if (error = copyout((caddr_t)&kproc, &dp->kp_proc, sizeof(struct kinfo_proc))) return (error); @@ -841,56 +1009,49 @@ return (0); } -void -fill_proc(p,kp, doingzomb) - register struct proc *p; - register struct kinfo_proc *kp; - int doingzomb; -{ - fill_externproc(p, &kp->kp_proc); - if (!doingzomb) - fill_eproc(p, &kp->kp_eproc); -} /* * Fill in an eproc structure for the specified process. */ -void +static void fill_eproc(p, ep) register struct proc *p; register struct eproc *ep; { register struct tty *tp; - /* - * Skip zombie processes. 
- */ - if (p->p_stat == SZOMB) - return; - ep->e_paddr = p; - ep->e_sess = p->p_pgrp->pg_session; - ep->e_pcred = *p->p_cred; - ep->e_ucred = *p->p_ucred; + if (p->p_pgrp) { + ep->e_sess = p->p_pgrp->pg_session; + ep->e_pgid = p->p_pgrp->pg_id; + ep->e_jobc = p->p_pgrp->pg_jobc; + if (ep->e_sess && ep->e_sess->s_ttyvp) + ep->e_flag = EPROC_CTTY; + } else { + ep->e_sess = (struct session *)0; + ep->e_pgid = 0; + ep->e_jobc = 0; + } + ep->e_ppid = (p->p_pptr) ? p->p_pptr->p_pid : 0; + if (p->p_cred) { + ep->e_pcred = *p->p_cred; + if (p->p_ucred) + ep->e_ucred = *p->p_ucred; + } if (p->p_stat == SIDL || p->p_stat == SZOMB) { ep->e_vm.vm_tsize = 0; ep->e_vm.vm_dsize = 0; ep->e_vm.vm_ssize = 0; } ep->e_vm.vm_rssize = 0; - if (p->p_pptr) - ep->e_ppid = p->p_pptr->p_pid; - else - ep->e_ppid = 0; - ep->e_pgid = p->p_pgrp->pg_id; - ep->e_jobc = p->p_pgrp->pg_jobc; - if ((p->p_flag & P_CONTROLT) && + + if ((p->p_flag & P_CONTROLT) && (ep->e_sess) && (tp = ep->e_sess->s_ttyp)) { ep->e_tdev = tp->t_dev; ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; ep->e_tsess = tp->t_session; } else ep->e_tdev = NODEV; - ep->e_flag = ep->e_sess->s_ttyvp ? EPROC_CTTY : 0; + if (SESS_LEADER(p)) ep->e_flag |= EPROC_SLEADER; if (p->p_wmesg) @@ -898,10 +1059,11 @@ ep->e_xsize = ep->e_xrssize = 0; ep->e_xccount = ep->e_xswrss = 0; } + /* * Fill in an eproc structure for the specified process. */ -void +static void fill_externproc(p, exp) register struct proc *p; register struct extern_proc *exp; @@ -954,6 +1116,15 @@ exp->p_ru = p->p_ru ; } +static void +fill_proc(p, kp) + register struct proc *p; + register struct kinfo_proc *kp; +{ + fill_externproc(p, &kp->kp_proc); + fill_eproc(p, &kp->kp_eproc); +} + int kdebug_ops(name, namelen, where, sizep, p) int *name; @@ -1029,10 +1200,8 @@ } /* - * Returns the top N bytes of the user stack, with - * everything below the first argument character - * zeroed for security reasons. - * Odd data structure is for compatibility. 
+ * Return the top *sizep bytes of the user stack, or the entire area of the + * user stack down through the saved exec_path, whichever is smaller. */ int sysctl_procargs(name, namelen, where, sizep, cur_proc) @@ -1042,6 +1211,29 @@ size_t *sizep; struct proc *cur_proc; { + return sysctl_procargsx( name, namelen, where, sizep, cur_proc, 0); +} + +static int +sysctl_procargs2(name, namelen, where, sizep, cur_proc) + int *name; + u_int namelen; + char *where; + size_t *sizep; + struct proc *cur_proc; +{ + return sysctl_procargsx( name, namelen, where, sizep, cur_proc, 1); +} + +static int +sysctl_procargsx(name, namelen, where, sizep, cur_proc, argc_yes) + int *name; + u_int namelen; + char *where; + size_t *sizep; + struct proc *cur_proc; + int argc_yes; +{ register struct proc *p; register int needed = 0; int buflen = where != NULL ? *sizep : 0; @@ -1054,14 +1246,14 @@ caddr_t data; unsigned size; vm_offset_t copy_start, copy_end; - vm_offset_t dealloc_start; /* area to remove from kernel map */ - vm_offset_t dealloc_end; int *ip; kern_return_t ret; int pid; + if (argc_yes) + buflen -= NBPW; /* reserve first word to return argc */ - if ((buflen <= 0) || (buflen > (PAGE_SIZE << 1))) { + if ((buflen <= 0) || (buflen > ARG_MAX)) { return(EINVAL); } arg_size = buflen; @@ -1116,20 +1308,20 @@ goto restart; } - ret = kmem_alloc(kernel_map, &copy_start, round_page(arg_size)); + ret = kmem_alloc(kernel_map, &copy_start, round_page_32(arg_size)); if (ret != KERN_SUCCESS) { task_deallocate(task); return(ENOMEM); } proc_map = get_task_map(task); - copy_end = round_page(copy_start + arg_size); + copy_end = round_page_32(copy_start + arg_size); - if( vm_map_copyin(proc_map, trunc_page(arg_addr), round_page(arg_size), + if( vm_map_copyin(proc_map, trunc_page(arg_addr), round_page_32(arg_size), FALSE, &tmp) != KERN_SUCCESS) { task_deallocate(task); kmem_free(kernel_map, copy_start, - round_page(arg_size)); + round_page_32(arg_size)); return (EIO); } @@ -1142,61 +1334,94 @@ if( 
vm_map_copy_overwrite(kernel_map, copy_start, tmp, FALSE) != KERN_SUCCESS) { kmem_free(kernel_map, copy_start, - round_page(arg_size)); + round_page_32(arg_size)); return (EIO); } data = (caddr_t) (copy_end - arg_size); - ip = (int *) copy_end; - size = arg_size; - /* - * Now look down the stack for the bottom of the - * argument list. Since this call is otherwise - * unprotected, we can't let the nosy user see - * anything else on the stack. - * - * The arguments are pushed on the stack by - * execve() as: - * - * .long 0 - * arg 0 (null-terminated) - * arg 1 - * ... - * arg N - * .long 0 - * - */ + if (buflen > p->p_argslen) { + data = &data[buflen - p->p_argslen]; + size = p->p_argslen; + } else { + size = buflen; + } - ip -= 2; /*skip trailing 0 word and assume at least one - argument. The last word of argN may be just - the trailing 0, in which case we'd stop - there */ - while (*--ip) - if (ip == (int *)data) - break; - /* - * To account for saved path name and not having a null after that - * Run the sweep again. If we have already sweeped entire range skip this - */ - if (ip != (int *)data) { - while (*--ip) - if (ip == (int *)data) - break; - } - - bzero(data, (unsigned) ((int)ip - (int)data)); - - dealloc_start = copy_start; - dealloc_end = copy_end; - - - size = MIN(size, buflen); - error = copyout(data, where, size); - - if (dealloc_start != (vm_offset_t) 0) { - kmem_free(kernel_map, dealloc_start, - dealloc_end - dealloc_start); + if (argc_yes) { + /* Put processes argc as the first word in the copyout buffer */ + suword(where, p->p_argc); + error = copyout(data, where + NBPW, size); + } else { + error = copyout(data, where, size); + + /* + * Make the old PROCARGS work to return the executable's path + * But, only if there is enough space in the provided buffer + * + * on entry: data [possibily] points to the beginning of the path + * + * Note: we keep all pointers&sizes aligned to word boundries + */ + + if ( (! 
error) && (buflen > p->p_argslen) ) + { + int binPath_sz; + int extraSpaceNeeded, addThis; + char * placeHere; + char * str = (char *) data; + unsigned int max_len = size; + + /* Some apps are really bad about messing up their stacks + So, we have to be extra careful about getting the length + of the executing binary. If we encounter an error, we bail. + */ + + /* Limit ourselves to PATH_MAX paths */ + if ( max_len > PATH_MAX ) max_len = PATH_MAX; + + binPath_sz = 0; + + while ( (binPath_sz < max_len-1) && (*str++ != 0) ) + binPath_sz++; + + if (binPath_sz < max_len-1) binPath_sz += 1; + + /* Pre-Flight the space requiremnts */ + + /* Account for the padding that fills out binPath to the next word */ + binPath_sz += (binPath_sz & (NBPW-1)) ? (NBPW-(binPath_sz & (NBPW-1))) : 0; + + placeHere = where + size; + + /* Account for the bytes needed to keep placeHere word aligned */ + addThis = ((unsigned long)placeHere & (NBPW-1)) ? (NBPW-((unsigned long)placeHere & (NBPW-1))) : 0; + + /* Add up all the space that is needed */ + extraSpaceNeeded = binPath_sz + addThis + (4 * NBPW); + + /* is there is room to tack on argv[0]? */ + if ( (buflen & ~(NBPW-1)) >= ( p->p_argslen + extraSpaceNeeded )) + { + placeHere += addThis; + suword(placeHere, 0); + placeHere += NBPW; + suword(placeHere, 0xBFFF0000); + placeHere += NBPW; + suword(placeHere, 0); + placeHere += NBPW; + error = copyout(data, placeHere, binPath_sz); + if ( ! error ) + { + placeHere += binPath_sz; + suword(placeHere, 0); + size += extraSpaceNeeded; + } + } + } + } + + if (copy_start != (vm_offset_t) 0) { + kmem_free(kernel_map, copy_start, copy_end - copy_start); } if (error) { return(error); @@ -1206,3 +1431,197 @@ *sizep = size; return (0); } + + +/* + * Validate parameters and get old / set new parameters + * for max number of concurrent aio requests. Makes sure + * the system wide limit is greater than the per process + * limit. 
+ */ +static int +sysctl_aiomax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +{ + int error = 0; + int new_value; + + if ( oldp && *oldlenp < sizeof(int) ) + return (ENOMEM); + if ( newp && newlen != sizeof(int) ) + return (EINVAL); + + *oldlenp = sizeof(int); + if ( oldp ) + error = copyout( &aio_max_requests, oldp, sizeof(int) ); + if ( error == 0 && newp ) + error = copyin( newp, &new_value, sizeof(int) ); + if ( error == 0 && newp ) { + if ( new_value >= aio_max_requests_per_process ) + aio_max_requests = new_value; + else + error = EINVAL; + } + return( error ); + +} /* sysctl_aiomax */ + + +/* + * Validate parameters and get old / set new parameters + * for max number of concurrent aio requests per process. + * Makes sure per process limit is less than the system wide + * limit. + */ +static int +sysctl_aioprocmax( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +{ + int error = 0; + int new_value = 0; + + if ( oldp && *oldlenp < sizeof(int) ) + return (ENOMEM); + if ( newp && newlen != sizeof(int) ) + return (EINVAL); + + *oldlenp = sizeof(int); + if ( oldp ) + error = copyout( &aio_max_requests_per_process, oldp, sizeof(int) ); + if ( error == 0 && newp ) + error = copyin( newp, &new_value, sizeof(int) ); + if ( error == 0 && newp ) { + if ( new_value <= aio_max_requests && new_value >= AIO_LISTIO_MAX ) + aio_max_requests_per_process = new_value; + else + error = EINVAL; + } + return( error ); + +} /* sysctl_aioprocmax */ + + +/* + * Validate parameters and get old / set new parameters + * for max number of async IO worker threads. + * We only allow an increase in the number of worker threads. 
+ */ +static int +sysctl_aiothreads( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +{ + int error = 0; + int new_value; + + if ( oldp && *oldlenp < sizeof(int) ) + return (ENOMEM); + if ( newp && newlen != sizeof(int) ) + return (EINVAL); + + *oldlenp = sizeof(int); + if ( oldp ) + error = copyout( &aio_worker_threads, oldp, sizeof(int) ); + if ( error == 0 && newp ) + error = copyin( newp, &new_value, sizeof(int) ); + if ( error == 0 && newp ) { + if (new_value > aio_worker_threads ) { + _aio_create_worker_threads( (new_value - aio_worker_threads) ); + aio_worker_threads = new_value; + } + else + error = EINVAL; + } + return( error ); + +} /* sysctl_aiothreads */ + + +/* + * Validate parameters and get old / set new parameters + * for max number of processes per UID. + * Makes sure per UID limit is less than the system wide limit. + */ +static int +sysctl_maxprocperuid( void *oldp, size_t *oldlenp, void *newp, size_t newlen ) +{ + int error = 0; + int new_value; + + if ( oldp != NULL && *oldlenp < sizeof(int) ) + return (ENOMEM); + if ( newp != NULL && newlen != sizeof(int) ) + return (EINVAL); + + *oldlenp = sizeof(int); + if ( oldp != NULL ) + error = copyout( &maxprocperuid, oldp, sizeof(int) ); + if ( error == 0 && newp != NULL ) { + error = copyin( newp, &new_value, sizeof(int) ); + if ( error == 0 && new_value <= maxproc && new_value > 0 ) + maxprocperuid = new_value; + else + error = EINVAL; + } + return( error ); + +} /* sysctl_maxprocperuid */ + + +/* + * Validate parameters and get old / set new parameters + * for max number of files per process. + * Makes sure per process limit is less than the system-wide limit. 
+ */
+static int
+sysctl_maxfilesperproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen )
+{
+	int 	error = 0;
+	int		new_value;
+
+	/* old value buffer must hold an int; new value must be exactly an int */
+	if ( oldp != NULL && *oldlenp < sizeof(int) )
+		return (ENOMEM);
+	if ( newp != NULL && newlen != sizeof(int) )
+		return (EINVAL);
+		
+	*oldlenp = sizeof(int);
+	if ( oldp != NULL )
+		error = copyout( &maxfilesperproc, oldp, sizeof(int) );
+	if ( error == 0 && newp != NULL ) {
+		error = copyin( newp, &new_value, sizeof(int) );
+		/*
+		 * Range check only a value that was actually copied in, so a
+		 * copyin failure is returned as is rather than being replaced
+		 * by EINVAL.
+		 */
+		if ( error == 0 ) {
+			if ( new_value < maxfiles && new_value > 0 )
+				maxfilesperproc = new_value;
+			else
+				error = EINVAL;
+		}
+	}
+	return( error );
+	
+} /* sysctl_maxfilesperproc */
+
+
+/*
+ * Validate parameters and get old / set new parameters
+ * for the system-wide limit on the max number of processes.
+ * Makes sure the system-wide limit is positive and does not exceed
+ * the configured hard limit (hard_maxproc).
+ * Returns ENOMEM when the old value buffer is smaller than an int,
+ * EINVAL when the new value is not int sized or is out of range,
+ * and otherwise whatever copyout / copyin report (0 on success).
+ */
+static int
+sysctl_maxproc( void *oldp, size_t *oldlenp, void *newp, size_t newlen )
+{
+	int 	error = 0;
+	int	new_value;
+
+	if ( oldp != NULL && *oldlenp < sizeof(int) )
+		return (ENOMEM);
+	if ( newp != NULL && newlen != sizeof(int) )
+		return (EINVAL);
+		
+	*oldlenp = sizeof(int);
+	if ( oldp != NULL )
+		error = copyout( &maxproc, oldp, sizeof(int) );
+	if ( error == 0 && newp != NULL ) {
+		error = copyin( newp, &new_value, sizeof(int) );
+		/*
+		 * Range check only a value that was actually copied in, so a
+		 * copyin failure is returned as is rather than being replaced
+		 * by EINVAL.
+		 */
+		if ( error == 0 ) {
+			if ( new_value <= hard_maxproc && new_value > 0 )
+				maxproc = new_value;
+			else
+				error = EINVAL;
+		}
+	}
+	return( error );
+	
+} /* sysctl_maxproc */
diff -urN xnu-344.49/bsd/kern/kern_time.c xnu-517/bsd/kern/kern_time.c --- xnu-344.49/bsd/kern/kern_time.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_time.c Sat Oct 25 00:25:25 2003 @@ -102,7 +102,7 @@ /* NOTE THIS implementation is for non ppc architectures only */ if (uap->tp) { - microtime(&atv); + clock_get_calendar_microtime(&atv.tv_sec, &atv.tv_usec); if (error = copyout((caddr_t)&atv, (caddr_t)uap->tp, sizeof (atv))) return(error); @@ -158,21 +158,14 @@ struct timeval *tv; { long delta = 
tv->tv_sec - time.tv_sec; - mach_timespec_t now; - now.tv_sec = tv->tv_sec; - now.tv_nsec = tv->tv_usec * NSEC_PER_USEC; - - clock_set_calendar_value(now); + clock_set_calendar_microtime(tv->tv_sec, tv->tv_usec); boottime.tv_sec += delta; #if NFSCLIENT || NFSSERVER lease_updatetime(delta); #endif } -#define tickadj (40 * NSEC_PER_USEC) /* "standard" skew, ns / 10 ms */ -#define bigadj (1 * NSEC_PER_SEC) /* use 10x skew above bigadj ns */ - struct adjtime_args { struct timeval *delta; struct timeval *olddelta; @@ -185,8 +178,6 @@ register_t *retval; { struct timeval atv; - int64_t total; - uint32_t delta; int error; if (error = suser(p->p_ucred, &p->p_acflag)) @@ -198,17 +189,9 @@ /* * Compute the total correction and the rate at which to apply it. */ - total = (int64_t)atv.tv_sec * NSEC_PER_SEC + atv.tv_usec * NSEC_PER_USEC; - if (total > bigadj || total < -bigadj) - delta = 10 * tickadj; - else - delta = tickadj; - - total = clock_set_calendar_adjtime(total, delta); + clock_adjtime(&atv.tv_sec, &atv.tv_usec); if (uap->olddelta) { - atv.tv_sec = total / NSEC_PER_SEC; - atv.tv_usec = (total / NSEC_PER_USEC) % USEC_PER_SEC; (void) copyout((caddr_t)&atv, (caddr_t)uap->olddelta, sizeof (struct timeval)); } @@ -226,6 +209,8 @@ inittodr(base) time_t base; { + struct timeval tv; + /* * Assertion: * The calendar has already been @@ -234,21 +219,17 @@ * The value returned by microtime() * is gotten from the calendar. */ - microtime(&time); + microtime(&tv); - /* - * This variable still exists to keep - * 'w' happy. It should only be considered - * an approximation. - */ - boottime.tv_sec = time.tv_sec; + time = tv; + boottime.tv_sec = tv.tv_sec; boottime.tv_usec = 0; /* * If the RTC does not have acceptable value, i.e. 
time before * the UNIX epoch, set it to the UNIX epoch */ - if (time.tv_sec < 0) { + if (tv.tv_sec < 0) { printf ("WARNING: preposterous time in Real Time Clock"); time.tv_sec = 0; /* the UNIX epoch */ time.tv_usec = 0; @@ -430,10 +411,10 @@ } } - thread_call_func_delayed(realitexpire, pid, tvtoabstime(&p->p_rtime)); - psignal(p, SIGALRM); + thread_call_func_delayed(realitexpire, pid, tvtoabstime(&p->p_rtime)); + (void) thread_funnel_set(kernel_flock, FALSE); } @@ -549,20 +530,14 @@ microtime( struct timeval *tvp) { - mach_timespec_t now = clock_get_calendar_value(); - - tvp->tv_sec = now.tv_sec; - tvp->tv_usec = now.tv_nsec / NSEC_PER_USEC; + clock_get_calendar_microtime(&tvp->tv_sec, &tvp->tv_usec); } void microuptime( struct timeval *tvp) { - mach_timespec_t now = clock_get_system_value(); - - tvp->tv_sec = now.tv_sec; - tvp->tv_usec = now.tv_nsec / NSEC_PER_USEC; + clock_get_system_microtime(&tvp->tv_sec, &tvp->tv_usec); } /* @@ -572,20 +547,14 @@ nanotime( struct timespec *tsp) { - mach_timespec_t now = clock_get_calendar_value(); - - tsp->tv_sec = now.tv_sec; - tsp->tv_nsec = now.tv_nsec; + clock_get_calendar_nanotime((uint32_t *)&tsp->tv_sec, &tsp->tv_nsec); } void nanouptime( struct timespec *tsp) { - mach_timespec_t now = clock_get_system_value(); - - tsp->tv_sec = now.tv_sec; - tsp->tv_nsec = now.tv_nsec; + clock_get_system_nanotime((uint32_t *)&tsp->tv_sec, &tsp->tv_nsec); } uint64_t diff -urN xnu-344.49/bsd/kern/kern_xxx.c xnu-517/bsd/kern/kern_xxx.c --- xnu-344.49/bsd/kern/kern_xxx.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/kern_xxx.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -118,8 +118,7 @@ name = KERN_HOSTNAME; - return (kern_sysctl(&name, 1, uap->hostname, &uap->len), - 0, 0); + return (kern_sysctl(&name, 1, uap->hostname, &uap->len, 0, 0)); } struct osethostname_args { @@ -204,8 +203,8 @@ return(error); if (uap->opt & RB_COMMAND) - error = copyinstr(uap->command, - command, sizeof(command), &dummy); + error = copyinstr((void *)uap->command, + (void *)command, sizeof(command), (size_t *)&dummy); if (!error) { SET(p->p_flag, P_REBOOT); /* No more signals for this proc */ boot(RB_BOOT, uap->opt, command); diff -urN xnu-344.49/bsd/kern/mach_fat.c xnu-517/bsd/kern/mach_fat.c --- xnu-344.49/bsd/kern/mach_fat.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/mach_fat.c Sat Oct 25 00:25:25 2003 @@ -43,33 +43,36 @@ #include #include +#define CPU_TYPE_NATIVE (machine_slot[cpu_number()].cpu_type) +#define CPU_TYPE_CLASSIC CPU_TYPE_POWERPC /********************************************************************** - * Routine: fatfile_getarch() + * Routine: fatfile_getarch2() * * Function: Locate the architecture-dependant contents of a fat * file that match this CPU. * * Args: vp: The vnode for the fat file. * header: A pointer to the fat file header. + * cpu_type: The required cpu type. * archret (out): Pointer to fat_arch structure to hold * the results. * * Returns: KERN_SUCCESS: Valid architecture found. * KERN_FAILURE: No valid architecture found. 
**********************************************************************/ -load_return_t -fatfile_getarch( - struct vnode *vp, - vm_offset_t data_ptr, - struct fat_arch *archret) +static load_return_t +fatfile_getarch2( + struct vnode *vp, + vm_offset_t data_ptr, + cpu_type_t cpu_type, + struct fat_arch *archret) { /* vm_pager_t pager; */ vm_offset_t addr; vm_size_t size; kern_return_t kret; load_return_t lret; - struct machine_slot *ms; struct fat_arch *arch; struct fat_arch *best_arch; int grade; @@ -107,7 +110,7 @@ /* * Round size of fat_arch structures up to page boundry. */ - size = round_page(end_of_archs); + size = round_page_32(end_of_archs); if (size <= 0) return(LOAD_BADMACHO); @@ -115,7 +118,6 @@ * Scan the fat_arch's looking for the best one. */ addr = data_ptr; - ms = &machine_slot[cpu_number()]; best_arch = NULL; best_grade = 0; arch = (struct fat_arch *) (addr + sizeof(struct fat_header)); @@ -124,7 +126,7 @@ /* * Check to see if right cpu type. */ - if(NXSwapBigIntToHost(arch->cputype) != ms->cpu_type) + if(NXSwapBigIntToHost(arch->cputype) != cpu_type) continue; /* @@ -168,4 +170,54 @@ return(lret); } +extern char classichandler[]; + +load_return_t +fatfile_getarch_affinity( + struct vnode *vp, + vm_offset_t data_ptr, + struct fat_arch *archret, + int affinity) +{ + load_return_t lret; + int handler = (classichandler[0] != 0); + cpu_type_t primary_type, fallback_type; + + if (handler && affinity) { + primary_type = CPU_TYPE_CLASSIC; + fallback_type = CPU_TYPE_NATIVE; + } else { + primary_type = CPU_TYPE_NATIVE; + fallback_type = CPU_TYPE_CLASSIC; + } + lret = fatfile_getarch2(vp, data_ptr, primary_type, archret); + if ((lret != 0) && handler) { + lret = fatfile_getarch2(vp, data_ptr, fallback_type, + archret); + } + return lret; +} + +/********************************************************************** + * Routine: fatfile_getarch() + * + * Function: Locate the architecture-dependant contents of a fat + * file that match this CPU. 
+ * + * Args: vp: The vnode for the fat file. + * header: A pointer to the fat file header. + * archret (out): Pointer to fat_arch structure to hold + * the results. + * + * Returns: KERN_SUCCESS: Valid architecture found. + * KERN_FAILURE: No valid architecture found. + **********************************************************************/ +load_return_t +fatfile_getarch( + struct vnode *vp, + vm_offset_t data_ptr, + struct fat_arch *archret) +{ + return fatfile_getarch2(vp, data_ptr, CPU_TYPE_NATIVE, archret); +} diff -urN xnu-344.49/bsd/kern/mach_header.c xnu-517/bsd/kern/mach_header.c --- xnu-344.49/bsd/kern/mach_header.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/mach_header.c Sat Oct 25 00:25:25 2003 @@ -398,7 +398,7 @@ #if DEBUG printf("fake fvm seg __USER/\"%s\" at 0x%x, size 0x%x\n", sp->sectname, sp->addr, sp->size); -#endif DEBUG +#endif /* DEBUG */ } /* diff -urN xnu-344.49/bsd/kern/mach_loader.c xnu-517/bsd/kern/mach_loader.c --- xnu-344.49/bsd/kern/mach_loader.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/mach_loader.c Sat Oct 25 00:25:25 2003 @@ -47,6 +47,7 @@ #include #include +#include #include #include @@ -77,7 +78,8 @@ unsigned long file_offset, unsigned long macho_size, int depth, - load_result_t *result + load_result_t *result, + boolean_t clean_regions ), load_segment( struct segment_command *scp, @@ -121,7 +123,8 @@ vm_map_t map, thread_act_t thr_act, int depth, - load_result_t *result + load_result_t *result, + boolean_t clean_regions ), get_macho_vnode( char *path, @@ -139,7 +142,8 @@ unsigned long macho_size, load_result_t *result, thread_act_t thr_act, - vm_map_t new_map + vm_map_t new_map, + boolean_t clean_regions ) { pmap_t pmap; @@ -149,6 +153,9 @@ kern_return_t kret; load_return_t lret; boolean_t create_map = TRUE; +#ifndef i386 + extern pmap_t pmap_create(vm_size_t size); /* XXX */ +#endif if (new_map != VM_MAP_NULL) { create_map = FALSE; @@ -168,29 +175,30 @@ TRUE); /**** FIXME ****/ } else map = new_map; - + if (!result) 
result = &myresult; *result = (load_result_t) { 0 }; lret = parse_machfile(vp, map, thr_act, header, file_offset, macho_size, - 0, result); + 0, result, clean_regions); if (lret != LOAD_SUCCESS) { - if (create_map) + if (create_map) { vm_map_deallocate(map); /* will lose pmap reference too */ + } return(lret); } + /* * Commit to new map. First make sure that the current * users of the task get done with it, and that we clean * up the old contents of IPC and memory. The task is * guaranteed to be single threaded upon return (us). * - * Swap the new map for the old at the task level and at - * our activation. The latter consumes our new map reference - * but each leaves us responsible for the old_map reference. + * Swap the new map for the old, which consumes our new map + * reference but each leaves us responsible for the old_map reference. * That lets us get off the pmap associated with it, and * then we can release it. */ @@ -198,10 +206,6 @@ task_halt(current_task()); old_map = swap_task_map(current_task(), map); - vm_map_deallocate(old_map); - - old_map = swap_act_map(current_act(), map); - #ifndef i386 pmap_switch(pmap); /* Make sure we are using the new pmap */ #endif @@ -211,7 +215,6 @@ } int dylink_test = 1; -extern vm_offset_t system_shared_region; static load_return_t @@ -223,7 +226,8 @@ unsigned long file_offset, unsigned long macho_size, int depth, - load_result_t *result + load_result_t *result, + boolean_t clean_regions ) { struct machine_slot *ms; @@ -231,7 +235,7 @@ struct load_command *lcp, *next; struct dylinker_command *dlp = 0; void * pager; - load_return_t ret; + load_return_t ret = LOAD_SUCCESS; vm_offset_t addr, kl_addr; vm_size_t size,kl_size; int offset; @@ -299,7 +303,7 @@ /* * Round size of Mach-O commands up to page boundry. 
*/ - size = round_page(sizeof (struct mach_header) + header->sizeofcmds); + size = round_page_32(sizeof (struct mach_header) + header->sizeofcmds); if (size <= 0) return(LOAD_BADMACHO); @@ -313,11 +317,11 @@ if (addr == NULL) return(LOAD_NOSPACE); - if(error = vn_rdwr(UIO_READ, vp, addr, size, file_offset, + if(error = vn_rdwr(UIO_READ, vp, (caddr_t)addr, size, file_offset, UIO_SYSSPACE, 0, p->p_ucred, &resid, p)) { if (kl_addr ) kfree(kl_addr, kl_size); - return(EIO); + return(LOAD_IOERROR); } /* ubc_map(vp); */ /* NOT HERE */ @@ -376,13 +380,13 @@ case LC_LOAD_DYLINKER: if (pass != 2) break; - if (depth == 1 || dlp == 0) + if ((depth == 1) && (dlp == 0)) dlp = (struct dylinker_command *)lcp; else ret = LOAD_FAILURE; break; default: - ret = KERN_SUCCESS;/* ignore other stuff */ + ret = LOAD_SUCCESS;/* ignore other stuff */ } if (ret != LOAD_SUCCESS) break; @@ -390,7 +394,7 @@ if (ret != LOAD_SUCCESS) break; } - if (ret == LOAD_SUCCESS && dlp != 0) { + if ((ret == LOAD_SUCCESS) && (depth == 1)) { vm_offset_t addr; shared_region_mapping_t shared_region; struct shared_region_task_mappings map_info; @@ -408,33 +412,91 @@ &(map_info.client_base), &(map_info.alternate_base), &(map_info.alternate_next), + &(map_info.fs_base), + &(map_info.system), &(map_info.flags), &next); - if((map_info.self != (vm_offset_t)system_shared_region) && - (map_info.flags & SHARED_REGION_SYSTEM)) { - shared_region_mapping_ref(system_shared_region); - vm_set_shared_region(task, system_shared_region); - shared_region_mapping_dealloc( + if((map_info.flags & SHARED_REGION_FULL) || + (map_info.flags & SHARED_REGION_STALE)) { + shared_region_mapping_t system_region; + system_region = lookup_default_shared_region( + map_info.fs_base, map_info.system); + if((map_info.self != (vm_offset_t)system_region) && + (map_info.flags & SHARED_REGION_SYSTEM)) { + if(system_region == NULL) { + shared_file_boot_time_init( + map_info.fs_base, map_info.system); + } else { + vm_set_shared_region(task, 
system_region); + } + shared_region_mapping_dealloc( (shared_region_mapping_t)map_info.self); - goto RedoLookup; + goto RedoLookup; + } else if (map_info.flags & SHARED_REGION_SYSTEM) { + shared_region_mapping_dealloc(system_region); + shared_file_boot_time_init( + map_info.fs_base, map_info.system); + shared_region_mapping_dealloc( + (shared_region_mapping_t)map_info.self); + } else { + shared_region_mapping_dealloc(system_region); + } } if (dylink_test) { p->p_flag |= P_NOSHLIB; /* no shlibs in use */ addr = map_info.client_base; - vm_map(map, &addr, map_info.text_size, 0, + if(clean_regions) { + vm_map(map, &addr, map_info.text_size, + 0, SHARED_LIB_ALIAS, + map_info.text_region, 0, FALSE, + VM_PROT_READ, VM_PROT_READ, VM_INHERIT_SHARE); + } else { + vm_map(map, &addr, map_info.text_size, 0, (VM_MEMORY_SHARED_PMAP << 24) | SHARED_LIB_ALIAS, map_info.text_region, 0, FALSE, VM_PROT_READ, VM_PROT_READ, VM_INHERIT_SHARE); + } addr = map_info.client_base + map_info.text_size; vm_map(map, &addr, map_info.data_size, 0, SHARED_LIB_ALIAS, map_info.data_region, 0, TRUE, VM_PROT_READ, VM_PROT_READ, VM_INHERIT_SHARE); + + while (next) { + /* this should be fleshed out for the general case */ + /* but this is not necessary for now. Indeed we */ + /* are handling the com page inside of the */ + /* shared_region mapping create calls for now for */ + /* simplicities sake. 
If more general support is */ + /* needed the code to manipulate the shared range */ + /* chain can be pulled out and moved to the callers*/ + shared_region_mapping_info(next, + &(map_info.text_region), + &(map_info.text_size), + &(map_info.data_region), + &(map_info.data_size), + &(map_info.region_mappings), + &(map_info.client_base), + &(map_info.alternate_base), + &(map_info.alternate_next), + &(map_info.fs_base), + &(map_info.system), + &(map_info.flags), &next); + + addr = map_info.client_base; + vm_map(map, &addr, map_info.text_size, + 0, SHARED_LIB_ALIAS, + map_info.text_region, 0, FALSE, + VM_PROT_READ, VM_PROT_READ, VM_INHERIT_SHARE); + } } - ret = load_dylinker(dlp, map, thr_act, depth, result); + if (dlp != 0) { + ret = load_dylinker(dlp, map, thr_act, + depth, result, clean_regions); + } } if (kl_addr ) @@ -467,9 +529,6 @@ caddr_t tmp; vm_prot_t initprot; vm_prot_t maxprot; -#if 1 - extern int print_map_addr; -#endif /* 1 */ /* * Make sure what we get from the file is really ours (as specified @@ -478,15 +537,15 @@ if (scp->fileoff + scp->filesize > macho_size) return (LOAD_BADMACHO); - seg_size = round_page(scp->vmsize); + seg_size = round_page_32(scp->vmsize); if (seg_size == 0) return(KERN_SUCCESS); /* * Round sizes to page size. */ - map_size = round_page(scp->filesize); - map_addr = trunc_page(scp->vmaddr); + map_size = round_page_32(scp->filesize); + map_addr = trunc_page_32(scp->vmaddr); map_offset = pager_offset + scp->fileoff; @@ -504,10 +563,6 @@ if (ret != KERN_SUCCESS) return(LOAD_NOSPACE); -#if 1 - if (print_map_addr) - printf("LSegment: Mapped addr= %x; size = %x\n", map_addr, map_size); -#endif /* 1 */ /* * If the file didn't end on a page boundary, * we need to zero the leftover. 
@@ -570,18 +625,16 @@ load_return_t load_unixthread( struct thread_command *tcp, - thread_act_t thr_act, + thread_act_t thread, load_result_t *result ) { - thread_t thread = current_thread(); load_return_t ret; int customstack =0; if (result->thread_count != 0) return (LOAD_FAILURE); - thread = getshuttle_thread(thr_act); ret = load_threadstack(thread, (unsigned long *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), @@ -620,25 +673,23 @@ load_return_t load_thread( struct thread_command *tcp, - thread_act_t thr_act, + thread_act_t thread, load_result_t *result ) { - thread_t thread; kern_return_t kret; load_return_t lret; task_t task; int customstack=0; - task = get_threadtask(thr_act); - thread = getshuttle_thread(thr_act); + task = get_threadtask(thread); /* if count is 0; same as thr_act */ if (result->thread_count != 0) { kret = thread_create(task, &thread); if (kret != KERN_SUCCESS) return(LOAD_RESOURCE); - thread_deallocate(thread); + act_deallocate(thread); } lret = load_threadstate(thread, @@ -706,7 +757,7 @@ total_size -= (size+2)*sizeof(unsigned long); if (total_size < 0) return(LOAD_BADMACHO); - ret = thread_setstatus(getact_thread(thread), flavor, ts, size); + ret = thread_setstatus(thread, flavor, ts, size); if (ret != KERN_SUCCESS) return(LOAD_FAILURE); ts += size; /* ts is a (unsigned long *) */ @@ -783,7 +834,8 @@ vm_map_t map, thread_act_t thr_act, int depth, - load_result_t *result + load_result_t *result, + boolean_t clean_regions ) { char *name; @@ -798,6 +850,7 @@ vm_map_copy_t tmp; vm_offset_t dyl_start, map_addr; vm_size_t dyl_length; + extern pmap_t pmap_create(vm_size_t size); /* XXX */ name = (char *)lcp + lcp->name.offset; /* @@ -824,7 +877,7 @@ ret = parse_machfile(vp, copy_map, thr_act, &header, file_offset, macho_size, - depth, &myresult); + depth, &myresult, clean_regions); if (ret) goto out; @@ -898,7 +951,7 @@ struct proc *p = current_proc(); /* XXXX */ boolean_t is_fat; struct fat_arch fat_arch; - int error = KERN_SUCCESS; + 
int error = LOAD_SUCCESS; int resid; union { struct mach_header mach_header; @@ -907,6 +960,7 @@ } header; off_t fsize = (off_t)0; struct ucred *cred = p->p_ucred; + int err2; ndp = &nid; atp = &attr; @@ -914,24 +968,31 @@ /* init the namei data to point the file user's program name */ NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p); - if (error = namei(ndp)) + if (error = namei(ndp)) { + if (error == ENOENT) + error = LOAD_ENOENT; + else + error = LOAD_FAILURE; return(error); + } vp = ndp->ni_vp; /* check for regular file */ if (vp->v_type != VREG) { - error = EACCES; + error = LOAD_PROTECT; goto bad1; } /* get attributes */ - if (error = VOP_GETATTR(vp, &attr, cred, p)) + if (error = VOP_GETATTR(vp, &attr, cred, p)) { + error = LOAD_FAILURE; goto bad1; + } /* Check mount point */ if (vp->v_mount->mnt_flag & MNT_NOEXEC) { - error = EACCES; + error = LOAD_PROTECT; goto bad1; } @@ -939,28 +1000,33 @@ atp->va_mode &= ~(VSUID | VSGID); /* check access. for root we have to see if any exec bit on */ - if (error = VOP_ACCESS(vp, VEXEC, cred, p)) + if (error = VOP_ACCESS(vp, VEXEC, cred, p)) { + error = LOAD_PROTECT; goto bad1; + } if ((atp->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { - error = EACCES; + error = LOAD_PROTECT; goto bad1; } /* hold the vnode for the IO */ if (UBCINFOEXISTS(vp) && !ubc_hold(vp)) { - error = ENOENT; + error = LOAD_ENOENT; goto bad1; } /* try to open it */ if (error = VOP_OPEN(vp, FREAD, cred, p)) { + error = LOAD_PROTECT; ubc_rele(vp); goto bad1; } if(error = vn_rdwr(UIO_READ, vp, (caddr_t)&header, sizeof(header), 0, - UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, p)) + UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, p)) { + error = LOAD_IOERROR; goto bad2; + } if (header.mach_header.magic == MH_MAGIC) is_fat = FALSE; @@ -979,11 +1045,11 @@ goto bad2; /* Read the Mach-O header out of it */ - error = vn_rdwr(UIO_READ, vp, &header.mach_header, + error = vn_rdwr(UIO_READ, vp, (caddr_t)&header.mach_header, sizeof(header.mach_header), 
fat_arch.offset, UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, p); if (error) { - error = LOAD_FAILURE; + error = LOAD_IOERROR; goto bad2; } @@ -1012,7 +1078,7 @@ bad2: VOP_UNLOCK(vp, 0, p); - error = VOP_CLOSE(vp, FREAD, cred, p); + err2 = VOP_CLOSE(vp, FREAD, cred, p); ubc_rele(vp); vrele(vp); return (error); diff -urN xnu-344.49/bsd/kern/mach_loader.h xnu-517/bsd/kern/mach_loader.h --- xnu-344.49/bsd/kern/mach_loader.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/mach_loader.h Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -38,20 +38,19 @@ #define _BSD_KERN_MACH_LOADER_H_ #include - #include typedef int load_return_t; typedef struct _load_result { - vm_offset_t mach_header; - vm_offset_t entry_point; - vm_offset_t user_stack; - int thread_count; - unsigned int - /* boolean_t */ unixproc :1, - dynlinker :1, - customstack :1, + vm_offset_t mach_header; + vm_offset_t entry_point; + vm_offset_t user_stack; + int thread_count; + unsigned int + /* boolean_t */ unixproc :1, + dynlinker :1, + customstack :1, :0; } load_result_t; @@ -62,7 +61,8 @@ unsigned long macho_size, load_result_t *result, thread_act_t thr_act, - vm_map_t map); + vm_map_t map, + boolean_t clean_regions); #define LOAD_SUCCESS 0 #define LOAD_BADARCH 1 /* CPU type/subtype not found */ @@ -72,5 +72,7 @@ #define LOAD_NOSPACE 5 /* No VM available */ #define LOAD_PROTECT 6 /* protection violation */ #define LOAD_RESOURCE 7 /* resource allocation failure */ +#define LOAD_ENOENT 8 /* resource not found */ +#define LOAD_IOERROR 9 /* IO error */ #endif /* _BSD_KERN_MACH_LOADER_H_ */ diff -urN xnu-344.49/bsd/kern/mach_process.c xnu-517/bsd/kern/mach_process.c --- xnu-344.49/bsd/kern/mach_process.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/mach_process.c Sat Oct 25 00:25:25 2003 @@ -117,7 +117,7 @@ int *locr0; int error = 0; #if defined(ppc) 
- struct ppc_thread_state statep; + struct ppc_thread_state64 statep; #elif defined(i386) struct i386_saved_state statep; #else @@ -291,8 +291,8 @@ goto errorLabel; } #elif defined(ppc) - state_count = PPC_THREAD_STATE_COUNT; - if (thread_getstatus(th_act, PPC_THREAD_STATE, &statep, &state_count) != KERN_SUCCESS) { + state_count = PPC_THREAD_STATE64_COUNT; + if (thread_getstatus(th_act, PPC_THREAD_STATE64, &statep, &state_count) != KERN_SUCCESS) { goto errorLabel; } #else @@ -306,9 +306,9 @@ if (!ALIGNED((int)uap->addr, sizeof(int))) return (ERESTART); - statep.srr0 = (int)uap->addr; - state_count = PPC_THREAD_STATE_COUNT; - if (thread_setstatus(th_act, PPC_THREAD_STATE, &statep, &state_count) != KERN_SUCCESS) { + statep.srr0 = (uint64_t)((uint32_t)uap->addr); + state_count = PPC_THREAD_STATE64_COUNT; + if (thread_setstatus(th_act, PPC_THREAD_STATE64, &statep, &state_count) != KERN_SUCCESS) { goto errorLabel; } #undef ALIGNED @@ -324,8 +324,8 @@ psignal_lock(t, uap->data, 0); } #if defined(ppc) - state_count = PPC_THREAD_STATE_COUNT; - if (thread_getstatus(th_act, PPC_THREAD_STATE, &statep, &state_count) != KERN_SUCCESS) { + state_count = PPC_THREAD_STATE64_COUNT; + if (thread_getstatus(th_act, PPC_THREAD_STATE64, &statep, &state_count) != KERN_SUCCESS) { goto errorLabel; } #endif @@ -349,8 +349,8 @@ #endif } #if defined (ppc) - state_count = PPC_THREAD_STATE_COUNT; - if (thread_setstatus(th_act, PPC_THREAD_STATE, &statep, &state_count) != KERN_SUCCESS) { + state_count = PPC_THREAD_STATE64_COUNT; + if (thread_setstatus(th_act, PPC_THREAD_STATE64, &statep, &state_count) != KERN_SUCCESS) { goto errorLabel; } #endif @@ -359,7 +359,8 @@ t->p_stat = SRUN; if (t->sigwait) { wakeup((caddr_t)&(t->sigwait)); - task_release(task); + if ((t->p_flag & P_SIGEXC) == 0) + task_release(task); } break; diff -urN xnu-344.49/bsd/kern/netboot.c xnu-517/bsd/kern/netboot.c --- xnu-344.49/bsd/kern/netboot.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/netboot.c Sat Oct 25 00:25:25 2003 
@@ -1,5 +1,5 @@ /* - * Copyright (c) 2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2001-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -348,7 +348,7 @@ if (PE_parse_boot_arg("vndevice", vndevice) == TRUE) { use_hdix = FALSE; } - _FREE_ZONE(vndevice, MAXPATHLEN, M_NAMEI); + FREE_ZONE(vndevice, MAXPATHLEN, M_NAMEI); info = (struct netboot_info *)kalloc(sizeof(*info)); bzero(info, sizeof(*info)); @@ -412,7 +412,7 @@ printf("netboot: root path uses unrecognized format\n"); } } - _FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); + FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); return (info); } diff -urN xnu-344.49/bsd/kern/posix_sem.c xnu-517/bsd/kern/posix_sem.c --- xnu-344.49/bsd/kern/posix_sem.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/posix_sem.c Sat Oct 25 00:25:25 2003 @@ -138,8 +138,10 @@ struct proc *p)); static int psem_closefile __P((struct file *fp, struct proc *p)); +static int psem_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); + struct fileops psemops = - { psem_read, psem_write, psem_ioctl, psem_select, psem_closefile }; + { psem_read, psem_write, psem_ioctl, psem_select, psem_closefile, psem_kqfilter }; /* * Lookup an entry in the cache @@ -310,7 +312,7 @@ register struct filedesc *fdp = p->p_fd; register struct file *fp; register struct vnode *vp; - int flags, i; + int i; struct file *nfp; int type, indx, error; struct psemname nd; @@ -334,7 +336,7 @@ MALLOC_ZONE(pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); pathlen = MAXPATHLEN; - error = copyinstr(uap->name, pnbuf, + error = copyinstr((void *)uap->name, pnbuf, MAXPATHLEN, &pathlen); if (error) { goto bad; @@ -446,13 +448,13 @@ pinfo->psem_flags &= ~PSEM_INCREATE; pinfo->psem_usecount++; pnode->pinfo = pinfo; - fp->f_flag = flags & FMASK; + fp->f_flag = fmode & FMASK; fp->f_type = DTYPE_PSXSEM; fp->f_ops = &psemops; fp->f_data = (caddr_t)pnode; *fdflags(p, indx) &= ~UF_RESERVED; *retval = indx; - _FREE_ZONE(pnbuf, MAXPATHLEN, 
M_NAMEI); + FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (0); bad3: @@ -473,7 +475,7 @@ fdrelse(p, indx); ffree(nfp); bad: - _FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); + FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (error); } @@ -553,7 +555,7 @@ MALLOC_ZONE(pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); pathlen = MAXPATHLEN; - error = copyinstr(uap->name, pnbuf, + error = copyinstr((void *)uap->name, pnbuf, MAXPATHLEN, &pathlen); if (error) { goto bad; @@ -624,7 +626,7 @@ _FREE(pcache, M_SHM); error = 0; bad: - _FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); + FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (error); } @@ -943,3 +945,13 @@ { return(EOPNOTSUPP); } + +static int +psem_kqfilter(fp, kn, p) + struct file *fp; + struct knote *kn; + struct proc *p; +{ + return (EOPNOTSUPP); +} + diff -urN xnu-344.49/bsd/kern/posix_shm.c xnu-517/bsd/kern/posix_shm.c --- xnu-344.49/bsd/kern/posix_shm.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/posix_shm.c Sat Oct 25 00:25:25 2003 @@ -143,8 +143,10 @@ struct proc *p)); static int pshm_closefile __P((struct file *fp, struct proc *p)); +static int pshm_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); + struct fileops pshmops = - { pshm_read, pshm_write, pshm_ioctl, pshm_select, pshm_closefile }; + { pshm_read, pshm_write, pshm_ioctl, pshm_select, pshm_closefile, pshm_kqfilter }; /* * Lookup an entry in the cache @@ -210,8 +212,8 @@ { register struct pshmcache *pcp; register struct pshmhashhead *pcpp; - register struct pshminfo *dpinfo; - register struct pshmcache *dpcp; + struct pshminfo *dpinfo; + struct pshmcache *dpcp; #if DIAGNOSTIC if (pnp->pshm_namelen > NCHNAMLEN) @@ -337,7 +339,7 @@ MALLOC_ZONE(pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); pathlen = MAXPATHLEN; - error = copyinstr(uap->name, pnbuf, + error = copyinstr((void *)uap->name, (void *)pnbuf, MAXPATHLEN, &pathlen); if (error) { goto bad; @@ -384,11 +386,13 @@ } else incache = 1; fmode = FFLAGS(uap->oflag); - if ((fmode & (FREAD | FWRITE))==0) - 
return(EINVAL); + if ((fmode & (FREAD | FWRITE))==0) { + error = EINVAL; + goto bad; + } if (error = falloc(p, &nfp, &indx)) - return (error); + goto bad; fp = nfp; cmode &= ALLPERMS; @@ -462,7 +466,7 @@ fp->f_data = (caddr_t)pnode; *fdflags(p, indx) &= ~UF_RESERVED; *retval = indx; - _FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); + FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (0); bad3: _FREE(pnode, M_SHM); @@ -474,7 +478,7 @@ fdrelse(p, indx); ffree(nfp); bad: - _FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); + FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (error); } @@ -510,7 +514,7 @@ return(EINVAL); } - size = round_page (length); + size = round_page_64(length); kret = vm_allocate(current_map(), &user_addr, size, TRUE); if (kret != KERN_SUCCESS) goto out; @@ -616,8 +620,8 @@ int pshm_mmap(struct proc *p, struct mmap_args *uap, register_t *retval, struct file *fp, vm_size_t pageoff) { - vm_offset_t user_addr = uap->addr; - vm_size_t user_size = uap->len ; + vm_offset_t user_addr = (vm_offset_t)uap->addr; + vm_size_t user_size = (vm_size_t)uap->len ; int prot = uap->prot; int flags = uap->flags; vm_object_offset_t file_pos = (vm_object_offset_t)uap->pos; @@ -664,9 +668,9 @@ if ((flags & MAP_FIXED) == 0) { find_space = TRUE; - user_addr = round_page(user_addr); + user_addr = round_page_32(user_addr); } else { - if (user_addr != trunc_page(user_addr)) + if (user_addr != trunc_page_32(user_addr)) return (EINVAL); find_space = FALSE; (void) vm_deallocate(user_map, user_addr, user_size); @@ -738,7 +742,7 @@ MALLOC_ZONE(pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); pathlen = MAXPATHLEN; - error = copyinstr(uap->name, pnbuf, + error = copyinstr((void *)uap->name, (void *)pnbuf, MAXPATHLEN, &pathlen); if (error) { goto bad; @@ -808,7 +812,7 @@ pinfo->pshm_flags |= PSHM_REMOVED; error = 0; bad: - _FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); + FREE_ZONE(pnbuf, MAXPATHLEN, M_NAMEI); return (error); out: switch (kret) { @@ -897,6 +901,15 @@ struct file *fp; int which; void *wql; + struct 
proc *p; +{ + return(EOPNOTSUPP); +} + +static int +pshm_kqfilter(fp, kn, p) + struct file *fp; + struct knote *kn; struct proc *p; { return(EOPNOTSUPP); diff -urN xnu-344.49/bsd/kern/qsort.c xnu-517/bsd/kern/qsort.c --- xnu-344.49/bsd/kern/qsort.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/qsort.c Sat Oct 25 00:25:25 2003 @@ -129,16 +129,16 @@ loop: SWAPINIT(a, es); swap_cnt = 0; if (n < 7) { - for (pm = a + es; pm < (char *) a + n * es; pm += es) + for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es) for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; pl -= es) swap(pl, pl - es); return; } - pm = a + (n / 2) * es; + pm = (char *)a + (n / 2) * es; if (n > 7) { pl = a; - pn = a + (n - 1) * es; + pn = (char *)a + (n - 1) * es; if (n > 40) { d = (n / 8) * es; pl = med3(pl, pl + d, pl + 2 * d, cmp); @@ -148,9 +148,9 @@ pm = med3(pl, pm, pn, cmp); } swap(a, pm); - pa = pb = a + es; + pa = pb = (char *)a + es; - pc = pd = a + (n - 1) * es; + pc = pd = (char *)a + (n - 1) * es; for (;;) { while (pb <= pc && (r = cmp(pb, a)) <= 0) { if (r == 0) { @@ -176,14 +176,14 @@ pc -= es; } if (swap_cnt == 0) { /* Switch to insertion sort */ - for (pm = a + es; pm < (char *) a + n * es; pm += es) + for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es) for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; pl -= es) swap(pl, pl - es); return; } - pn = a + n * es; + pn = (char *)a + n * es; r = min(pa - (char *)a, pb - pa); vecswap(a, pb - r, r); r = min(pd - pc, pn - pd - es); diff -urN xnu-344.49/bsd/kern/subr_log.c xnu-517/bsd/kern/subr_log.c --- xnu-344.49/bsd/kern/subr_log.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/subr_log.c Sat Oct 25 00:25:25 2003 @@ -259,7 +259,7 @@ /*ARGSUSED*/ int -logioctl(com, data, flag) +logioctl(dev, com, data, flag) caddr_t data; { long l; diff -urN xnu-344.49/bsd/kern/subr_prf.c xnu-517/bsd/kern/subr_prf.c --- xnu-344.49/bsd/kern/subr_prf.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/subr_prf.c Sat Oct 25 00:25:25 2003 
@@ -210,14 +210,13 @@ flags |= TOTTY; tp = sess->s_ttyp; } - if (tp != NULL) { - pca.flags = TOTTY; - pca.tty = tp; - - va_start(ap, fmt); - __doprnt(fmt, &ap, putchar, &pca, 10); - va_end(ap); - } + + pca.flags = flags; + pca.tty = tp; + va_start(ap, fmt); + __doprnt(fmt, &ap, putchar, &pca, 10); + va_end(ap); + logwakeup(); } diff -urN xnu-344.49/bsd/kern/subr_prof.c xnu-517/bsd/kern/subr_prof.c --- xnu-344.49/bsd/kern/subr_prof.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/subr_prof.c Sat Oct 25 00:25:25 2003 @@ -75,6 +75,8 @@ #include #include +decl_simple_lock_data(,mcount_lock); + /* * Froms is actually a bunch of unsigned shorts indexing tos */ @@ -121,6 +123,7 @@ p->kcount = (u_short *)cp; cp += p->kcountsize; p->froms = (u_short *)cp; + simple_lock_init(&mcount_lock); } /* @@ -183,7 +186,6 @@ register struct tostruct *top, *prevtop; struct gmonparam *p = &_gmonparam; register long toindex; - MCOUNT_INIT; /* * check that we are profiling @@ -192,7 +194,7 @@ if (p->state != GMON_PROF_ON) return; - MCOUNT_ENTER; + usimple_lock(&mcount_lock); /* * check that frompcindex is a reasonable pc value. @@ -275,25 +277,20 @@ } done: - MCOUNT_EXIT; + usimple_unlock(&mcount_lock); return; overflow: p->state = GMON_PROF_ERROR; - MCOUNT_EXIT; + usimple_unlock(&mcount_lock); printf("mcount: tos overflow\n"); return; } #endif /* GPROF */ -#if NCPUS > 1 #define PROFILE_LOCK(x) simple_lock(x) #define PROFILE_UNLOCK(x) simple_unlock(x) -#else -#define PROFILE_LOCK(x) -#define PROFILE_UNLOCK(x) -#endif struct profil_args { short *bufbase; @@ -319,7 +316,7 @@ } /* Block profile interrupts while changing state. 
*/ - s = splstatclock(); + s = ml_set_interrupts_enabled(FALSE); PROFILE_LOCK(&upp->pr_lock); upp->pr_base = (caddr_t)uap->bufbase; upp->pr_size = uap->bufsize; @@ -335,7 +332,7 @@ upp->pr_next = 0; PROFILE_UNLOCK(&upp->pr_lock); startprofclock(p); - splx(s); + ml_set_interrupts_enabled(s); return(0); } @@ -356,7 +353,7 @@ if (upp->pr_scale == 0) return (0); - s = splstatclock(); + s = ml_set_interrupts_enabled(FALSE); upc = (struct uprof *) kalloc(sizeof (struct uprof)); upc->pr_base = (caddr_t)uap->bufbase; upc->pr_size = uap->bufsize; @@ -366,7 +363,7 @@ upc->pr_next = upp->pr_next; upp->pr_next = upc; PROFILE_UNLOCK(&upp->pr_lock); - splx(s); + ml_set_interrupts_enabled(s); return(0); } diff -urN xnu-344.49/bsd/kern/sys_generic.c xnu-517/bsd/kern/sys_generic.c --- xnu-344.49/bsd/kern/sys_generic.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/sys_generic.c Sat Oct 25 00:25:25 2003 @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -109,13 +110,10 @@ #if KTRACE #include #endif +#include -static int dofileread __P((struct proc *, struct file *, int, void *, - size_t, off_t, int, int*)); -static int dofilewrite __P((struct proc *, struct file *, int, - const void *, size_t, off_t, int, int*)); -static struct file* +__private_extern__ struct file* holdfp(fdp, fd, flag) struct filedesc* fdp; int fd, flag; @@ -191,13 +189,18 @@ uap->offset, FOF_OFFSET, retval); } frele(fp); + + if (!error) + KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE), + uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0); + return(error); } /* * Code common for read and pread */ -int +__private_extern__ int dofileread(p, fp, fd, buf, nbyte, offset, flags, retval) struct proc *p; struct file *fp; @@ -357,10 +360,15 @@ uap->offset, FOF_OFFSET, retval); } frele(fp); + + if (!error) + KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE), + uap->fd, uap->nbyte, 
(unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0); + return(error); } -static int +__private_extern__ int dofilewrite(p, fp, fd, buf, nbyte, offset, flags, retval) struct proc *p; struct file *fp; @@ -407,8 +415,9 @@ if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; - if (error == EPIPE) - psignal(p, SIGPIPE); + /* The socket layer handles SIGPIPE */ + if (error == EPIPE && fp->f_type != DTYPE_SOCKET) + psignal(p, SIGPIPE); } cnt -= auio.uio_resid; #if KTRACE @@ -1031,6 +1040,7 @@ int nfunnel = 0; int count, nfcount; char * wql_ptr; + struct vnode *vp; /* * Problems when reboot; due to MacOSX signal probs @@ -1072,7 +1082,18 @@ wql_ptr = (char *)0; else wql_ptr = (wql+ nc * SIZEOF_WAITQUEUE_LINK); - if (fp->f_ops && (fp->f_type != DTYPE_SOCKET) + /* + * Merlot: need to remove the bogus f_data check + * from the following "if" statement. It's there + * because of various problems stemming from + * races due to the split-funnels and lack of real + * referencing on sockets... 
+ */ + if (fp->f_ops && (fp->f_type != DTYPE_SOCKET) + && (fp->f_data != (caddr_t)-1) + && !(fp->f_type == DTYPE_VNODE + && (vp = (struct vnode *)fp->f_data) + && vp->v_type == VFIFO) && fo_select(fp, flag[msk], wql_ptr, p)) { optr[fd/NFDBITS] |= (1 << (fd % NFDBITS)); n++; @@ -1105,8 +1126,13 @@ wql_ptr = (char *)0; else wql_ptr = (wql+ nc * SIZEOF_WAITQUEUE_LINK); - if (fp->f_ops && (fp->f_type == DTYPE_SOCKET) && - fo_select(fp, flag[msk], wql_ptr, p)) { + if (fp->f_ops + && (fp->f_type == DTYPE_SOCKET + || (fp->f_type == DTYPE_VNODE + && (vp = (struct vnode *)fp->f_data) + && vp != (struct vnode *)-1 + && vp->v_type == VFIFO)) + && fo_select(fp, flag[msk], wql_ptr, p)) { optr[fd/NFDBITS] |= (1 << (fd % NFDBITS)); n++; } @@ -1150,6 +1176,7 @@ static int flag[3] = { FREAD, FWRITE, 0 }; u_int32_t *iptr, *fptr, *fbits; u_int nw; + struct vnode *vp; /* * Problems when reboot; due to MacOSX signal probs @@ -1177,7 +1204,10 @@ *nfcount=0; return(EBADF); } - if (fp->f_type == DTYPE_SOCKET) + if (fp->f_type == DTYPE_SOCKET || + (fp->f_type == DTYPE_VNODE + && (vp = (struct vnode *)fp->f_data) + && vp->v_type == VFIFO)) nfc++; n++; } @@ -1212,7 +1242,7 @@ } if ((sip->si_flags & SI_INITED) == 0) { - wait_queue_init(&sip->wait_queue, SYNC_POLICY_FIFO); + wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO); sip->si_flags |= SI_INITED; sip->si_flags &= ~SI_CLEAR; } @@ -1223,8 +1253,8 @@ sip->si_flags &= ~SI_COLL; sip->si_flags |= SI_RECORDED; - if (!wait_queue_member(&sip->wait_queue, ut->uu_wqsub)) - wait_queue_link_noalloc(&sip->wait_queue, ut->uu_wqsub, (wait_queue_link_t)p_wql); + if (!wait_queue_member(&sip->si_wait_queue, ut->uu_wqsub)) + wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_wqsub, (wait_queue_link_t)p_wql); return; } @@ -1248,7 +1278,7 @@ } if (sip->si_flags & SI_RECORDED) { - wait_queue_wakeup_all(&sip->wait_queue, &selwait, THREAD_AWAKENED); + wait_queue_wakeup_all(&sip->si_wait_queue, &selwait, THREAD_AWAKENED); sip->si_flags &= ~SI_RECORDED; } @@ 
-1267,7 +1297,7 @@ sip->si_flags &= ~(SI_RECORDED | SI_COLL); } sip->si_flags |= SI_CLEAR; - wait_queue_unlinkall_nofree(&sip->wait_queue); + wait_queue_unlinkall_nofree(&sip->si_wait_queue); } @@ -1644,7 +1674,7 @@ } if (interval != 0) - clock_absolutetime_interval_to_deadline(interval, &abstime) + clock_absolutetime_interval_to_deadline(interval, &abstime); KERNEL_DEBUG(DBG_MISC_WAIT, 1,&p->p_evlist,0,0,0); error = tsleep1(&p->p_evlist, PSOCK | PCATCH, @@ -1702,8 +1732,10 @@ return(EBADF); if (fp->f_type != DTYPE_SOCKET) return(EINVAL); // for now must be sock sp = (struct socket *)fp->f_data; - assert(sp != NULL); + /* soo_close sets f_data to 0 before switching funnel */ + if (sp == (struct socket *)0) + return(EBADF); // locate event if possible for (evq = sp->so_evlist.tqh_first; diff -urN xnu-344.49/bsd/kern/sys_socket.c xnu-517/bsd/kern/sys_socket.c --- xnu-344.49/bsd/kern/sys_socket.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/sys_socket.c Sat Oct 25 00:25:25 2003 @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -80,8 +81,10 @@ int soo_select __P((struct file *fp, int which, void * wql, struct proc *p)); +int soo_kqfilter __P((struct file *fp, struct knote *kn, struct proc *p)); + struct fileops socketops = - { soo_read, soo_write, soo_ioctl, soo_select, soo_close }; + { soo_read, soo_write, soo_ioctl, soo_select, soo_close, soo_kqfilter }; /* ARGSUSED */ int @@ -346,6 +349,7 @@ register int s = splnet(); int retnum=0; + if (so == NULL || so == (struct socket*)-1) goto done; switch (which) { @@ -414,14 +418,17 @@ struct proc *p; { int error = 0; + struct socket *sp; + + sp = (struct socket *)fp->f_data; + fp->f_data = NULL; thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - if (fp->f_data) - error = soclose((struct socket *)fp->f_data); + if (sp) + error = soclose(sp); thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - fp->f_data = 0; return (error); } diff -urN xnu-344.49/bsd/kern/syscalls.c 
xnu-517/bsd/kern/syscalls.c --- xnu-344.49/bsd/kern/syscalls.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/syscalls.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -128,7 +128,11 @@ "getpriority", /* 100 = getpriority */ "old_send", /* 101 = old send */ "old_recv", /* 102 = old recv */ +#ifdef __ppc__ + "osigreturn", /* 103 = sigreturn */ +#else "sigreturn", /* 103 = sigreturn */ +#endif "bind", /* 104 = bind */ "setsockopt", /* 105 = setsockopt */ "listen", /* 106 = listen */ @@ -209,7 +213,11 @@ "setgid", /* 181 = setgid */ "setegid", /* 182 = setegid */ "seteuid", /* 183 = seteuid */ +#ifdef __ppc__ + "sigreturn", /* 184 = sigreturn */ +#else "#184", /* 184 = nosys */ +#endif "#185", /* 185 = nosys */ "#186", /* 186 = nosys */ "#187", /* 187 = nosys */ @@ -239,8 +247,8 @@ "ATPgetreq", /* 211 = ATPgetreq */ "ATPgetrsp", /* 212 = ATPgetrsp */ "#213", /* 213 = Reserved for AppleTalk */ - "#214", /* 214 = Reserved for AppleTalk */ - "#215", /* 215 = Reserved for AppleTalk */ + "kqueue_from_portset_np", /* 214 = kqueue_from_portset_np */ + "kqueue_portset_np", /* 215 = kqueue_portset_np */ "#216", /* 216 = Reserved */ "#217", /* 217 = Reserved */ "#218", /* 218 = Reserved */ @@ -272,8 +280,8 @@ "#244", /* 244 = nosys */ "#245", /* 245 = nosys */ "#246", /* 246 = nosys */ - "#247", /* 247 = nosys */ - "#248", /* 248 = nosys */ + "nfsclnt", /* 247 = nfsclnt */ + "fhopen", /* 248 = fhopen */ "#249", /* 249 = nosys */ "minherit", /* 250 = minherit */ "semsys", /* 251 = semsys */ @@ -338,41 +346,61 @@ "getsid", /* 310 = getsid */ "#311", /* 311 = setresuid */ "#312", /* 312 = setresgid */ - "#313", /* 313 = obsolete signanosleep */ - "#314", /* 314 = aio_return */ - "#315", /* 315 = aio_suspend */ - "#316", /* 316 = aio_cancel */ - "#317", /* 317 = aio_error */ - "#318", /* 318 = aio_read */ - 
"#319", /* 319 = aio_write */ - "#320", /* 320 = lio_listio */ + "aio_fsync", /* 313 = aio_fsync */ + "aio_return", /* 314 = aio_return */ + "aio_suspend", /* 315 = aio_suspend */ + "aio_cancel", /* 316 = aio_cancel */ + "aio_error", /* 317 = aio_error */ + "aio_read", /* 318 = aio_read */ + "aio_write", /* 319 = aio_write */ + "lio_listio", /* 320 = lio_listio */ "#321", /* 321 = yield */ "#322", /* 322 = thr_sleep */ "#323", /* 323 = thr_wakeup */ "mlockall", /* 324 = mlockall */ "munlockall", /* 325 = munlockall */ - "#326", /* 326 */ + "#326", /* 326 */ "issetugid", /* 327 = issetugid */ "__pthread_kill", /* 328 = __pthread_kill */ "pthread_sigmask", /* 329 = pthread_sigmask */ - "sigwait", /* 330 = sigwait */ - "#331", /* 331 */ - "#332", /* 332 */ - "#333", /* 333 */ - "#334", /* 334 */ - "utrace", /* 335 = utrace */ - "#336", /* 336 */ - "#337", /* 337 */ - "#338", /* 338 */ - "#339", /* 339 */ - "#340", /* 340 */ - "#341", /* 341 */ - "#342", /* 342 */ - "#343", /* 343 */ - "#344", /* 344 */ - "#345", /* 345 */ - "#346", /* 346 */ - "#347", /* 347 */ - "#348", /* 348 */ - "#349" /* 349 */ + "sigwait", /* 330 = sigwait */ + "#331", /* 331 */ + "#332", /* 332 */ + "#333", /* 333 */ + "#334", /* 334 */ + "utrace", /* 335 = utrace */ + "#336", /* 336 */ + "#337", /* 337 */ + "#338", /* 338 */ + "#339", /* 339 */ + "#340", /* 340 = TBD sigprocmask */ + "#341", /* 341 = TBD sigsuspend */ + "#342", /* 342 = TBD sigaction */ + "#343", /* 343 = TBD sigpending */ + "#344", /* 344 = TBD sigreturn */ + "#345", /* 345 = TBD sigtimedwait */ + "#346", /* 346 = TBD sigwaitinfo */ + "#347", /* 347 */ + "#348", /* 348 */ + "#349" /* 349 */ + "audit", /* 350 */ + "auditon", /* 351 */ + "auditsvc", /* 352 */ + "getauid", /* 353 */ + "setauid", /* 354 */ + "getaudit", /* 355 */ + "setaudit", /* 356 */ + "getaudit_addr", /* 357 */ + "setaudit_addr", /* 358 */ + "auditctl", /* 359 */ + "#360", /* 360 */ + "#361", /* 361 */ + "kqueue", /* 362 = kqueue */ + "kevent", /* 363 = 
kevent */ + "#364", /* 364 */ + "#365", /* 365 */ + "#366", /* 366 */ + "#367", /* 367 */ + "#368", /* 368 */ + "#369" /* 369 */ }; diff -urN xnu-344.49/bsd/kern/sysctl_init.c xnu-517/bsd/kern/sysctl_init.c --- xnu-344.49/bsd/kern/sysctl_init.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/sysctl_init.c Sat Oct 25 00:25:25 2003 @@ -82,23 +82,29 @@ extern struct sysctl_oid sysctl__hw_l3cachesize_compat; extern struct sysctl_oid sysctl__hw_tbfrequency_compat; -extern struct sysctl_oid sysctl__hw__cpu_capabilities; - extern struct sysctl_oid sysctl__kern_sysv_shmmax; extern struct sysctl_oid sysctl__kern_sysv_shmmin; extern struct sysctl_oid sysctl__kern_sysv_shmmni; extern struct sysctl_oid sysctl__kern_sysv_shmseg; extern struct sysctl_oid sysctl__kern_sysv_shmall; +extern struct sysctl_oid sysctl__kern_sysv_semmni; +extern struct sysctl_oid sysctl__kern_sysv_semmns; +extern struct sysctl_oid sysctl__kern_sysv_semmnu; +extern struct sysctl_oid sysctl__kern_sysv_semmsl; +extern struct sysctl_oid sysctl__kern_sysv_semume; + extern struct sysctl_oid sysctl__kern_dummy; extern struct sysctl_oid sysctl__kern_ipc_maxsockbuf; extern struct sysctl_oid sysctl__kern_ipc_nmbclusters; extern struct sysctl_oid sysctl__kern_ipc_sockbuf_waste_factor; extern struct sysctl_oid sysctl__kern_ipc_somaxconn; extern struct sysctl_oid sysctl__kern_ipc_sosendminchain; +extern struct sysctl_oid sysctl__kern_ipc_sorecvmincopy; extern struct sysctl_oid sysctl__kern_ipc_maxsockets; extern struct sysctl_oid sysctl__net_inet_icmp_icmplim; extern struct sysctl_oid sysctl__net_inet_icmp_maskrepl; +extern struct sysctl_oid sysctl__net_inet_icmp_timestamp; extern struct sysctl_oid sysctl__net_inet_icmp_bmcastecho; extern struct sysctl_oid sysctl__net_inet_icmp_log_redirect; extern struct sysctl_oid sysctl__net_inet_icmp_drop_redirect; @@ -123,6 +129,7 @@ extern struct sysctl_oid sysctl__net_inet_ip_maxfragpackets; extern struct sysctl_oid sysctl__net_inet_ip_check_interface; extern struct sysctl_oid 
sysctl__net_inet_ip_check_route_selfref; +extern struct sysctl_oid sysctl__net_inet_ip_use_route_genid; #if NGIF > 0 extern struct sysctl_oid sysctl__net_inet_ip_gifttl; #endif @@ -181,6 +188,7 @@ #if TCPDEBUG extern struct sysctl_oid sysctl__net_inet_tcp_tcpconsdebug; #endif +extern struct sysctl_oid sysctl__net_inet_tcp_sockthreshold; extern struct sysctl_oid sysctl__net_inet_udp_log_in_vain; extern struct sysctl_oid sysctl__net_inet_udp_checksum; extern struct sysctl_oid sysctl__net_inet_udp_maxdgram; @@ -246,6 +254,15 @@ extern struct sysctl_oid sysctl__vfs_nfs_diskless_swappath; extern struct sysctl_oid sysctl__vfs_nfs_nfsstats; #endif +extern struct sysctl_oid sysctl__vfs_generic_nfs_client_initialdowndelay; +extern struct sysctl_oid sysctl__vfs_generic_nfs_client_nextdowndelay; +extern struct sysctl_oid sysctl__vfs_generic_nfs_client; +extern struct sysctl_oid sysctl__vfs_generic_nfs; + +extern struct sysctl_oid sysctl__vfs_generic; +extern struct sysctl_oid sysctl__vfs_generic_vfsidlist; +extern struct sysctl_oid sysctl__vfs_generic_ctlbyfsid; +extern struct sysctl_oid sysctl__vfs_generic_noremotehang; extern struct sysctl_oid sysctl__kern_ipc; extern struct sysctl_oid sysctl__kern_sysv; @@ -301,6 +318,7 @@ extern struct sysctl_oid sysctl__net_inet_udp_stats; extern struct sysctl_oid sysctl__kern; extern struct sysctl_oid sysctl__hw; +extern struct sysctl_oid sysctl__machdep; extern struct sysctl_oid sysctl__net; extern struct sysctl_oid sysctl__debug; extern struct sysctl_oid sysctl__vfs; @@ -336,7 +354,6 @@ extern struct sysctl_oid sysctl__net_inet6_ip6_rtmaxcache; extern struct sysctl_oid sysctl__net_inet6_ip6_temppltime; extern struct sysctl_oid sysctl__net_inet6_ip6_tempvltime; -extern struct sysctl_oid sysctl__net_inet6_ip6_auto_on; #if IPV6FIREWALL extern struct sysctl_oid sysctl__net_inet6_ip6_fw; extern struct sysctl_oid sysctl__net_inet6_ip6_fw_debug; @@ -371,6 +388,7 @@ #endif #if IPSEC extern struct sysctl_oid sysctl__net_inet_ipsec; +extern 
struct sysctl_oid sysctl__net_inet_ipsec_esp_port; extern struct sysctl_oid sysctl__net_inet_ipsec_bypass; extern struct sysctl_oid sysctl__net_inet_ipsec_def_policy; extern struct sysctl_oid sysctl__net_inet_ipsec_esp_randpad; @@ -386,6 +404,7 @@ extern struct sysctl_oid sysctl__net_inet_ipsec_stats; extern struct sysctl_oid sysctl__net_key; extern struct sysctl_oid sysctl__net_key_debug; +extern struct sysctl_oid sysctl__net_key_prefered_oldsa; extern struct sysctl_oid sysctl__net_key_spi_trycnt; extern struct sysctl_oid sysctl__net_key_spi_minval; extern struct sysctl_oid sysctl__net_key_spi_maxval; @@ -394,7 +413,9 @@ extern struct sysctl_oid sysctl__net_key_blockacq_count; extern struct sysctl_oid sysctl__net_key_blockacq_lifetime; extern struct sysctl_oid sysctl__net_key_esp_keymin; +extern struct sysctl_oid sysctl__net_key_esp_auth; extern struct sysctl_oid sysctl__net_key_ah_keymin; +extern struct sysctl_oid sysctl__net_key_natt_keepalive_interval; #endif @@ -402,6 +423,7 @@ { &sysctl__kern, &sysctl__hw, + &sysctl__machdep, &sysctl__net, &sysctl__debug, &sysctl__vfs, @@ -423,12 +445,18 @@ ,&sysctl__kern_sysv_shmmni ,&sysctl__kern_sysv_shmseg ,&sysctl__kern_sysv_shmall + ,&sysctl__kern_sysv_semmni + ,&sysctl__kern_sysv_semmns + ,&sysctl__kern_sysv_semmnu + ,&sysctl__kern_sysv_semmsl + ,&sysctl__kern_sysv_semume ,&sysctl__kern_dummy ,&sysctl__kern_ipc_maxsockbuf ,&sysctl__kern_ipc_nmbclusters ,&sysctl__kern_ipc_sockbuf_waste_factor ,&sysctl__kern_ipc_somaxconn ,&sysctl__kern_ipc_sosendminchain + ,&sysctl__kern_ipc_sorecvmincopy ,&sysctl__kern_ipc_maxsockets ,&sysctl__hw_machine @@ -471,10 +499,9 @@ ,&sysctl__hw_l3cachesize_compat ,&sysctl__hw_tbfrequency_compat - ,&sysctl__hw__cpu_capabilities - ,&sysctl__net_inet_icmp_icmplim ,&sysctl__net_inet_icmp_maskrepl + ,&sysctl__net_inet_icmp_timestamp ,&sysctl__net_inet_icmp_bmcastecho ,&sysctl__net_inet_icmp_drop_redirect ,&sysctl__net_inet_icmp_log_redirect @@ -497,6 +524,7 @@ ,&sysctl__net_inet_ip_maxfragpackets 
,&sysctl__net_inet_ip_check_interface ,&sysctl__net_inet_ip_check_route_selfref + ,&sysctl__net_inet_ip_use_route_genid #if NGIF > 0 ,&sysctl__net_inet_ip_gifttl #endif @@ -552,6 +580,7 @@ #if TCPDEBUG ,&sysctl__net_inet_tcp_tcpconsdebug #endif + ,&sysctl__net_inet_tcp_sockthreshold ,&sysctl__net_inet_udp_log_in_vain ,&sysctl__net_inet_udp_checksum ,&sysctl__net_inet_udp_maxdgram @@ -616,6 +645,14 @@ ,&sysctl__vfs_nfs_diskless_swappath ,&sysctl__vfs_nfs_nfsstats #endif + ,&sysctl__vfs_generic + ,&sysctl__vfs_generic_vfsidlist + ,&sysctl__vfs_generic_ctlbyfsid + ,&sysctl__vfs_generic_noremotehang + ,&sysctl__vfs_generic_nfs + ,&sysctl__vfs_generic_nfs_client + ,&sysctl__vfs_generic_nfs_client_initialdowndelay + ,&sysctl__vfs_generic_nfs_client_nextdowndelay ,&sysctl__kern_ipc ,&sysctl__kern_sysv ,&sysctl__net_inet @@ -689,7 +726,6 @@ ,&sysctl__net_inet6_ip6_rtmaxcache ,&sysctl__net_inet6_ip6_temppltime ,&sysctl__net_inet6_ip6_tempvltime - ,&sysctl__net_inet6_ip6_auto_on ,&sysctl__net_inet6_icmp6_rediraccept ,&sysctl__net_inet6_icmp6_redirtimeout ,&sysctl__net_inet6_icmp6_nd6_prune @@ -725,6 +761,7 @@ #if IPSEC ,&sysctl__net_key ,&sysctl__net_key_debug + ,&sysctl__net_key_prefered_oldsa ,&sysctl__net_key_spi_trycnt ,&sysctl__net_key_spi_minval ,&sysctl__net_key_spi_maxval @@ -733,7 +770,9 @@ ,&sysctl__net_key_blockacq_count ,&sysctl__net_key_blockacq_lifetime ,&sysctl__net_key_esp_keymin + ,&sysctl__net_key_esp_auth ,&sysctl__net_key_ah_keymin + ,&sysctl__net_key_natt_keepalive_interval ,&sysctl__net_inet_ipsec ,&sysctl__net_inet_ipsec_stats ,&sysctl__net_inet_ipsec_def_policy @@ -748,6 +787,7 @@ ,&sysctl__net_inet_ipsec_debug ,&sysctl__net_inet_ipsec_esp_randpad ,&sysctl__net_inet_ipsec_bypass + ,&sysctl__net_inet_ipsec_esp_port #endif ,(struct sysctl_oid *) 0 }; diff -urN xnu-344.49/bsd/kern/sysv_msg.c xnu-517/bsd/kern/sysv_msg.c --- xnu-344.49/bsd/kern/sysv_msg.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/sysv_msg.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - 
* Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -48,6 +48,7 @@ #include #include #include +#include static void msginit __P((void *)); SYSINIT(sysv_msg, SI_SUB_SYSV_MSG, SI_ORDER_FIRST, msginit, NULL) @@ -209,6 +210,8 @@ printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr); #endif + AUDIT_ARG(svipc_cmd, cmd); + AUDIT_ARG(svipc_id, msqid); msqid = IPCID_TO_IX(msqid); if (msqid < 0 || msqid >= msginfo.msgmni) { @@ -426,6 +429,7 @@ found: /* Construct the unique msqid */ p->p_retval[0] = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); + AUDIT_ARG(svipc_id, p->p_retval[0]); return(0); } @@ -458,6 +462,7 @@ msgflg); #endif + AUDIT_ARG(svipc_id, msqid); msqid = IPCID_TO_IX(msqid); if (msqid < 0 || msqid >= msginfo.msgmni) { @@ -796,6 +801,7 @@ msgsz, msgtyp, msgflg); #endif + AUDIT_ARG(svipc_id, msqid); msqid = IPCID_TO_IX(msqid); if (msqid < 0 || msqid >= msginfo.msgmni) { diff -urN xnu-344.49/bsd/kern/sysv_sem.c xnu-517/bsd/kern/sysv_sem.c --- xnu-344.49/bsd/kern/sysv_sem.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/sysv_sem.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -43,6 +43,8 @@ #include #include +#include +#include /*#include */ /*#include */ @@ -573,7 +575,7 @@ /* Didn't find the right entry - create it */ if (adjval == 0) return(0); - if (suptr->un_cnt != seminfo.semume) { + if (suptr->un_cnt != limitseminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; @@ -641,13 +643,15 @@ struct semid_ds sbuf; register struct semid_ds *semaptr; + AUDIT_ARG(svipc_cmd, cmd); + AUDIT_ARG(svipc_id, semid); SUBSYSTEM_LOCK_AQUIRE(p); #ifdef SEM_DEBUG printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg); #endif semid = IPCID_TO_IX(semid); - if (semid < 0 || semid >= seminfo.semmsl) + if (semid < 0 || semid >= seminfo.semmni) { #ifdef SEM_DEBUG printf("Invalid semid\n"); @@ -864,7 +868,7 @@ printf("need to allocate an id for the request\n"); #endif if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { - if (nsems <= 0 || nsems > seminfo.semmsl) { + if (nsems <= 0 || nsems > limitseminfo.semmsl) { #ifdef SEM_DEBUG printf("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl); @@ -931,6 +935,7 @@ found: *retval = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm); + AUDIT_ARG(svipc_id, *retval); #ifdef SEM_DEBUG printf("semget is done, returning %d\n", *retval); #endif @@ -963,6 +968,7 @@ int i, j, eval; int do_wakeup, do_undos; + AUDIT_ARG(svipc_id, uap->semid); SUBSYSTEM_LOCK_AQUIRE(p); #ifdef SEM_DEBUG printf("call to semop(%d, 0x%x, %d)\n", semid, sops, nsops); @@ -970,7 +976,7 @@ semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ - if (semid < 0 || semid >= seminfo.semmsl) + if (semid < 0 || semid >= seminfo.semmni) UNLOCK_AND_RETURN(EINVAL); semaptr = &sema[semid]; @@ -1366,4 +1372,49 @@ SUBSYSTEM_LOCK_RELEASE; } +/* (struct sysctl_oid *oidp, void *arg1, int arg2, \ + struct sysctl_req *req) */ +static int +sysctl_seminfo SYSCTL_HANDLER_ARGS +{ + int error = 0; + + error = SYSCTL_OUT(req, arg1, sizeof(int)); + if (error || 
!req->newptr) + return(error); + + SUBSYSTEM_LOCK_AQUIRE(current_proc()); + /* Set the values only if shared memory is not initialised */ + if ((sem == (struct sem *) 0) && + (sema == (struct semid_ds *) 0) && + (semu == (struct semid_ds *) 0) && + (semu_list == (struct sem_undo *) 0)) { + if (error = SYSCTL_IN(req, arg1, sizeof(int))) { + goto out; + } + } else + error = EINVAL; +out: + SUBSYSTEM_LOCK_RELEASE; + return(error); + +} + +/* SYSCTL_NODE(_kern, KERN_SYSV, sysv, CTLFLAG_RW, 0, "SYSV"); */ +extern struct sysctl_oid_list sysctl__kern_sysv_children; +SYSCTL_PROC(_kern_sysv, KSYSV_SEMMNI, semmni, CTLTYPE_INT | CTLFLAG_RW, + &limitseminfo.semmni, 0, &sysctl_seminfo ,"I","semmni"); + +SYSCTL_PROC(_kern_sysv, KSYSV_SEMMNS, semmns, CTLTYPE_INT | CTLFLAG_RW, + &limitseminfo.semmns, 0, &sysctl_seminfo ,"I","semmns"); + +SYSCTL_PROC(_kern_sysv, KSYSV_SEMMNU, semmnu, CTLTYPE_INT | CTLFLAG_RW, + &limitseminfo.semmnu, 0, &sysctl_seminfo ,"I","semmnu"); + +SYSCTL_PROC(_kern_sysv, KSYSV_SEMMSL, semmsl, CTLTYPE_INT | CTLFLAG_RW, + &limitseminfo.semmsl, 0, &sysctl_seminfo ,"I","semmsl"); + +SYSCTL_PROC(_kern_sysv, KSYSV_SEMUNE, semume, CTLTYPE_INT | CTLFLAG_RW, + &limitseminfo.semume, 0, &sysctl_seminfo ,"I","semume"); + diff -urN xnu-344.49/bsd/kern/sysv_shm.c xnu-517/bsd/kern/sysv_shm.c --- xnu-344.49/bsd/kern/sysv_shm.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/sysv_shm.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -120,7 +121,7 @@ static void shm_deallocate_segment __P((struct shmid_ds *)); static int shm_find_segment_by_key __P((key_t)); static struct shmid_ds *shm_find_segment_by_shmid __P((int)); -static int shm_delete_mapping __P((struct proc *, struct shmmap_state *)); +static int shm_delete_mapping __P((struct proc *, struct shmmap_state *, int)); #ifdef __APPLE_API_PRIVATE struct shminfo shminfo = { @@ -173,7 +174,7 @@ char * ptr; shm_handle = shmseg->shm_internal; - size = round_page(shmseg->shm_segsz); + size = round_page_32(shmseg->shm_segsz); mach_destroy_memory_entry(shm_handle->shm_object); FREE((caddr_t)shm_handle, M_SHM); shmseg->shm_internal = NULL; @@ -183,9 +184,10 @@ } static int -shm_delete_mapping(p, shmmap_s) +shm_delete_mapping(p, shmmap_s, deallocate) struct proc *p; struct shmmap_state *shmmap_s; + int deallocate; { struct shmid_ds *shmseg; int segnum, result; @@ -193,10 +195,12 @@ segnum = IPCID_TO_IX(shmmap_s->shmid); shmseg = &shmsegs[segnum]; - size = round_page(shmseg->shm_segsz); + size = round_page_32(shmseg->shm_segsz); + if (deallocate) { result = vm_deallocate(current_map(), shmmap_s->va, size); if (result != KERN_SUCCESS) return EINVAL; + } shmmap_s->shmid = -1; shmseg->shm_dtime = time_second; if ((--shmseg->shm_nattch <= 0) && @@ -220,6 +224,7 @@ struct shmmap_state *shmmap_s; int i; + AUDIT_ARG(svipc_addr, uap->shmaddr); if (!shm_inited) return(EINVAL); shmmap_s = (struct shmmap_state *)p->vm_shm; @@ -231,7 +236,7 @@ break; if (i == shminfo.shmseg) return EINVAL; - return shm_delete_mapping(p, shmmap_s); + return shm_delete_mapping(p, shmmap_s, 1); } #ifndef _SYS_SYSPROTO_H_ @@ -258,6 +263,8 @@ vm_size_t size; kern_return_t rv; + AUDIT_ARG(svipc_id, uap->shmid); + AUDIT_ARG(svipc_addr, uap->shmaddr); if (!shm_inited) return(EINVAL); shmmap_s = (struct shmmap_state *)p->vm_shm; @@ -271,6 +278,8 @@ shmseg = 
shm_find_segment_by_shmid(uap->shmid); if (shmseg == NULL) return EINVAL; + + AUDIT_ARG(svipc_perm, &shmseg->shm_perm); error = ipcperm(cred, &shmseg->shm_perm, (uap->shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); if (error) @@ -282,7 +291,7 @@ } if (i >= shminfo.shmseg) return EMFILE; - size = round_page(shmseg->shm_segsz); + size = round_page_32(shmseg->shm_segsz); prot = VM_PROT_READ; if ((uap->shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; @@ -296,7 +305,7 @@ else return EINVAL; } else { - attach_va = round_page(uap->shmaddr); + attach_va = round_page_32((unsigned int)uap->shmaddr); } shm_handle = shmseg->shm_internal; @@ -413,11 +422,18 @@ struct shmid_ds inbuf; struct shmid_ds *shmseg; + AUDIT_ARG(svipc_cmd, uap->cmd); + AUDIT_ARG(svipc_id, uap->shmid); if (!shm_inited) return(EINVAL); shmseg = shm_find_segment_by_shmid(uap->shmid); if (shmseg == NULL) return EINVAL; + /* XXAUDIT: This is the perms BEFORE any change by this call. This + * may not be what is desired. + */ + AUDIT_ARG(svipc_perm, &shmseg->shm_perm); + switch (uap->cmd) { case IPC_STAT: error = ipcperm(cred, &shmseg->shm_perm, IPC_R); @@ -525,7 +541,7 @@ return EINVAL; if (shm_nused >= shminfo.shmmni) /* any shmids left? */ return ENOSPC; - size = round_page(uap->size); + size = round_page_32(uap->size); if (shm_committed + btoc(size) > shminfo.shmall) return ENOMEM; if (shm_last_free < 0) { @@ -573,6 +589,7 @@ shmseg->shm_ctime = time_second; shm_committed += btoc(size); shm_nused++; + AUDIT_ARG(svipc_perm, &shmseg->shm_perm); if (shmseg->shm_perm.mode & SHMSEG_WANTED) { /* * Somebody else wanted this key while we were asleep. 
Wake @@ -582,6 +599,7 @@ wakeup((caddr_t)shmseg); } *retval = shmid; + AUDIT_ARG(svipc_id, shmid); return 0; out: switch (kret) { @@ -604,6 +622,7 @@ { int segnum, mode, error; + /* Auditing is actually done in shmget_allocate_segment() */ if (!shm_inited) return(EINVAL); @@ -676,7 +695,28 @@ shmmap_s = (struct shmmap_state *)p->vm_shm; for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) if (shmmap_s->shmid != -1) - shm_delete_mapping(p, shmmap_s); + shm_delete_mapping(p, shmmap_s, 1); + FREE((caddr_t)p->vm_shm, M_SHM); + p->vm_shm = NULL; +} + +/* + * shmexec() is like shmexit(), only it doesn't delete the mappings, + * since the old address space has already been destroyed and the new + * one instantiated. Instead, it just does the housekeeping work we + * need to do to keep the System V shared memory subsystem sane. + */ +__private_extern__ void +shmexec(p) + struct proc *p; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->vm_shm; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shm_delete_mapping(p, shmmap_s, 0); FREE((caddr_t)p->vm_shm, M_SHM); p->vm_shm = NULL; } @@ -732,7 +772,7 @@ (shminfo.shmmni != -1) && (shminfo.shmseg != -1) && (shminfo.shmall != -1)) { - shminit(); + shminit(NULL); } } return(0); diff -urN xnu-344.49/bsd/kern/tty_pty.c xnu-517/bsd/kern/tty_pty.c --- xnu-344.49/bsd/kern/tty_pty.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/tty_pty.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -84,14 +84,13 @@ #include #define FREE_BSDSTATIC __private_extern__ -#define d_open_t open_close_fcn_t -#define d_close_t open_close_fcn_t #define d_devtotty_t struct tty ** -#define d_ioctl_t ioctl_fcn_t -#define d_read_t read_write_fcn_t -#define d_write_t read_write_fcn_t -#define d_select_t select_fcn_t + +#ifdef d_stop_t +#undef d_stop_t +#endif typedef void d_stop_t __P((struct tty *tp, int rw)); + #endif /* NeXT */ #ifdef notyet @@ -238,7 +237,7 @@ done: return (0); } -#endif DEVFS +#endif /* DEVFS */ /*ARGSUSED*/ FREE_BSDSTATIC int diff -urN xnu-344.49/bsd/kern/ubc_subr.c xnu-517/bsd/kern/ubc_subr.c --- xnu-344.49/bsd/kern/ubc_subr.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/ubc_subr.c Sat Oct 25 00:25:25 2003 @@ -102,7 +102,7 @@ while (ISSET(uip->ui_flags, UI_BUSY)) { - if (uip->ui_owner == (void *)current_thread()) + if (uip->ui_owner == (void *)current_act()) return (2); SET(uip->ui_flags, UI_WANTED); @@ -111,7 +111,7 @@ if (!UBCINFOEXISTS(vp)) return (0); } - uip->ui_owner = (void *)current_thread(); + uip->ui_owner = (void *)current_act(); SET(uip->ui_flags, UI_BUSY); @@ -321,7 +321,8 @@ memory_object_control_t control; kern_return_t kret; - assert(nsize >= (off_t)0); + if (nsize < (off_t)0) + return (0); if (UBCINVALID(vp)) return (0); @@ -590,6 +591,9 @@ if (UBCINVALID(vp)) return (0); + if (flags & UBC_FOR_PAGEOUT) + return(vp->v_ubcinfo->ui_control); + if ((recursed = ubc_busy(vp)) == 0) return (0); @@ -747,7 +751,7 @@ control = uip->ui_control; assert(control); - vp->v_flag &= ~VHASDIRTY; + cluster_release(vp); vp->v_clen = 0; /* Write the dirty data in the file and discard cached pages */ @@ -854,9 +858,28 @@ int recursed; memory_object_control_t object; +retry: + if (UBCINVALID(vp)) return (0); + ubc_lock(vp); + if (ISSET(vp->v_flag, VUINIT)) { + /* + * other thread is not done initializing this + * yet, wait till it's done and try again + */ + while (ISSET(vp->v_flag, VUINIT)) { + SET(vp->v_flag, 
VUWANT); /* XXX overloaded! */ + ubc_unlock(vp); + (void) tsleep((caddr_t)vp, PINOD, "ubchold", 0); + ubc_lock(vp); + } + ubc_unlock(vp); + goto retry; + } + ubc_unlock(vp); + if ((recursed = ubc_busy(vp)) == 0) { /* must be invalid or dying vnode */ assert(UBCINVALID(vp) || @@ -972,6 +995,12 @@ (uip->ui_refcount == 1) && !uip->ui_mapped) { control = uip->ui_control; assert(control); + + // XXXdbg + if (vp->v_flag & VDELETED) { + ubc_setsize(vp, (off_t)0); + } + CLR(uip->ui_flags, UI_HASOBJREF); kret = memory_object_release_name(control, MEMORY_OBJECT_RESPECT_CACHE); @@ -1102,24 +1131,22 @@ * Returns 1 if file is in use by UBC, 0 if not */ int -ubc_isinuse(struct vnode *vp, int tookref) +ubc_isinuse(struct vnode *vp, int busycount) { - int busycount = tookref ? 2 : 1; - if (!UBCINFOEXISTS(vp)) return (0); - if (tookref == 0) { + if (busycount == 0) { printf("ubc_isinuse: called without a valid reference" ": v_tag = %d\v", vp->v_tag); vprint("ubc_isinuse", vp); return (0); } - if (vp->v_usecount > busycount) + if (vp->v_usecount > busycount+1) return (1); - if ((vp->v_usecount == busycount) + if ((vp->v_usecount == busycount+1) && (vp->v_ubcinfo->ui_mapped == 1)) return (1); else @@ -1166,7 +1193,7 @@ struct vnode *vp, off_t f_offset, int ops, - vm_offset_t *phys_entryp, + ppnum_t *phys_entryp, int *flagsp) { memory_object_control_t control; @@ -1182,6 +1209,42 @@ flagsp)); } +__private_extern__ kern_return_t +ubc_page_op_with_control( + memory_object_control_t control, + off_t f_offset, + int ops, + ppnum_t *phys_entryp, + int *flagsp) +{ + return (memory_object_page_op(control, + (memory_object_offset_t)f_offset, + ops, + phys_entryp, + flagsp)); +} + +kern_return_t +ubc_range_op( + struct vnode *vp, + off_t f_offset_beg, + off_t f_offset_end, + int ops, + int *range) +{ + memory_object_control_t control; + + control = ubc_getobject(vp, UBC_FLAGS_NONE); + if (control == MEMORY_OBJECT_CONTROL_NULL) + return KERN_INVALID_ARGUMENT; + + return 
(memory_object_range_op(control, + (memory_object_offset_t)f_offset_beg, + (memory_object_offset_t)f_offset_end, + ops, + range)); +} + kern_return_t ubc_create_upl( struct vnode *vp, @@ -1192,18 +1255,29 @@ int uplflags) { memory_object_control_t control; - int count; - off_t file_offset; - kern_return_t kr; + int count; + int ubcflags; + off_t file_offset; + kern_return_t kr; if (bufsize & 0xfff) return KERN_INVALID_ARGUMENT; - control = ubc_getobject(vp, UBC_FLAGS_NONE); + if (uplflags & UPL_FOR_PAGEOUT) { + uplflags &= ~UPL_FOR_PAGEOUT; + ubcflags = UBC_FOR_PAGEOUT; + } else + ubcflags = UBC_FLAGS_NONE; + + control = ubc_getobject(vp, ubcflags); if (control == MEMORY_OBJECT_CONTROL_NULL) return KERN_INVALID_ARGUMENT; - uplflags |= (UPL_NO_SYNC|UPL_CLEAN_IN_PLACE|UPL_SET_INTERNAL); + if (uplflags & UPL_WILL_BE_DUMPED) { + uplflags &= ~UPL_WILL_BE_DUMPED; + uplflags |= (UPL_NO_SYNC|UPL_SET_INTERNAL); + } else + uplflags |= (UPL_NO_SYNC|UPL_CLEAN_IN_PLACE|UPL_SET_INTERNAL); count = 0; kr = memory_object_upl_request(control, f_offset, bufsize, uplp, NULL, &count, uplflags); diff -urN xnu-344.49/bsd/kern/uipc_mbuf.c xnu-517/bsd/kern/uipc_mbuf.c --- xnu-344.49/bsd/kern/uipc_mbuf.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/uipc_mbuf.c Sat Oct 25 00:25:25 2003 @@ -82,10 +82,22 @@ #include #include +#include + #define _MCLREF(p) (++mclrefcnt[mtocl(p)]) #define _MCLUNREF(p) (--mclrefcnt[mtocl(p)] == 0) - -extern kernel_pmap; /* The kernel's pmap */ +#define _M_CLEAR_PKTHDR(mbuf_ptr) (mbuf_ptr)->m_pkthdr.rcvif = NULL; \ + (mbuf_ptr)->m_pkthdr.len = 0; \ + (mbuf_ptr)->m_pkthdr.header = NULL; \ + (mbuf_ptr)->m_pkthdr.csum_flags = 0; \ + (mbuf_ptr)->m_pkthdr.csum_data = 0; \ + (mbuf_ptr)->m_pkthdr.aux = (struct mbuf*)NULL; \ + (mbuf_ptr)->m_pkthdr.reserved1 = NULL; \ + (mbuf_ptr)->m_pkthdr.reserved2 = NULL; + +extern pmap_t kernel_pmap; /* The kernel's pmap */ +/* kernel translater */ +extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); decl_simple_lock_data(, 
mbuf_slock); struct mbuf *mfree; /* mbuf free list */ @@ -95,6 +107,7 @@ extern int nmbclusters; /* max number of mapped clusters */ short *mclrefcnt; /* mapped cluster reference counts */ int *mcl_paddr; +static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */ union mcluster *mclfree; /* mapped cluster free list */ int max_linkhdr; /* largest link-level header */ int max_protohdr; /* largest protocol header */ @@ -112,9 +125,11 @@ /* The number of cluster mbufs that are allocated, to start. */ #define MINCL max(16, 2) -extern int dlil_input_thread_wakeup; -extern int dlil_expand_mcl; -extern int dlil_initialized; +static int mbuf_expand_thread_wakeup = 0; +static int mbuf_expand_mcl = 0; +static int mbuf_expand_thread_initialized = 0; + +static void mbuf_expand_thread_init(void); #if 0 static int mfree_munge = 0; @@ -168,10 +183,11 @@ { int s,m; int initmcl = 32; + int mcl_pages; if (nclpp) return; - nclpp = round_page(MCLBYTES) / MCLBYTES; /* see mbufgc() */ + nclpp = round_page_32(MCLBYTES) / MCLBYTES; /* see mbufgc() */ if (nclpp < 1) nclpp = 1; MBUF_LOCKINIT(); // NETISR_LOCKINIT(); @@ -191,11 +207,14 @@ for (m = 0; m < nmbclusters; m++) mclrefcnt[m] = -1; - MALLOC(mcl_paddr, int *, (nmbclusters/(PAGE_SIZE/CLBYTES)) * sizeof (int), - M_TEMP, M_WAITOK); + /* Calculate the number of pages assigned to the cluster pool */ + mcl_pages = nmbclusters/(PAGE_SIZE/CLBYTES); + MALLOC(mcl_paddr, int *, mcl_pages * sizeof(int), M_TEMP, M_WAITOK); if (mcl_paddr == 0) panic("mbinit1"); - bzero((char *)mcl_paddr, (nmbclusters/(PAGE_SIZE/CLBYTES)) * sizeof (int)); + /* Register with the I/O Bus mapper */ + mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); + bzero((char *)mcl_paddr, mcl_pages * sizeof(int)); embutl = (union mcluster *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); @@ -204,6 +223,9 @@ if (m_clalloc(max(PAGE_SIZE/CLBYTES, 1) * initmcl, M_WAIT) == 0) goto bad; MBUF_UNLOCK(); + + (void) kernel_thread(kernel_task, mbuf_expand_thread_init); + 
return; bad: panic("mbinit"); @@ -236,11 +258,11 @@ if (ncl < i) ncl = i; - size = round_page(ncl * MCLBYTES); + size = round_page_32(ncl * MCLBYTES); mcl = (union mcluster *)kmem_mb_alloc(mb_map, size); if (mcl == 0 && ncl > 1) { - size = round_page(MCLBYTES); /* Try for 1 if failed */ + size = round_page_32(MCLBYTES); /* Try for 1 if failed */ mcl = (union mcluster *)kmem_mb_alloc(mb_map, size); } @@ -250,8 +272,19 @@ for (i = 0; i < ncl; i++) { if (++mclrefcnt[mtocl(mcl)] != 0) panic("m_clalloc already there"); - if (((int)mcl & PAGE_MASK) == 0) - mcl_paddr[((char *)mcl - (char *)mbutl)/PAGE_SIZE] = pmap_extract(kernel_pmap, (char *)mcl); + if (((int)mcl & PAGE_MASK) == 0) { + ppnum_t offset = ((char *)mcl - (char *)mbutl)/PAGE_SIZE; + ppnum_t new_page = pmap_find_phys(kernel_pmap, (vm_address_t) mcl); + + /* + * In the case of no mapper being available + * the following code nops and returns the + * input page, if there is a mapper the I/O + * page appropriate is returned. + */ + new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page); + mcl_paddr[offset] = new_page << 12; + } mcl->mcl_next = mclfree; mclfree = mcl++; @@ -268,16 +301,14 @@ * pool or if the number of free clusters is less than requested. 
*/ if ((nowait == M_DONTWAIT) && (i > 0 || ncl >= mbstat.m_clfree)) { - dlil_expand_mcl = 1; - if (dlil_initialized) - wakeup((caddr_t)&dlil_input_thread_wakeup); + mbuf_expand_mcl = 1; + if (mbuf_expand_thread_initialized) + wakeup((caddr_t)&mbuf_expand_thread_wakeup); } if (mbstat.m_clfree >= ncl) return 1; - mbstat.m_drops++; - return 0; } @@ -345,37 +376,39 @@ break; MBUF_LOCK(); wait = m_want++; - dlil_expand_mcl = 1; + mbuf_expand_mcl = 1; if (wait == 0) mbstat.m_drain++; else mbstat.m_wait++; MBUF_UNLOCK(); - if (dlil_initialized) - wakeup((caddr_t)&dlil_input_thread_wakeup); + if (mbuf_expand_thread_initialized) + wakeup((caddr_t)&mbuf_expand_thread_wakeup); /* - * Grab network funnel because m_reclaim calls into the + * Need to be inside network funnel for m_reclaim because it calls into the * socket domains and tsleep end-up calling splhigh */ fnl = thread_funnel_get(); - if (fnl && (fnl == kernel_flock)) { - fnl_switch = 1; - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - } else - funnel_state = thread_funnel_set(network_flock, TRUE); - if (wait == 0) { + if (wait == 0 && fnl == network_flock) { m_reclaim(); + } else if (fnl != THR_FUNNEL_NULL) { + /* Sleep with a small timeout as insurance */ + (void) tsleep((caddr_t)&mfree, PZERO-1, "m_retry", hz); } else { - /* Sleep with a small timeout as insurance */ - (void) tsleep((caddr_t)&mfree, PZERO-1, "m_retry", hz); + /* We are called from a non-BSD context: use mach primitives */ + u_int64_t abstime = 0; + + assert_wait((event_t)&mfree, THREAD_UNINT); + clock_interval_to_deadline(hz, NSEC_PER_SEC / hz, &abstime); + thread_set_timer_deadline(abstime); + if (thread_block(THREAD_CONTINUE_NULL) != THREAD_TIMED_OUT) + thread_cancel_timer(); } - if (fnl_switch) - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - else - thread_funnel_set(network_flock, funnel_state); } + if (m == 0) + mbstat.m_drops++; return (m); } @@ -391,14 +424,7 @@ if (m = m_retry(canwait, type)) { m->m_flags |= M_PKTHDR; 
m->m_data = m->m_pktdat; - m->m_pkthdr.rcvif = NULL; - m->m_pkthdr.len = 0; - m->m_pkthdr.header = NULL; - m->m_pkthdr.csum_flags = 0; - m->m_pkthdr.csum_data = 0; - m->m_pkthdr.aux = (struct mbuf *)NULL; - m->m_pkthdr.reserved1 = NULL; - m->m_pkthdr.reserved2 = NULL; + _M_CLEAR_PKTHDR(m); } return (m); } @@ -450,13 +476,7 @@ m->m_type = type; m->m_data = m->m_pktdat; m->m_flags = M_PKTHDR; - m->m_pkthdr.rcvif = NULL; - m->m_pkthdr.header = NULL; - m->m_pkthdr.csum_flags = 0; - m->m_pkthdr.csum_data = 0; - m->m_pkthdr.aux = (struct mbuf *)NULL; - m->m_pkthdr.reserved1 = NULL; - m->m_pkthdr.reserved2 = NULL; + _M_CLEAR_PKTHDR(m) } else m = m_retryhdr(nowait, type); @@ -564,6 +584,8 @@ ++mclrefcnt[mtocl(p)]; mbstat.m_clfree--; mclfree = ((union mcluster *)p)->mcl_next; + } else { + mbstat.m_drops++; } MBUF_UNLOCK(); @@ -630,14 +652,7 @@ m->m_type = MT_DATA; m->m_data = m->m_ext.ext_buf; m->m_flags = M_PKTHDR | M_EXT; - m->m_pkthdr.len = 0; - m->m_pkthdr.rcvif = NULL; - m->m_pkthdr.header = NULL; - m->m_pkthdr.csum_data = 0; - m->m_pkthdr.csum_flags = 0; - m->m_pkthdr.aux = (struct mbuf *)NULL; - m->m_pkthdr.reserved1 = 0; - m->m_pkthdr.reserved2 = 0; + _M_CLEAR_PKTHDR(m) m->m_ext.ext_free = 0; m->m_ext.ext_size = MCLBYTES; m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = @@ -705,14 +720,7 @@ m->m_flags = M_EXT; else { m->m_flags = M_PKTHDR | M_EXT; - m->m_pkthdr.len = 0; - m->m_pkthdr.rcvif = NULL; - m->m_pkthdr.header = NULL; - m->m_pkthdr.csum_flags = 0; - m->m_pkthdr.csum_data = 0; - m->m_pkthdr.aux = (struct mbuf *)NULL; - m->m_pkthdr.reserved1 = NULL; - m->m_pkthdr.reserved2 = NULL; + _M_CLEAR_PKTHDR(m); num_with_pkthdrs--; } @@ -778,14 +786,7 @@ m->m_type = MT_DATA; m->m_flags = M_PKTHDR; m->m_data = m->m_pktdat; - m->m_pkthdr.len = 0; - m->m_pkthdr.rcvif = NULL; - m->m_pkthdr.header = NULL; - m->m_pkthdr.csum_flags = 0; - m->m_pkthdr.csum_data = 0; - m->m_pkthdr.aux = (struct mbuf *)NULL; - m->m_pkthdr.reserved1 = NULL; - m->m_pkthdr.reserved2 = NULL; 
+ _M_CLEAR_PKTHDR(m); } else { @@ -835,11 +836,13 @@ if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.aux) { /* * Treat the current m as the nextpkt and set m - * to the aux data. This lets us free the aux - * data in this loop without having to call - * m_freem recursively, which wouldn't work - * because we've still got the lock. + * to the aux data. Preserve nextpkt in m->m_nextpkt. + * This lets us free the aux data in this loop + * without having to call m_freem recursively, + * which wouldn't work because we've still got + * the lock. */ + m->m_nextpkt = nextpkt; nextpkt = m; m = nextpkt->m_pkthdr.aux; nextpkt->m_pkthdr.aux = NULL; @@ -1154,14 +1157,7 @@ } else { n->m_data = n->m_pktdat; n->m_flags = M_PKTHDR; - n->m_pkthdr.len = 0; - n->m_pkthdr.rcvif = NULL; - n->m_pkthdr.header = NULL; - n->m_pkthdr.csum_flags = 0; - n->m_pkthdr.csum_data = 0; - n->m_pkthdr.aux = (struct mbuf *)NULL; - n->m_pkthdr.reserved1 = NULL; - n->m_pkthdr.reserved2 = NULL; + _M_CLEAR_PKTHDR(n); } } else { MBUF_UNLOCK(); @@ -1810,54 +1806,29 @@ panic("mget MCHECK: m_type=%x m=%x", m->m_type, m); } -#if 0 -#include - -static int mhog_num = 0; -static struct mbuf *mhog_chain = 0; -static int mhog_wait = 1; - -static int -sysctl_mhog_num SYSCTL_HANDLER_ARGS +void +mbuf_expand_thread(void) { - int old = mhog_num; - int error; - - error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); - if (!error && req->newptr) { - int i; - struct mbuf *m; - - if (mhog_chain) { - m_freem(mhog_chain); - mhog_chain = 0; - } - - for (i = 0; i < mhog_num; i++) { - MGETHDR(m, mhog_wait ? M_WAIT : M_DONTWAIT, MT_DATA); - if (m == 0) - break; - - MCLGET(m, mhog_wait ? 
M_WAIT : M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - m = 0; - break; - } - m->m_next = mhog_chain; - mhog_chain = m; + while (1) { + int expand_mcl; + MBUF_LOCK(); + expand_mcl = mbuf_expand_mcl; + mbuf_expand_mcl = 0; + MBUF_UNLOCK(); + if (expand_mcl) { + caddr_t p; + MCLALLOC(p, M_WAIT); + if (p) MCLFREE(p); } - mhog_num = i; + assert_wait(&mbuf_expand_thread_wakeup, THREAD_UNINT); + (void) thread_block(mbuf_expand_thread); } - - return error; } -SYSCTL_NODE(_kern_ipc, OID_AUTO, mhog, CTLFLAG_RW, 0, "mbuf hog"); - -SYSCTL_PROC(_kern_ipc_mhog, OID_AUTO, cluster, CTLTYPE_INT|CTLFLAG_RW, - &mhog_num, 0, &sysctl_mhog_num, "I", ""); -SYSCTL_INT(_kern_ipc_mhog, OID_AUTO, wait, CTLFLAG_RW, &mhog_wait, - 0, ""); -#endif +void +mbuf_expand_thread_init(void) +{ + mbuf_expand_thread_initialized++; + mbuf_expand_thread(); +} diff -urN xnu-344.49/bsd/kern/uipc_mbuf2.c xnu-517/bsd/kern/uipc_mbuf2.c --- xnu-344.49/bsd/kern/uipc_mbuf2.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/uipc_mbuf2.c Sat Oct 25 00:25:25 2003 @@ -382,7 +382,7 @@ if (n) return n; - MGET(n, M_WAIT, m->m_type); + MGET(n, M_DONTWAIT, m->m_type); if (n == NULL) return NULL; diff -urN xnu-344.49/bsd/kern/uipc_socket.c xnu-517/bsd/kern/uipc_socket.c --- xnu-344.49/bsd/kern/uipc_socket.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/uipc_socket.c Sat Oct 25 00:25:25 2003 @@ -62,12 +62,15 @@ #include #include +#include #include +#include #include #include #include #include #include +#include #include #include #include @@ -98,6 +101,19 @@ #include +static void filt_sordetach(struct knote *kn); +static int filt_soread(struct knote *kn, long hint); +static void filt_sowdetach(struct knote *kn); +static int filt_sowrite(struct knote *kn, long hint); +static int filt_solisten(struct knote *kn, long hint); + +static struct filterops solisten_filtops = + { 1, NULL, filt_sordetach, filt_solisten }; +static struct filterops soread_filtops = + { 1, NULL, filt_sordetach, filt_soread }; +static struct 
filterops sowrite_filtops = + { 1, NULL, filt_sowdetach, filt_sowrite }; + int socket_debug = 0; int socket_zone = M_SOCKET; so_gen_t so_gencnt; /* generation count for sockets */ @@ -123,8 +139,11 @@ /* Should we get a maximum also ??? */ static int sosendmaxchain = 65536; static int sosendminchain = 16384; +static int sorecvmincopy = 16384; SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy, + 0, ""); void so_cache_timer(); struct mbuf *m_getpackets(int, int, int); @@ -366,7 +385,9 @@ register struct protosw *prp; register struct socket *so; register int error = 0; - +#if TCPDEBUG + extern int tcpconsdebug; +#endif if (proto) prp = pffindproto(dom, proto, type); else @@ -414,6 +435,11 @@ #endif error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); if (error) { + /* + * Warning: + * If so_pcb is not zero, the socket will be leaked, + * so protocol attachment handler must be coded carefuly + */ so->so_state |= SS_NOFDREF; sofree(so); return (error); @@ -422,7 +448,12 @@ prp->pr_domain->dom_refs++; so->so_rcv.sb_so = so->so_snd.sb_so = so; TAILQ_INIT(&so->so_evlist); +#if TCPDEBUG + if (tcpconsdebug == 2) + so->so_options |= SO_DEBUG; +#endif #endif + *aso = so; return (0); } @@ -968,7 +999,7 @@ if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) snderr(EMSGSIZE); - if (space < resid + clen && uio && + if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if (so->so_state & SS_NBIO) snderr(EWOULDBLOCK); @@ -1209,15 +1240,20 @@ struct mbuf **controlp; int *flagsp; { - register struct mbuf *m, **mp; - register struct mbuf *free_list, *ml; + register struct mbuf *m, **mp, *ml; register int flags, len, error, s, offset; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; int orig_resid = uio->uio_resid; struct kextcb *kp; - + volatile struct mbuf *free_list; + volatile int 
delayed_copy_len; + int can_delay; + int need_event; + struct proc *p = current_proc(); + + KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio->uio_resid, @@ -1231,8 +1267,10 @@ error = (*kp->e_soif->sf_soreceive)(so, psa, &uio, mp0, controlp, flagsp, kp); - if (error) + if (error) { + KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0); return((error == EJUSTRETURN) ? 0 : error); + } } kp = kp->e_next; } @@ -1256,8 +1294,10 @@ (so->so_options & SO_OOBINLINE) == 0 && (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) { m = m_get(M_WAIT, MT_DATA); - if (m == NULL) + if (m == NULL) { + KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, ENOBUFS,0,0,0,0); return (ENOBUFS); + } error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; @@ -1292,6 +1332,9 @@ if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreqs->pru_rcvd)(so, 0); + + free_list = (struct mbuf *)0; + delayed_copy_len = 0; restart: error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) { @@ -1314,9 +1357,10 @@ */ if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio->uio_resid) && - (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { + KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1")); if (so->so_error) { if (m) @@ -1351,6 +1395,7 @@ sbunlock(&so->so_rcv); if (socket_debug) printf("Waiting for socket data\n"); + error = sbwait(&so->so_rcv); if (socket_debug) printf("SORECEIVE - sbwait returned %d\n", error); @@ -1365,7 +1410,16 @@ #ifndef __APPLE__ if (uio->uio_procp) uio->uio_procp->p_stats->p_ru.ru_msgrcv++; -#endif +#else /* __APPLE__ */ + /* + * 2207985 + * This should be uio->uio-procp; however, some callers of this + * function use auto variables with stack garbage, and fail to + * fill out the uio structure properly. 
+ */ + if (p) + p->p_stats->p_ru.ru_msgrcv++; +#endif /* __APPLE__ */ nextrecord = m->m_nextpkt; if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) { KASSERT(m->m_type == MT_SONAME, ("receive 1a")); @@ -1417,10 +1471,15 @@ moff = 0; offset = 0; - free_list = m; - ml = (struct mbuf *)0; + if (!(flags & MSG_PEEK) && uio->uio_resid > sorecvmincopy) + can_delay = 1; + else + can_delay = 0; + + need_event = 0; - while (m && uio->uio_resid > 0 && error == 0) { + + while (m && (uio->uio_resid - delayed_copy_len) > 0 && error == 0) { if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) break; @@ -1447,7 +1506,7 @@ } #endif so->so_state &= ~SS_RCVATMARK; - len = uio->uio_resid; + len = uio->uio_resid - delayed_copy_len; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) @@ -1461,13 +1520,48 @@ * block interrupts again. */ if (mp == 0) { - splx(s); - error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); - s = splnet(); - if (error) - goto release; + if (can_delay && len == m->m_len) { + /* + * only delay the copy if we're consuming the + * mbuf and we're NOT in MSG_PEEK mode + * and we have enough data to make it worthwile + * to drop and retake the funnel... can_delay + * reflects the state of the 2 latter constraints + * moff should always be zero in these cases + */ + delayed_copy_len += len; + } else { + splx(s); + + if (delayed_copy_len) { + error = sodelayed_copy(uio, &free_list, &delayed_copy_len); + + if (error) { + s = splnet(); + goto release; + } + if (m != so->so_rcv.sb_mb) { + /* + * can only get here if MSG_PEEK is not set + * therefore, m should point at the head of the rcv queue... + * if it doesn't, it means something drastically changed + * while we were out from behind the funnel in sodelayed_copy... + * perhaps a RST on the stream... in any event, the stream has + * been interrupted... it's probably best just to return + * whatever data we've moved and let the caller sort it out... 
+ */ + break; + } + } + error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + + s = splnet(); + if (error) + goto release; + } } else uio->uio_resid -= len; + if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; @@ -1477,6 +1571,7 @@ } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); + if (mp) { *mp = m; mp = &m->m_next; @@ -1484,7 +1579,9 @@ *mp = (struct mbuf *)0; } else { m->m_nextpkt = 0; - if (ml != 0) + if (free_list == NULL) + free_list = m; + else ml->m_next = m; ml = m; so->so_rcv.sb_mb = m = m->m_next; @@ -1509,7 +1606,11 @@ so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_state |= SS_RCVATMARK; - postevent(so, 0, EV_OOB); + /* + * delay posting the actual event until after + * any delayed copy processing has finished + */ + need_event = 1; break; } } else { @@ -1521,38 +1622,49 @@ if (flags & MSG_EOR) break; /* - * If the MSG_WAITALL flag is set (for non-atomic socket), + * If the MSG_WAITALL or MSG_WAITSTREAM flag is set (for non-atomic socket), * we must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return * with a short count but without error. * Keep sockbuf locked against other readers. */ - while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 && (uio->uio_resid - delayed_copy_len) > 0 && !sosendallatonce(so) && !nextrecord) { if (so->so_error || so->so_state & SS_CANTRCVMORE) - break; + goto release; - if (ml) { - m_freem_list(free_list); + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreqs->pru_rcvd)(so, flags); + if (sbwait(&so->so_rcv)) { + error = 0; + goto release; } - error = sbwait(&so->so_rcv); - if (error) { - sbunlock(&so->so_rcv); - splx(s); - KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, 0,0,0,0,0); - return (0); + /* + * have to wait until after we get back from the sbwait to do the copy because + * we will drop the funnel if we have enough data that has been delayed... 
by dropping + * the funnel we open up a window allowing the netisr thread to process the incoming packets + * and to change the state of this socket... we're issuing the sbwait because + * the socket is empty and we're expecting the netisr thread to wake us up when more + * packets arrive... if we allow that processing to happen and then sbwait, we + * could stall forever with packets sitting in the socket if no further packets + * arrive from the remote side. + * + * we want to copy before we've collected all the data to satisfy this request to + * allow the copy to overlap the incoming packet processing on an MP system + */ + if (delayed_copy_len > sorecvmincopy && (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) { + + error = sodelayed_copy(uio, &free_list, &delayed_copy_len); + + if (error) + goto release; } m = so->so_rcv.sb_mb; if (m) { nextrecord = m->m_nextpkt; - free_list = m; } - ml = (struct mbuf *)0; } } - if (ml) { - m_freem_list(free_list); - } if (m && pr->pr_flags & PR_ATOMIC) { #ifdef __APPLE__ @@ -1576,6 +1688,19 @@ #ifdef __APPLE__ if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) flags |= MSG_HAVEMORE; + + if (delayed_copy_len) { + error = sodelayed_copy(uio, &free_list, &delayed_copy_len); + + if (error) + goto release; + } + if (free_list) { + m_freem_list((struct mbuf *)free_list); + free_list = (struct mbuf *)0; + } + if (need_event) + postevent(so, 0, EV_OOB); #endif if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { @@ -1587,6 +1712,12 @@ if (flagsp) *flagsp |= flags; release: + if (delayed_copy_len) { + error = sodelayed_copy(uio, &free_list, &delayed_copy_len); + } + if (free_list) { + m_freem_list((struct mbuf *)free_list); + } sbunlock(&so->so_rcv); splx(s); @@ -1600,6 +1731,38 @@ return (error); } + +int sodelayed_copy(struct uio *uio, struct mbuf **free_list, int *resid) +{ + int error = 0; + boolean_t dropped_funnel = FALSE; + struct mbuf *m; + + m = 
*free_list; + + if (*resid >= sorecvmincopy) { + dropped_funnel = TRUE; + + (void)thread_funnel_set(network_flock, FALSE); + } + while (m && error == 0) { + + error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio); + + m = m->m_next; + } + m_freem_list(*free_list); + + *free_list = (struct mbuf *)NULL; + *resid = 0; + + if (dropped_funnel == TRUE) + (void)thread_funnel_set(network_flock, TRUE); + + return (error); +} + + int soshutdown(so, how) register struct socket *so; @@ -1615,8 +1778,10 @@ while (kp) { if (kp->e_soif && kp->e_soif->sf_soshutdown) { ret = (*kp->e_soif->sf_soshutdown)(so, how, kp); - if (ret) + if (ret) { + KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0); return((ret == EJUSTRETURN) ? 0 : ret); + } } kp = kp->e_next; } @@ -1665,12 +1830,10 @@ #endif asb = *sb; bzero((caddr_t)sb, sizeof (*sb)); -#ifndef __APPLE__ if (asb.sb_flags & SB_KNOTE) { sb->sb_sel.si_note = asb.sb_sel.si_note; sb->sb_flags = SB_KNOTE; } -#endif splx(s); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) (*pr->pr_domain->dom_dispose)(asb.sb_mb); @@ -1887,6 +2050,18 @@ break; + case SO_NOADDRERR: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + if (optval) + so->so_flags |= SOF_NOADDRAVAIL; + else + so->so_flags &= ~SOF_NOADDRAVAIL; + + break; + default: error = ENOPROTOOPT; break; @@ -2060,6 +2235,10 @@ optval = (so->so_flags & SOF_NOSIGPIPE); goto integer; + case SO_NOADDRERR: + optval = (so->so_flags & SOF_NOADDRAVAIL); + goto integer; + default: error = ENOPROTOOPT; break; @@ -2297,3 +2476,115 @@ splx(s); return (revents); } + + +int +soo_kqfilter(struct file *fp, struct knote *kn, struct proc *p) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + struct sockbuf *sb; + int s; + + switch (kn->kn_filter) { + case EVFILT_READ: + if (so->so_options & SO_ACCEPTCONN) + kn->kn_fop = &solisten_filtops; + else + kn->kn_fop = &soread_filtops; + sb = &so->so_rcv; + break; + case EVFILT_WRITE: + 
kn->kn_fop = &sowrite_filtops; + sb = &so->so_snd; + break; + default: + return (1); + } + + if (sb->sb_sel.si_flags & SI_INITED) + return (1); + + s = splnet(); + if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn)) + sb->sb_flags |= SB_KNOTE; + splx(s); + return (0); +} + +static void +filt_sordetach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + int s = splnet(); + + if (so->so_rcv.sb_flags & SB_KNOTE && + !(so->so_rcv.sb_sel.si_flags & SI_INITED)) + if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) + so->so_rcv.sb_flags &= ~SB_KNOTE; + splx(s); +} + +/*ARGSUSED*/ +static int +filt_soread(struct knote *kn, long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = so->so_rcv.sb_cc; + if (so->so_state & SS_CANTRCVMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_rcv.sb_lowat); +} + +static void +filt_sowdetach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + int s = splnet(); + + if(so->so_snd.sb_flags & SB_KNOTE && + !(so->so_snd.sb_sel.si_flags & SI_INITED)) + if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) + so->so_snd.sb_flags &= ~SB_KNOTE; + splx(s); +} + +/*ARGSUSED*/ +static int +filt_sowrite(struct knote *kn, long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = sbspace(&so->so_snd); + if (so->so_state & SS_CANTSENDMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (((so->so_state & SS_ISCONNECTED) == 0) && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) + return (0); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_snd.sb_lowat); +} + +/*ARGSUSED*/ +static int +filt_solisten(struct 
knote *kn, long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = so->so_qlen; + return (! TAILQ_EMPTY(&so->so_comp)); +} + diff -urN xnu-344.49/bsd/kern/uipc_socket2.c xnu-517/bsd/kern/uipc_socket2.c --- xnu-344.49/bsd/kern/uipc_socket2.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/uipc_socket2.c Sat Oct 25 00:25:25 2003 @@ -456,6 +456,9 @@ } if (sb->sb_flags & SB_UPCALL) (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); + if (sb->sb_flags & SB_KNOTE && + !(sb->sb_sel.si_flags & SI_INITED)) + KNOTE(&sb->sb_sel.si_note, 0); } /* @@ -607,8 +610,10 @@ kp = sotokextcb(sbtoso(sb)); while (kp) { if (kp->e_sout && kp->e_sout->su_sbappend) { - if ((*kp->e_sout->su_sbappend)(sb, m, kp)) + if ((*kp->e_sout->su_sbappend)(sb, m, kp)) { + KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, kp, 0, 0); return; + } } kp = kp->e_next; } @@ -619,6 +624,7 @@ do { if (n->m_flags & M_EOR) { sbappendrecord(sb, m); /* XXXXXX!!!! */ + KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0); return; } } while (n->m_next && (n = n->m_next)); @@ -945,8 +951,7 @@ kp = kp->e_next; } - if (sb->sb_flags & SB_LOCK) - sb_lock(sb); + (void)sblock(sb, M_WAIT); while (sb->sb_mbcnt) { /* * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: @@ -958,6 +963,9 @@ } if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); + + sbunlock(sb); + postevent(0, sb, EV_RWBYTES); } @@ -986,8 +994,10 @@ kp = sotokextcb(sbtoso(sb)); while (kp) { if (kp->e_sout && kp->e_sout->su_sbdrop) { - if ((*kp->e_sout->su_sbdrop)(sb, len, kp)) + if ((*kp->e_sout->su_sbdrop)(sb, len, kp)) { + KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, len, kp, 0, 0); return; + } } kp = kp->e_next; } @@ -1278,7 +1288,7 @@ int sb_notify(struct sockbuf *sb) { - return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL)) != 0); + return ((sb->sb_flags & 
(SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0); } /* diff -urN xnu-344.49/bsd/kern/uipc_syscalls.c xnu-517/bsd/kern/uipc_syscalls.c --- xnu-344.49/bsd/kern/uipc_syscalls.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/uipc_syscalls.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include #endif #include +#include #include @@ -96,29 +97,29 @@ #endif struct getsockname_args { - int fdes; + int fdes; caddr_t asa; - int *alen; + socklen_t *alen; }; struct getsockopt_args { - int s; - int level; - int name; + int s; + int level; + int name; caddr_t val; - int *avalsize; + socklen_t *avalsize; } ; struct accept_args { - int s; - caddr_t name; - int *anamelen; + int s; + caddr_t name; + socklen_t *anamelen; }; struct getpeername_args { - int fdes; - caddr_t asa; - int *alen; + int fdes; + caddr_t asa; + socklen_t *alen; }; @@ -172,6 +173,7 @@ struct file *fp; int fd, error; + AUDIT_ARG(socket, uap->domain, uap->type, uap->protocol); thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); error = falloc(p, &fp, &fd); thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); @@ -196,9 +198,9 @@ } struct bind_args { - int s; - caddr_t name; - int namelen; + int s; + caddr_t name; + socklen_t namelen; }; /* ARGSUSED */ @@ -212,13 +214,18 @@ struct sockaddr *sa; int error; + AUDIT_ARG(fd, uap->s); error = getsock(p->p_fd, uap->s, &fp); if (error) return (error); error = getsockaddr(&sa, uap->name, uap->namelen); if (error) return (error); - error = sobind((struct socket *)fp->f_data, sa); + AUDIT_ARG(sockaddr, p, sa); + if (fp->f_data != NULL) + error = sobind((struct socket *)fp->f_data, sa); + else + error = EBADF; FREE(sa, M_SONAME); return (error); } @@ -239,10 +246,14 @@ struct file *fp; int error; + AUDIT_ARG(fd, uap->s); error = getsock(p->p_fd, uap->s, &fp); if (error) return (error); 
- return (solisten((struct socket *)fp->f_data, uap->backlog)); + if (fp->f_data != NULL) + return (solisten((struct socket *)fp->f_data, uap->backlog)); + else + return (EBADF); } #ifndef COMPAT_OLDSOCK @@ -267,6 +278,7 @@ short fflag; /* type must match fp->f_flag */ int tmpfd; + AUDIT_ARG(fd, uap->s); if (uap->name) { error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, sizeof (namelen)); @@ -278,6 +290,10 @@ return (error); s = splnet(); head = (struct socket *)fp->f_data; + if (head == NULL) { + splx(s); + return (EBADF); + } if ((head->so_options & SO_ACCEPTCONN) == 0) { splx(s); return (EINVAL); @@ -352,6 +368,7 @@ goto gotnoname; return 0; } + AUDIT_ARG(sockaddr, p, sa); if (uap->name) { /* check sa_len before it is destroyed */ if (namelen > sa->sa_len) @@ -395,9 +412,9 @@ #endif /* COMPAT_OLDSOCK */ struct connect_args { - int s; - caddr_t name; - int namelen; + int s; + caddr_t name; + socklen_t namelen; }; /* ARGSUSED */ int @@ -411,15 +428,19 @@ struct sockaddr *sa; int error, s; + AUDIT_ARG(fd, uap->s); error = getsock(p->p_fd, uap->s, &fp); if (error) return (error); so = (struct socket *)fp->f_data; + if (so == NULL) + return (EBADF); if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) return (EALREADY); error = getsockaddr(&sa, uap->name, uap->namelen); if (error) return (error); + AUDIT_ARG(sockaddr, p, sa); error = soconnect(so, sa); if (error) goto bad; @@ -464,6 +485,7 @@ struct socket *so1, *so2; int fd, error, sv[2]; + AUDIT_ARG(socket, uap->domain, uap->type, uap->protocol); error = socreate(uap->domain, &so1, uap->type, uap->protocol); if (error) return (error); @@ -583,6 +605,7 @@ KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error,0,0,0,0); return (error); } + AUDIT_ARG(sockaddr, p, to); } else to = 0; if (mp->msg_control) { @@ -628,8 +651,11 @@ #endif len = auio.uio_resid; so = (struct socket *)fp->f_data; - error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, - flags); + if (so == NULL) + error = 
EBADF; + else + error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, + flags); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) @@ -686,6 +712,7 @@ int stat; KERNEL_DEBUG(DBG_FNC_SENDTO | DBG_FUNC_START, 0,0,0,0,0); + AUDIT_ARG(fd, uap->s); msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; @@ -798,6 +825,7 @@ int error; KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_START, 0,0,0,0,0); + AUDIT_ARG(fd, uap->s); if (error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg))) { KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, error,0,0,0,0); @@ -884,9 +912,13 @@ #endif len = auio.uio_resid; so = (struct socket *)fp->f_data; - error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, - (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, - &mp->msg_flags); + if (so == NULL) + error = EBADF; + else + error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, + (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, + &mp->msg_flags); + AUDIT_ARG(sockaddr, p, fromsa); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) @@ -1019,6 +1051,7 @@ int error; KERNEL_DEBUG(DBG_FNC_RECVFROM | DBG_FUNC_START, 0,0,0,0,0); + AUDIT_ARG(fd, uap->s); if (uap->fromlenaddr) { error = copyin((caddr_t)uap->fromlenaddr, @@ -1047,7 +1080,7 @@ { uap->flags |= MSG_COMPAT; - return (recvfrom(p, uap)); + return (recvfrom(p, uap, retval)); } #endif @@ -1148,6 +1181,7 @@ register int error; KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_START, 0,0,0,0,0); + AUDIT_ARG(fd, uap->s); if (error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg))) { @@ -1203,9 +1237,12 @@ struct file *fp; int error; + AUDIT_ARG(fd, uap->s); error = getsock(p->p_fd, uap->s, &fp); if (error) return (error); + if (fp->f_data == NULL) + return (EBADF); return (soshutdown((struct socket *)fp->f_data, uap->how)); } @@ -1215,11 +1252,11 @@ /* ARGSUSED */ struct 
setsockopt_args { - int s; - int level; - int name; - caddr_t val; - int valsize; + int s; + int level; + int name; + caddr_t val; + socklen_t valsize; }; int @@ -1232,6 +1269,7 @@ struct sockopt sopt; int error; + AUDIT_ARG(fd, uap->s); if (uap->val == 0 && uap->valsize != 0) return (EFAULT); if (uap->valsize < 0) @@ -1248,6 +1286,8 @@ sopt.sopt_valsize = uap->valsize; sopt.sopt_p = p; + if (fp->f_data == NULL) + return (EBADF); return (sosetopt((struct socket *)fp->f_data, &sopt)); } @@ -1283,6 +1323,8 @@ sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ sopt.sopt_p = p; + if (fp->f_data == NULL) + return (EBADF); error = sogetopt((struct socket *)fp->f_data, &sopt); if (error == 0) { valsize = sopt.sopt_valsize; @@ -1382,6 +1424,8 @@ if (error) return (error); so = (struct socket *)fp->f_data; + if (so == NULL) + return (EBADF); sa = 0; error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); if (error) @@ -1450,6 +1494,8 @@ if (error) return (error); so = (struct socket *)fp->f_data; + if (so == NULL) + return (EBADF); if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) return (ENOTCONN); error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); @@ -1735,6 +1781,10 @@ if (error) goto done; so = (struct socket *)fp->f_data; + if (so == NULL) { + error = EBADF; + goto done; + } if (so->so_type != SOCK_STREAM) { error = EINVAL; goto done; diff -urN xnu-344.49/bsd/kern/uipc_usrreq.c xnu-517/bsd/kern/uipc_usrreq.c --- xnu-344.49/bsd/kern/uipc_usrreq.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/kern/uipc_usrreq.c Sat Oct 25 00:25:25 2003 @@ -663,6 +663,18 @@ goto bad; } thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + + /* + * Check if socket was connected while we were trying to + * acquire the funnel. 
+ * XXX - probably shouldn't return an error for SOCK_DGRAM + */ + if ((so->so_state & SS_ISCONNECTED) != 0) { + error = EISCONN; + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + goto bad; + } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, 0)) == 0) { diff -urN xnu-344.49/bsd/man/man2/Makefile xnu-517/bsd/man/man2/Makefile --- xnu-344.49/bsd/man/man2/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/Makefile Tue Oct 21 21:24:55 2003 @@ -35,6 +35,7 @@ fpathconf.2 \ fstat.2 \ fstatfs.2 \ + fsctl.2 \ fsync.2 \ ftruncate.2 \ futimes.2 \ @@ -64,6 +65,7 @@ ioctl.2 \ issetugid.2 \ kill.2 \ + kqueue.2 \ ktrace.2 \ lchown.2 \ link.2 \ @@ -105,6 +107,9 @@ rmdir.2 \ sbrk.2 \ select.2 \ + semctl.2 \ + semget.2 \ + semop.2 \ send.2 \ sendmsg.2 \ sendto.2 \ diff -urN xnu-344.49/bsd/man/man2/chflags.2 xnu-517/bsd/man/man2/chflags.2 --- xnu-344.49/bsd/man/man2/chflags.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/chflags.2 Tue Oct 21 21:24:55 2003 @@ -134,7 +134,7 @@ .It Bq Er EBADF The descriptor is not valid. .It Bq Er EINVAL -.Fa Fd +.Fa fd refers to a socket, not to a file. .It Bq Er EPERM The effective user ID does not match the owner of the file and diff -urN xnu-344.49/bsd/man/man2/chmod.2 xnu-517/bsd/man/man2/chmod.2 --- xnu-344.49/bsd/man/man2/chmod.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/chmod.2 Tue Oct 21 21:24:55 2003 @@ -161,7 +161,7 @@ .It Bq Er EBADF The descriptor is not valid. .It Bq Er EINVAL -.Fa Fd +.Fa fd refers to a socket, not to a file. .It Bq Er EROFS The file resides on a read-only file system. diff -urN xnu-344.49/bsd/man/man2/chown.2 xnu-517/bsd/man/man2/chown.2 --- xnu-344.49/bsd/man/man2/chown.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/chown.2 Tue Oct 21 21:24:55 2003 @@ -117,10 +117,10 @@ will fail if: .Bl -tag -width Er .It Bq Er EBADF -.Fa Fd +.Fa fd does not refer to a valid descriptor. 
.It Bq Er EINVAL -.Fa Fd +.Fa fd refers to a socket, not a file. .It Bq Er EPERM The effective user ID is not the super-user. diff -urN xnu-344.49/bsd/man/man2/connect.2 xnu-517/bsd/man/man2/connect.2 --- xnu-344.49/bsd/man/man2/connect.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/connect.2 Tue Oct 21 21:24:55 2003 @@ -69,7 +69,10 @@ .Fn connect multiple times to change their association. Datagram sockets may dissolve the association -by connecting to an invalid address, such as a null address. +by connecting to an invalid address, such as a null address +or an address with +the address family set to AF_UNPSEC (the error +EAFNOSUPPORT will be harmlessly returned). .Sh RETURN VALUES If the connection or binding succeeds, 0 is returned. Otherwise a -1 is returned, and a more specific error diff -urN xnu-344.49/bsd/man/man2/execve.2 xnu-517/bsd/man/man2/execve.2 --- xnu-344.49/bsd/man/man2/execve.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/execve.2 Tue Oct 21 21:24:55 2003 @@ -233,10 +233,10 @@ .It Bq Er E2BIG The number of bytes in the new process's argument list is larger than the system-imposed limit. -The limit in the system as released is 20480 bytes -.Pf ( Dv NCARGS -in -.Ao Pa sys/param.h Ac ) . +This limit is specified by the +.Xr sysctl 3 +MIB variable +.Dv KERN_ARGMAX . .It Bq Er EFAULT The new process file is not as long as indicated by the size values in its header. diff -urN xnu-344.49/bsd/man/man2/fork.2 xnu-517/bsd/man/man2/fork.2 --- xnu-344.49/bsd/man/man2/fork.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/fork.2 Tue Oct 21 21:24:55 2003 @@ -106,6 +106,6 @@ .Xr wait 2 .Sh HISTORY A -.Fn fork 2 +.Fn fork function call appeared in .At v6 . diff -urN xnu-344.49/bsd/man/man2/fsctl.2 xnu-517/bsd/man/man2/fsctl.2 --- xnu-344.49/bsd/man/man2/fsctl.2 Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/man/man2/fsctl.2 Tue Oct 21 21:24:55 2003 @@ -0,0 +1,135 @@ +.\" +.\" Copyright (c) 2003 Apple Computer, Inc. All rights reserved. 
+.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd January 14, 2003 +.Dt FSCTL 2 +.Os "Mac OS X" +.Sh NAME +.Nm fsctl +.Nd control filesystems +.Sh SYNOPSIS +.Fd #include +.Fd #include +.Ft int +.Fn fsctl "const char *path" "unsigned long request" "void *data" "unsigned long options" +.Sh DESCRIPTION +The +.Fn fsctl +function manipulates the filesystem controlling mounted volumes. +In particular, many filesystem-specific operating +characteristics of mounted filesystems may be controlled +with +.Fn fsctl +requests. +.Fn fsctl +requests can also be used to extract filesystem-specific +information for a mounted volume. +.Pp +.Fa path +is the path name of any file within the mounted filesystem. +An fsctl +.Fa request +has encoded in it whether the argument is an +.Dq in +parameter +or +.Dq out +parameter, and the size of the argument +.Fa data +in bytes. 
+Values for +.Fa request +are entirely filesystem-specific except for the following, defined in +.Ao Pa sys/fsctl.h Ac : +.Bd -literal -offset indent +FSGETMOUNTINFOSIZE /* Return size of mount info data */ +.Ed +.Pp +Macros and defines used in specifying an fsctl +.Fa request +are the same as for +.Fn ioctl +requests and are located in the file +.Ao Pa sys/ioccom.h Ac . +.Fa options +may specify special flags for the processing of the +.Fn fsctl +call. The options are specified by +.Em or Ns 'ing +the option values. The only option currently defined is +.Bd -literal -offset indent +#define FSOPT_NOFOLLOW 0x00000001 /* Don't follow symlinks */ +.Ed +.Pp +which is interpreted by the +.Fn fsctl +call to prevent following of symlinks. The +.Fa options +argument is passed to the filesystem, which may define and handle +additional +.Fa options +bit values. +.Sh RETURN VALUES +.Pp +If an error has occurred, a value of -1 is returned and +.Va errno +is set to indicate the error. +.Sh ERRORS +.Fn fsctl +will fail if: +.Bl -tag -width Er +.It Bq Er ENOTDIR +A component of the path prefix is not a directory. +.It Bq Er ENAMETOOLONG +A component of a pathname exceeded +.Dv {NAME_MAX} +characters, or an entire path name exceeded +.Dv {PATH_MAX} +characters. +.It Bq Er ENOENT +The named file does not exist. +.It Bq Er EACCES +Search permission is denied for a component of the path prefix. +.It Bq Er ELOOP +Too many symbolic links were encountered in translating the pathname. +.It Bq Er EFAULT +.Fa path +or +.Em data +points to an invalid address. +.It Bq Er EIO +An +.Tn I/O +error occurred while reading from or writing to the file system. +.It Bq Er EINVAL +.Fa request +or +.Fa data +is not valid. +.El +.Sh SEE ALSO +.Xr ioctl 2 , +.Xr getattrlist 2 , +.Xr setattrlist 2 +.Sh HISTORY +The +.Fn fsctl +function call appeared in Mac OS X version 10.0. 
diff -urN xnu-344.49/bsd/man/man2/fsync.2 xnu-517/bsd/man/man2/fsync.2 --- xnu-344.49/bsd/man/man2/fsync.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/fsync.2 Tue Oct 21 21:24:55 2003 @@ -64,10 +64,10 @@ fails if: .Bl -tag -width Er .It Bq Er EBADF -.Fa Fd +.Fa fd is not a valid descriptor. .It Bq Er EINVAL -.Fa Fd +.Fa fd refers to a socket, not to a file. .It Bq Er EIO An I/O error occurred while reading from or writing to the file system. diff -urN xnu-344.49/bsd/man/man2/getdirentries.2 xnu-517/bsd/man/man2/getdirentries.2 --- xnu-344.49/bsd/man/man2/getdirentries.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/getdirentries.2 Tue Oct 21 21:24:55 2003 @@ -40,7 +40,8 @@ .Nm getdirentries .Nd "get directory entries in a filesystem independent format" .Sh SYNOPSIS -.Fd #include +.Fd #include +.Fd #include .Ft int .Fn getdirentries "int fd" "char *buf" "int nbytes" "long *basep" .Sh DESCRIPTION @@ -67,9 +68,10 @@ .Em dirent structures each containing the following entries: .Bd -literal -offset indent -unsigned long d_fileno; -unsigned short d_reclen; -unsigned short d_namlen; +u_int32_t d_fileno; /* file number of entry */ +u_int16_t d_reclen; /* length of this record */ +u_int8_t d_type; /* file type, see below */ +u_int8_t d_namlen; /* length of string in d_name */ char d_name[MAXNAMELEN + 1]; /* see below */ .Ed .Pp @@ -81,6 +83,12 @@ .Xr link 2 ) have the same .Fa d_fileno . +Users of +.Fn getdirentries +should skip +entries with +.Fa d_fileno += 0, as such entries represent files which have been deleted but not yet removed from the directory entry. The .Fa d_reclen entry is the length, in bytes, of the directory record. @@ -95,6 +103,20 @@ may vary from 1 to .Dv MAXNAMELEN \&+ 1. +.Fa d_type +is an integer representing the type of the directory entry. 
The following types are defined in +.Aq sys/dirent.h : +.Bd -literal -offset indent +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 +.Ed .Pp Entries may be separated by extra space. The diff -urN xnu-344.49/bsd/man/man2/getfsstat.2 xnu-517/bsd/man/man2/getfsstat.2 --- xnu-344.49/bsd/man/man2/getfsstat.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/getfsstat.2 Tue Oct 21 21:24:55 2003 @@ -55,12 +55,12 @@ .Bd -literal typedef struct { int32_t val[2]; } fsid_t; -#define MFSNAMELEN 16 /* length of fs type name, including nul */ -#define MNAMELEN 32 /* length of buffer for returned name */ +#define MFSNAMELEN 15 /* length of fs type name, not inc. nul */ +#define MNAMELEN 90 /* length of buffer for returned name */ struct statfs { - short f_type; /* type of file system (unused; zero) */ - short f_flags; /* copy of mount flags */ + short f_otype; /* type of file system (reserved: zero) */ + short f_oflags; /* copy of mount flags (reserved: zero) */ long f_bsize; /* fundamental file system block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in file system */ @@ -68,12 +68,17 @@ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in file system */ long f_ffree; /* free file nodes in fs */ - fsid_t f_fsid; /* file system id */ + fsid_t f_fsid; /* file system id (super-user only) */ uid_t f_owner; /* user that mounted the file system */ - long f_spare[4]; /* spare for later */ + short f_reserved1; /* reserved for future use */ + short f_type; /* type of file system (reserved) */ + long f_flags; /* copy of mount flags (reserved) */ + long f_reserved2[2]; /* reserved for future use */ char f_fstypename[MFSNAMELEN]; /* fs type name */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ char f_mntfromname[MNAMELEN]; /* mounted file system */ + char f_reserved3; 
/* reserved for future use */ + long f_reserved4[4]; /* reserved for future use */ }; .Ed .Pp diff -urN xnu-344.49/bsd/man/man2/getsockopt.2 xnu-517/bsd/man/man2/getsockopt.2 --- xnu-344.49/bsd/man/man2/getsockopt.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/getsockopt.2 Tue Oct 21 21:24:55 2003 @@ -294,7 +294,12 @@ receiving additional data, it returns with a short count or with the error .Er EWOULDBLOCK -if no data were received. +if no data were received. The struct timeval parameter must represent a +positive time interval less than SHRT_MAX * 10 milliseconds (5 minutes +and 28 seconds) otherwise +.Fn setsockopt +returns with the error +.Er EDOM . .Pp .Dv SO_NOSIGPIPE is an option that prevents SIGPIPE from being raised when a write fails on a socket to which there is no reader; @@ -341,6 +346,8 @@ this error may also be returned if .Fa optlen is not in a valid part of the process address space. +.It Bq Er EDOM +The argument value is out of bounds. .El .Sh SEE ALSO .Xr ioctl 2 , diff -urN xnu-344.49/bsd/man/man2/intro.2 xnu-517/bsd/man/man2/intro.2 --- xnu-344.49/bsd/man/man2/intro.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/intro.2 Tue Oct 21 21:24:55 2003 @@ -404,6 +404,47 @@ .It Er 78 ENOSYS Em "Function not implemented" . Attempted a system call that is not available on this system. +.It Er 79 EFTYPE Em "Inappropriate file type or format" . +The file was the wrong type for the operation, or a data +file had the wrong format. +.It Er 80 EAUTH Em "Authentication error" . +Attempted to use an invalid authentication ticket to +mount an NFS file system. +.It Er 81 ENEEDAUTH Em "Need authenticator" . +An authentication ticket must be obtained before the +given NFS file system may be mounted. +.It Er 82 EPWROFF Em "Device power is off" . +The device power is off. +.It Er 83 EDEVERR Em "Device error" . +A device error has occurred, e.g. a printer running out of paper. +.It Er 84 EOVERFLOW Em "Value too large to be stored in data type" . 
+A numerical result of the function was too large to be +stored in the caller provided space. +.It Er 85 EBADEXEC Em "Bad executable (or shared library)" . +The executable or shared library being referenced was malformed. +.It Er 86 EBADARCH Em "Bad CPU type in executable" . +The executable in question does not support the current CPU. +.It Er 87 ESHLIBVERS Em "Shared library version mismatch" . +The version of the shared library on the system does not match +the version which was expected. +.It Er 88 EBADMACHO Em "Malformed Mach-o file" . +The Mach object file is malformed. +.It Er 89 ECANCELED Em "Operation canceled" . +The scheduled operation was canceled. +.It Er 90 EIDRM Em "Identifier removed" . +An IPC identifier was removed while the current process +was waiting on it. +.It Er 91 ENOMSG Em "No message of desired type" . +An IPC message queue does not contain a message of the +desired type, or a message catalog does not contain the +requested message. +.It Er 92 EILSEQ Em "Illegal byte sequence" . +While decoding a multibyte character the function came +along an invalid or an incomplete sequence of bytes or +the given wide character is invalid. +.It Er 93 ENOATTR Em "Attribute not found" . +The specified extended attribute does not exist. +.El .Sh DEFINITIONS .Bl -tag -width Ds .It Process ID . @@ -664,8 +705,8 @@ of a certain format. An Address Family is the set of addresses for a specific group of protocols. Each socket has an address chosen from the address family in which the socket was created. +.El .Sh SEE ALSO -.Xr intro 3 , .Xr perror 3 .Sh HISTORY An diff -urN xnu-344.49/bsd/man/man2/kqueue.2 xnu-517/bsd/man/man2/kqueue.2 --- xnu-344.49/bsd/man/man2/kqueue.2 Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/man/man2/kqueue.2 Tue Oct 21 21:24:55 2003 @@ -0,0 +1,499 @@ +.\" Copyright (c) 2000 Jonathan Lemon +.\" All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD: src/lib/libc/sys/kqueue.2,v 1.32 2002/12/19 09:40:25 ru Exp $ +.\" +.Dd April 14, 2000 +.Dt KQUEUE 2 +.Os +.Sh NAME +.Nm kqueue , +.Nm kevent +.Nd kernel event notification mechanism +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/types.h +.In sys/event.h +.In sys/time.h +.Ft int +.Fn kqueue "void" +.Ft int +.Fn kevent "int kq" "const struct kevent *changelist" "int nchanges" "struct kevent *eventlist" "int nevents" "const struct timespec *timeout" +.Fn EV_SET "&kev" ident filter flags fflags data udata +.Sh DESCRIPTION +The +.Fn kqueue +system call +provides a generic method of notifying the user when a kernel +event (kevent) happens or a condition holds, based on the results +of small pieces of kernel code termed filters. +A kevent is identified by an (ident, filter) pair and specifies +the interesting conditions to be notified about for that pair. +An (ident, filter) pair can only appear once in a given kqueue. +Subsequent attempts to register the same pair for a given kqueue +will result in the replacement of the conditions being watched, +not an addition. +.Pp +The filter identified in a kevent is executed upon the initial +registration of that event in order to detect whether a preexisting +condition is present, and is also executed whenever an event is +passed to the filter for evaluation. +If the filter determines that the condition should be reported, +then the kevent is placed on the kqueue for the user to retrieve. +.Pp +The filter is also run when the user attempts to retrieve the kevent +from the kqueue. +If the filter indicates that the condition that triggered +the event no longer holds, the kevent is removed from the kqueue and +is not returned. +.Pp +Multiple events which trigger the filter do not result in multiple +kevents being placed on the kqueue; instead, the filter will aggregate +the events into a single struct kevent. +Calling +.Fn close +on a file descriptor will remove any kevents that reference the descriptor. 
+.Pp +The +.Fn kqueue +system call +creates a new kernel event queue and returns a descriptor. +The queue is not inherited by a child created with +.Xr fork 2 . +.Pp +The +.Fn kevent +system call +is used to register events with the queue, and return any pending +events to the user. +The +.Fa changelist +argument +is a pointer to an array of +.Va kevent +structures, as defined in +.Aq Pa sys/event.h . +All changes contained in the +.Fa changelist +are applied before any pending events are read from the queue. +The +.Fa nchanges +argument +gives the size of +.Fa changelist . +The +.Fa eventlist +argument +is a pointer to an array of kevent structures. +The +.Fa nevents +argument +determines the size of +.Fa eventlist . +If +.Fa timeout +is a non-NULL pointer, it specifies a maximum interval to wait +for an event, which will be interpreted as a struct timespec. If +.Fa timeout +is a NULL pointer, +.Fn kevent +waits indefinitely. To effect a poll, the +.Fa timeout +argument should be non-NULL, pointing to a zero-valued +.Va timespec +structure. The same array may be used for the +.Fa changelist +and +.Fa eventlist . +.Pp +The +.Fn EV_SET +macro is provided for ease of initializing a +kevent structure. +.Pp +The +.Va kevent +structure is defined as: +.Bd -literal +struct kevent { + uintptr_t ident; /* identifier for this event */ + short filter; /* filter for event */ + u_short flags; /* action flags for kqueue */ + u_int fflags; /* filter flag value */ + intptr_t data; /* filter data value */ + void *udata; /* opaque user data identifier */ +}; +.Ed +.Pp +The fields of +.Fa struct kevent +are: +.Bl -tag -width XXXfilter +.It ident +Value used to identify this event. +The exact interpretation is determined by the attached filter, +but often is a file descriptor. +.It filter +Identifies the kernel filter used to process this event. The pre-defined +system filters are described below. +.It flags +Actions to perform on the event. +.It fflags +Filter-specific flags. 
+.It data +Filter-specific data value. +.It udata +Opaque user-defined value passed through the kernel unchanged. +.El +.Pp +The +.Va flags +field can contain the following values: +.Bl -tag -width XXXEV_ONESHOT +.It EV_ADD +Adds the event to the kqueue. Re-adding an existing event +will modify the parameters of the original event, and not result +in a duplicate entry. Adding an event automatically enables it, +unless overridden by the EV_DISABLE flag. +.It EV_ENABLE +Permit +.Fn kevent +to return the event if it is triggered. +.It EV_DISABLE +Disable the event so +.Fn kevent +will not return it. The filter itself is not disabled. +.It EV_DELETE +Removes the event from the kqueue. Events which are attached to +file descriptors are automatically deleted on the last close of +the descriptor. +.It EV_ONESHOT +Causes the event to return only the first occurrence of the filter +being triggered. After the user retrieves the event from the kqueue, +it is deleted. +.It EV_CLEAR +After the event is retrieved by the user, its state is reset. +This is useful for filters which report state transitions +instead of the current state. Note that some filters may automatically +set this flag internally. +.It EV_EOF +Filters may set this flag to indicate filter-specific EOF condition. +.It EV_ERROR +See +.Sx RETURN VALUES +below. +.El +.Pp +The predefined system filters are listed below. +Arguments may be passed to and from the filter via the +.Va fflags +and +.Va data +fields in the kevent structure. +.Bl -tag -width EVFILT_SIGNAL +.It EVFILT_READ +Takes a file descriptor as the identifier, and returns whenever +there is data available to read. +The behavior of the filter is slightly different depending +on the descriptor type. +.Pp +.Bl -tag -width 2n +.It Sockets +Sockets which have previously been passed to +.Fn listen +return when there is an incoming connection pending. +.Va data +contains the size of the listen backlog. 
+.Pp +Other socket descriptors return when there is data to be read, +subject to the +.Dv SO_RCVLOWAT +value of the socket buffer. +This may be overridden with a per-filter low water mark at the +time the filter is added by setting the +NOTE_LOWAT +flag in +.Va fflags , +and specifying the new low water mark in +.Va data . +On return, +.Va data +contains the number of bytes of protocol data available to read. +.Pp +If the read direction of the socket has shut down, then the filter +also sets EV_EOF in +.Va flags , +and returns the socket error (if any) in +.Va fflags . +It is possible for EOF to be returned (indicating the connection is gone) +while there is still data pending in the socket buffer. +.It Vnodes +Returns when the file pointer is not at the end of file. +.Va data +contains the offset from current position to end of file, +and may be negative. +.It "Fifos, Pipes" +Returns when there is data to read; +.Va data +contains the number of bytes available. +.Pp +When the last writer disconnects, the filter will set EV_EOF in +.Va flags . +This may be cleared by passing in EV_CLEAR, at which point the +filter will resume waiting for data to become available before +returning. +.El +.It EVFILT_WRITE +Takes a file descriptor as the identifier, and returns whenever +it is possible to write to the descriptor. For sockets, pipes +and fifos, +.Va data +will contain the amount of space remaining in the write buffer. +The filter will set EV_EOF when the reader disconnects, and for +the fifo case, this may be cleared by use of EV_CLEAR. +Note that this filter is not supported for vnodes. +.Pp +For sockets, the low water mark and socket error handling is +identical to the EVFILT_READ case. +.It EVFILT_AIO +This filter is currently unsupported. 
+.\"The sigevent portion of the AIO request is filled in, with +.\".Va sigev_notify_kqueue +.\"containing the descriptor of the kqueue that the event should +.\"be attached to, +.\".Va sigev_value +.\"containing the udata value, and +.\".Va sigev_notify +.\"set to SIGEV_KEVENT. +.\"When the +.\".Fn aio_* +.\"system call is made, the event will be registered +.\"with the specified kqueue, and the +.\".Va ident +.\"argument set to the +.\".Fa struct aiocb +.\"returned by the +.\".Fn aio_* +.\"system call. +.\"The filter returns under the same conditions as aio_error. +.\".Pp +.\"Alternatively, a kevent structure may be initialized, with +.\".Va ident +.\"containing the descriptor of the kqueue, and the +.\"address of the kevent structure placed in the +.\".Va aio_lio_opcode +.\"field of the AIO request. However, this approach will not work on +.\"architectures with 64-bit pointers, and should be considered deprecated. +.It EVFILT_VNODE +Takes a file descriptor as the identifier and the events to watch for in +.Va fflags , +and returns when one or more of the requested events occurs on the descriptor. +The events to monitor are: +.Bl -tag -width XXNOTE_RENAME +.It NOTE_DELETE +The +.Fn unlink +system call +was called on the file referenced by the descriptor. +.It NOTE_WRITE +A write occurred on the file referenced by the descriptor. +.It NOTE_EXTEND +The file referenced by the descriptor was extended. +.It NOTE_ATTRIB +The file referenced by the descriptor had its attributes changed. +.It NOTE_LINK +The link count on the file changed. +.It NOTE_RENAME +The file referenced by the descriptor was renamed. +.It NOTE_REVOKE +Access to the file was revoked via +.Xr revoke 2 +or the underlying filesystem was unmounted. +.El +.Pp +On return, +.Va fflags +contains the events which triggered the filter. 
+.It EVFILT_PROC +Takes the process ID to monitor as the identifier and the events to watch for +in +.Va fflags , +and returns when the process performs one or more of the requested events. +If a process can normally see another process, it can attach an event to it. +The events to monitor are: +.Bl -tag -width XXNOTE_TRACKERR +.It NOTE_EXIT +The process has exited. +.It NOTE_FORK +The process has called +.Fn fork . +.It NOTE_EXEC +The process has executed a new process via +.Xr execve 2 +or similar call. +.It NOTE_TRACK +Follow a process across +.Fn fork +calls. The parent process will return with NOTE_TRACK set in the +.Va fflags +field, while the child process will return with NOTE_CHILD set in +.Va fflags +and the parent PID in +.Va data . +.It NOTE_TRACKERR +This flag is returned if the system was unable to attach an event to +the child process, usually due to resource limitations. +.El +.Pp +On return, +.Va fflags +contains the events which triggered the filter. +.It EVFILT_SIGNAL +Takes the signal number to monitor as the identifier and returns +when the given signal is delivered to the process. +This coexists with the +.Fn signal +and +.Fn sigaction +facilities, and has a lower precedence. The filter will record +all attempts to deliver a signal to a process, even if the signal has +been marked as SIG_IGN. Event notification happens after normal +signal delivery processing. +.Va data +returns the number of times the signal has occurred since the last call to +.Fn kevent . +This filter automatically sets the EV_CLEAR flag internally. +.It EVFILT_TIMER +This filter is currently unsupported. +.\"Establishes an arbitrary timer identified by +.\".Va ident . +.\"When adding a timer, +.\".Va data +.\"specifies the timeout period in milliseconds. +.\"The timer will be periodic unless EV_ONESHOT is specified. +.\"On return, +.\".Va data +.\"contains the number of times the timeout has expired since the last call to +.\".Fn kevent . 
+.\"This filter automatically sets the EV_CLEAR flag internally. +.El +.Sh RETURN VALUES +The +.Fn kqueue +system call +creates a new kernel event queue and returns a file descriptor. +If there was an error creating the kernel event queue, a value of -1 is +returned and errno set. +.Pp +The +.Fn kevent +system call +returns the number of events placed in the +.Fa eventlist , +up to the value given by +.Fa nevents . +If an error occurs while processing an element of the +.Fa changelist +and there is enough room in the +.Fa eventlist , +then the event will be placed in the +.Fa eventlist +with +.Dv EV_ERROR +set in +.Va flags +and the system error in +.Va data . +Otherwise, +.Dv -1 +will be returned, and +.Dv errno +will be set to indicate the error condition. +If the time limit expires, then +.Fn kevent +returns 0. +.Sh ERRORS +The +.Fn kqueue +system call fails if: +.Bl -tag -width Er +.It Bq Er ENOMEM +The kernel failed to allocate enough memory for the kernel queue. +.It Bq Er EMFILE +The per-process descriptor table is full. +.It Bq Er ENFILE +The system file table is full. +.El +.Pp +The +.Fn kevent +system call fails if: +.Bl -tag -width Er +.It Bq Er EACCES +The process does not have permission to register a filter. +.It Bq Er EFAULT +There was an error reading or writing the +.Va kevent +structure. +.It Bq Er EBADF +The specified descriptor is invalid. +.It Bq Er EINTR +A signal was delivered before the timeout expired and before any +events were placed on the kqueue for return. +.It Bq Er EINVAL +The specified time limit or filter is invalid. +.It Bq Er ENOENT +The event could not be found to be modified or deleted. +.It Bq Er ENOMEM +No memory was available to register the event. +.It Bq Er ESRCH +The specified process to attach to does not exist. 
+.El +.Sh SEE ALSO +.Xr aio_error 2 , +.Xr aio_read 2 , +.Xr aio_return 2 , +.Xr read 2 , +.Xr select 2 , +.Xr sigaction 2 , +.Xr write 2 , +.Xr signal 3 +.Sh HISTORY +The +.Fn kqueue +and +.Fn kevent +system calls first appeared in +.Fx 4.1 . +.Sh AUTHORS +The +.Fn kqueue +system and this manual page were written by +.An Jonathan Lemon Aq jlemon@FreeBSD.org . +.Sh BUGS +Not all filesystem types support kqueue-style notifications. +And even some that do, like some remote filesystems, may only +support a subset of the notification semantics described +here. diff -urN xnu-344.49/bsd/man/man2/mmap.2 xnu-517/bsd/man/man2/mmap.2 --- xnu-344.49/bsd/man/man2/mmap.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/mmap.2 Tue Oct 21 21:24:55 2003 @@ -125,10 +125,6 @@ .It Dv MAP_HASSEMAPHORE Notify the kernel that the region may contain semaphores and that special handling may be necessary. -.It Dv MAP_INHERIT -Permit regions to be inherited across -.Xr exec 2 -system calls. .It Dv MAP_PRIVATE Modifications are private. .It Dv MAP_SHARED diff -urN xnu-344.49/bsd/man/man2/mount.2 xnu-517/bsd/man/man2/mount.2 --- xnu-344.49/bsd/man/man2/mount.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/mount.2 Tue Oct 21 21:24:55 2003 @@ -82,9 +82,6 @@ .It Dv MNT_RDONLY The filesystem should be treated as read-only; Even the super-user may not write on it. -.It Dv MNT_NOATIME -Do not update the access time on files in the filesystem unless -the modification or status change times are also being updated. .It Dv MNT_NOEXEC Do not allow files to be executed from the filesystem. .It Dv MNT_NOSUID @@ -107,58 +104,20 @@ For example, most filesystems will not allow a change from read-write to read-only. .Pp +The flag +.Dv MNT_RELOAD +causes the vfs subsystem to update its data structures pertaining to +the specified already mounted filesystem. +.Pp The .Fa type argument defines the type of the filesystem. -The types of filesystems known to the system are defined in -.Aq Pa sys/mount.h . 
+.Pp .Fa Data is a pointer to a structure that contains the type specific arguments to mount. -The currently supported types of filesystems and -their type specific data are: -.Pp -.Dv MOUNT_FFS -.Bd -literal -offset indent -compact -struct ufs_args { - char *fspec; /* block special file to mount */ - struct export_args export; /* network export information */ -}; -.Ed -.Pp -.Dv MOUNT_NFS -.Bd -literal -offset indent -compact -struct nfs_args { - int version; /* args structure version */ - struct sockaddr *addr; /* file server address */ - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ - u_char *fh; /* File handle to be mounted */ - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* Term (sec) of lease */ - int deadthresh; /* Retrans threshold */ - char *hostname; /* server's name */ -}; -.Ed -.Pp -.Dv MOUNT_MFS -.Bd -literal -offset indent -compact -struct mfs_args { - char *fspec; /* name to export for statfs */ - struct export_args export; /* if we can export an MFS */ - caddr_t base; /* base of filesystem in mem */ - u_long size; /* size of filesystem */ -}; -.Ed +The format for these argument structures is described in the +manual page for each filesystem. .Pp The .Fn umount @@ -193,7 +152,8 @@ will fail when one of the following occurs: .Bl -tag -width [ENAMETOOLONG] .It Bq Er EPERM -The caller is not the super-user. +The caller is not the super-user, and the device-node and the mountpoint +do not have adequate ownership and permissions. 
.It Bq Er ENAMETOOLONG A component of a pathname exceeded .Dv {NAME_MAX} @@ -223,78 +183,13 @@ points outside the process's allocated address space. .El .Pp -The following errors can occur for a -.Em ufs -filesystem mount: -.Bl -tag -width [ENOTBLK] -.It Bq Er ENODEV -A component of ufs_args -.Ar fspec -does not exist. -.It Bq Er ENOTBLK -.Ar Fspec -is not a block device. -.It Bq Er ENXIO -The major device number of -.Ar fspec -is out of range (this indicates no device driver exists -for the associated hardware). -.It Bq Er EBUSY -.Ar Fspec -is already mounted. -.It Bq Er EMFILE -No space remains in the mount table. -.It Bq Er EINVAL -The super block for the filesystem had a bad magic -number or an out of range block size. -.It Bq Er ENOMEM -Not enough memory was available to read the cylinder -group information for the filesystem. -.It Bq Er EIO -An I/O error occurred while reading the super block or -cylinder group information. -.It Bq Er EFAULT -.Ar Fspec -points outside the process's allocated address space. -.El -.Pp -The following errors can occur for a -.Em nfs -filesystem mount: -.Bl -tag -width [ETIMEDOUT] -.It Bq Er ETIMEDOUT -.Em Nfs -timed out trying to contact the server. -.It Bq Er EFAULT -Some part of the information described by nfs_args -points outside the process's allocated address space. -.El -.Pp -The following errors can occur for a -.Em mfs -filesystem mount: -.Bl -tag -width [EMFILE] -.It Bq Er EMFILE -No space remains in the mount table. -.It Bq Er EINVAL -The super block for the filesystem had a bad magic -number or an out of range block size. -.It Bq Er ENOMEM -Not enough memory was available to read the cylinder -group information for the filesystem. -.It Bq Er EIO -A paging error occurred while reading the super block or -cylinder group information. -.It Bq Er EFAULT -.Em Name -points outside the process's allocated address space. 
-.El -.Pp .Nm Umount may fail with one of the following errors: .Bl -tag -width [ENAMETOOLONG] .It Bq Er EPERM -The caller is not the super-user. +The caller is not the super-user, and the +.Nm mount() +was not done by the user. .It Bq Er ENOTDIR A component of the path is not a directory. .It Bq Er EINVAL @@ -318,17 +213,9 @@ .Fa Dir points outside the process's allocated address space. .El -.Pp -A -.Em ufs -or -.Em mfs -mount can also fail if the maximum number of filesystems are currently -mounted. .Sh SEE ALSO .Xr mount 8 , .Xr umount 8 , -.Xr mfs 8 .Sh BUGS Some of the error codes need translation to more obvious messages. .Sh HISTORY diff -urN xnu-344.49/bsd/man/man2/msync.2 xnu-517/bsd/man/man2/msync.2 --- xnu-344.49/bsd/man/man2/msync.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/msync.2 Tue Oct 21 21:24:55 2003 @@ -48,17 +48,9 @@ .Sh DESCRIPTION The .Fn msync -system call -writes any modified pages back to the filesystem and updates -the file modification time. -If -.Fa len -is 0, all modified pages within the region containing -.Fa addr -will be flushed; -if -.Fa len -is non-zero, only those pages containing +system call writes modified whole pages back to the filesystem +and updates the file modification time. +Only those pages containing .Fa addr and .Fa len-1 @@ -71,6 +63,10 @@ MS_SYNC Perform synchronous writes MS_INVALIDATE Invalidate all cached data .Ed +.Pp +The +.Fa MS_ASYNC +flag is not permitted to be combined with other flags. .Sh RETURN VALUES If any errors occur, -1 is returned and errno is set to indicate the error. @@ -84,11 +80,10 @@ is not a multiple of the hardware page size. .It Bq Er EINVAL .Fa len -is too large or negative. +is too large, or less than 1. .It Bq Er EINVAL .Fa flags -was both MS_ASYNC and MS_INVALIDATE. -Only one of these flags is allowed. +combined MS_ASYNC with another flag, which is not permitted. .It Bq Er EIO An I/O error occurred while writing to the file system. 
.El diff -urN xnu-344.49/bsd/man/man2/munmap.2 xnu-517/bsd/man/man2/munmap.2 --- xnu-344.49/bsd/man/man2/munmap.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/munmap.2 Tue Oct 21 21:24:55 2003 @@ -70,6 +70,7 @@ parameter was negative, or some part of the region being unmapped is not part of the currently valid address space. +.El .Sh "SEE ALSO" .Xr getpagesize 3 , .Xr msync 2 , diff -urN xnu-344.49/bsd/man/man2/ptrace.2 xnu-517/bsd/man/man2/ptrace.2 --- xnu-344.49/bsd/man/man2/ptrace.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/ptrace.2 Tue Oct 21 21:24:55 2003 @@ -370,6 +370,7 @@ .Dv PT_ATTACH above. .El +.El .Sh BUGS On the SPARC, the PC is set to the provided PC value for .Dv PT_CONTINUE diff -urN xnu-344.49/bsd/man/man2/select.2 xnu-517/bsd/man/man2/select.2 --- xnu-344.49/bsd/man/man2/select.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/select.2 Tue Oct 21 21:24:55 2003 @@ -40,6 +40,8 @@ .Nm select .Nd synchronous I/O multiplexing .Sh SYNOPSIS +.Fd #include <sys/select.h> +.D1 "- or -" .Fd #include <sys/types.h> .Fd #include <sys/time.h> .Fd #include <unistd.h> diff -urN xnu-344.49/bsd/man/man2/semctl.2 xnu-517/bsd/man/man2/semctl.2 --- xnu-344.49/bsd/man/man2/semctl.2 Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/man/man2/semctl.2 Tue Oct 21 21:24:55 2003 @@ -0,0 +1,202 @@ +.\" +.\" Copyright (c) 1995 David Hovemeyer +.\" +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR +.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +.\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, +.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +.\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +.\" +.\" $FreeBSD: src/lib/libc/sys/semctl.2,v 1.18 2002/12/19 09:40:25 ru Exp $ +.\" +.Dd September 12, 1995 +.Dt SEMCTL 2 +.Os +.Sh NAME +.Nm semctl +.Nd control operations on a semaphore set +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/types.h +.In sys/ipc.h +.In sys/sem.h +.Ft int +.Fn semctl "int semid" "int semnum" "int cmd" ... +.Sh DESCRIPTION +The +.Fn semctl +system call +performs the operation indicated by +.Fa cmd +on the semaphore set indicated by +.Fa semid . +A fourth argument, a +.Fa "union semun arg" , +is required for certain values of +.Fa cmd . +For the commands that use the +.Fa arg +argument, +.Fa "union semun" +is defined as follows: +.Bd -literal +.\" +.\" From : +.\" +union semun { + int val; /* value for SETVAL */ + struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */ + u_short *array; /* array for GETALL & SETALL */ +}; +.Ed +.Pp +Commands are performed as follows: +.\" +.\" This section based on Stevens, _Advanced Programming in the UNIX +.\" Environment_. +.\" +.Bl -tag -width IPC_RMIDXXX +.It Dv IPC_STAT +Fetch the semaphore set's +.Fa "struct semid_ds" , +storing it in the memory pointed to by +.Fa arg.buf . 
+.It Dv IPC_SET +Changes the +.Fa sem_perm.uid , +.Fa sem_perm.gid , +and +.Fa sem_perm.mode +members of the semaphore set's +.Fa "struct semid_ds" +to match those of the struct pointed to by +.Fa arg.buf . +The calling process's effective uid must +match either +.Fa sem_perm.uid +or +.Fa sem_perm.cuid , +or it must have superuser privileges. +.It Dv IPC_RMID +Immediately removes the semaphore set from the system. The calling +process's effective uid must equal the semaphore set's +.Fa sem_perm.uid +or +.Fa sem_perm.cuid , +or the process must have superuser privileges. +.It Dv GETVAL +Return the value of semaphore number +.Fa semnum . +.It Dv SETVAL +Set the value of semaphore number +.Fa semnum +to +.Fa arg.val . +Outstanding adjust on exit values for this semaphore in any process +are cleared. +.It Dv GETPID +Return the pid of the last process to perform an operation on +semaphore number +.Fa semnum . +.It Dv GETNCNT +Return the number of processes waiting for semaphore number +.Fa semnum Ns 's +value to become greater than its current value. +.It Dv GETZCNT +Return the number of processes waiting for semaphore number +.Fa semnum Ns 's +value to become 0. +.It Dv GETALL +Fetch the value of all of the semaphores in the set into the +array pointed to by +.Fa arg.array . +.It Dv SETALL +Set the values of all of the semaphores in the set to the values +in the array pointed to by +.Fa arg.array . +Outstanding adjust on exit values for all semaphores in this set, +in any process are cleared. +.El +.Pp +The +.Vt "struct semid_ds" +is defined as follows: +.Bd -literal +.\" +.\" Taken straight from <sys/sem.h>. 
+.\" +struct semid_ds { + struct ipc_perm sem_perm; /* operation permission struct */ + struct sem *sem_base; /* pointer to first semaphore in set */ + u_short sem_nsems; /* number of sems in set */ + time_t sem_otime; /* last operation time */ + long sem_pad1; /* SVABI/386 says I need this here */ + time_t sem_ctime; /* last change time */ + /* Times measured in secs since */ + /* 00:00:00 GMT, Jan. 1, 1970 */ + long sem_pad2; /* SVABI/386 says I need this here */ + long sem_pad3[4]; /* SVABI/386 says I need this here */ +}; +.Ed +.Sh RETURN VALUES +On success, when +.Fa cmd +is one of +.Dv GETVAL , GETPID , GETNCNT +or +.Dv GETZCNT , +.Fn semctl +returns the corresponding value; otherwise, 0 is returned. +On failure, -1 is returned, and +.Va errno +is set to indicate the error. +.Sh ERRORS +The +.Fn semctl +system call +will fail if: +.Bl -tag -width Er +.It Bq Er EINVAL +No semaphore set corresponds to +.Fa semid . +.It Bq Er EINVAL +The +.Fa semnum +argument +is not in the range of valid semaphores for given semaphore set. +.It Bq Er EPERM +The calling process's effective uid does not match the uid of +the semaphore set's owner or creator. +.It Bq Er EACCES +Permission denied due to mismatch between operation and mode of +semaphore set. +.It Bq Er ERANGE +.Dv SETVAL +or +.Dv SETALL +attempted to set a semaphore outside the allowable range +.Bq 0 .. Dv SEMVMX . +.El +.Sh SEE ALSO +.Xr semget 2 , +.Xr semop 2 +.Sh BUGS +.Dv SETALL +may update some semaphore elements before returning an error. diff -urN xnu-344.49/bsd/man/man2/semget.2 xnu-517/bsd/man/man2/semget.2 --- xnu-344.49/bsd/man/man2/semget.2 Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/man/man2/semget.2 Tue Oct 21 21:24:55 2003 @@ -0,0 +1,146 @@ +.\" +.\" Copyright (c) 1995 David Hovemeyer +.\" +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. 
Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR +.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +.\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, +.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +.\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +.\" +.\" $FreeBSD: src/lib/libc/sys/semget.2,v 1.14 2002/12/19 09:40:25 ru Exp $ +.\" +.Dd September 12, 1995 +.Dt SEMGET 2 +.Os +.Sh NAME +.Nm semget +.Nd obtain a semaphore id +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/types.h +.In sys/ipc.h +.In sys/sem.h +.Ft int +.Fn semget "key_t key" "int nsems" "int flag" +.Sh DESCRIPTION +Based on the values of +.Fa key +and +.Fa flag , +.Fn semget +returns the identifier of a newly created or previously existing +set of semaphores. +.\" +.\" This is copied verbatim from the shmget manpage. Perhaps +.\" it should go in a common manpage, such as .Xr ipc 2 +.\" +The key +is analogous to a filename: it provides a handle that names an +IPC object. There are three ways to specify a key: +.Bl -bullet +.It +IPC_PRIVATE may be specified, in which case a new IPC object +will be created. +.It +An integer constant may be specified. 
If no IPC object corresponding +to +.Fa key +exists and the IPC_CREAT bit is set in +.Fa flag , +a new one will be created. +.It +The +.Xr ftok 3 +function +may be used to generate a key from a pathname. +.El +.\" +.\" Likewise for this section, except SHM_* becomes SEM_*. +.\" +.Pp +The mode of a newly created IPC object is determined by +.Em OR Ns 'ing +the following constants into the +.Fa flag +argument: +.Bl -tag -width XSEM_WXX6XXX +.It Dv SEM_R +Read access for user. +.It Dv SEM_A +Alter access for user. +.It Dv ( SEM_R>>3 ) +Read access for group. +.It Dv ( SEM_A>>3 ) +Alter access for group. +.It Dv ( SEM_R>>6 ) +Read access for other. +.It Dv ( SEM_A>>6 ) +Alter access for other. +.El +.Pp +If a new set of semaphores is being created, +.Fa nsems +is used to indicate the number of semaphores the set should contain. +Otherwise, +.Fa nsems +may be specified as 0. +.Sh RETURN VALUES +The +.Fn semget +system call +returns the id of a semaphore set if successful; otherwise, -1 +is returned and +.Va errno +is set to indicate the error. +.Sh ERRORS +The +.Fn semget +system call +will fail if: +.Bl -tag -width Er +.\" ipcperm could fail (we're opening to read and write, as it were) +.It Bq Er EACCES +Access permission failure. +.\" +.\" sysv_sem.c is quite explicit about these, so I'm pretty sure +.\" this is accurate +.\" +.It Bq Er EEXIST +IPC_CREAT and IPC_EXCL were specified, and a semaphore set +corresponding to +.Fa key +already exists. +.It Bq Er EINVAL +The number of semaphores requested exceeds the system imposed maximum +per set. +.It Bq Er ENOSPC +Not enough semaphores are available. +.It Bq Er ENOSPC +The kernel could not allocate a +.Fa "struct semid_ds" . +.It Bq Er ENOENT +No semaphore set was found corresponding to +.Fa key , +and IPC_CREAT was not specified. 
+.El +.Sh SEE ALSO +.Xr semctl 2 , +.Xr semop 2 , +.Xr ftok 3 diff -urN xnu-344.49/bsd/man/man2/semop.2 xnu-517/bsd/man/man2/semop.2 --- xnu-344.49/bsd/man/man2/semop.2 Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/man/man2/semop.2 Tue Oct 21 21:24:55 2003 @@ -0,0 +1,289 @@ +.\" +.\" Copyright (c) 1995 David Hovemeyer +.\" +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR +.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +.\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, +.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +.\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+.\" +.\" $FreeBSD: src/lib/libc/sys/semop.2,v 1.18 2003/01/25 21:27:37 alfred Exp $ +.\" +.Dd September 22, 1995 +.Dt SEMOP 2 +.Os +.Sh NAME +.Nm semop +.Nd atomic array of operations on a semaphore set +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/types.h +.In sys/ipc.h +.In sys/sem.h +.Ft int +.Fn semop "int semid" "struct sembuf *array" "size_t nops" +.Sh DESCRIPTION +The +.Fn semop +system call +atomically performs the array of operations indicated by +.Fa array +on the semaphore set indicated by +.Fa semid . +The length of +.Fa array +is indicated by +.Fa nops . +Each operation is encoded in a +.Vt "struct sembuf" , +which is defined as follows: +.Bd -literal +.\" +.\" From +.\" +struct sembuf { + u_short sem_num; /* semaphore # */ + short sem_op; /* semaphore operation */ + short sem_flg; /* operation flags */ +}; +.Ed +.Pp +For each element in +.Fa array , +.Va sem_op +and +.Va sem_flg +determine an operation to be performed on semaphore number +.Va sem_num +in the set. +The values +.Dv SEM_UNDO +and +.Dv IPC_NOWAIT +may be +.Em OR Ns 'ed +into the +.Va sem_flg +member in order to modify the behavior of the given operation. +.Pp +The operation performed depends as follows on the value of +.Va sem_op : +.\" +.\" This section is based on the description of semop() in +.\" Stevens, _Advanced Programming in the UNIX Environment_, +.\" and the semop(2) description in The Open Group Unix2 specification. +.\" +.Bl -bullet +.It +When +.Va sem_op +is positive and the process has alter permission, +the semaphore's value is incremented by +.Va sem_op Ns 's +value. +If +.Dv SEM_UNDO +is specified, the semaphore's adjust on exit value is decremented by +.Va sem_op Ns 's +value. +A positive value for +.Va sem_op +generally corresponds to a process releasing a resource +associated with the semaphore. 
+.It +The behavior when +.Va sem_op +is negative and the process has alter permission, +depends on the current value of the semaphore: +.Bl -bullet +.It +If the current value of the semaphore is greater than or equal to +the absolute value of +.Va sem_op , +then the value is decremented by the absolute value of +.Va sem_op . +If +.Dv SEM_UNDO +is specified, the semaphore's adjust on exit +value is incremented by the absolute value of +.Va sem_op . +.It +If the current value of the semaphore is less than the absolute value of +.Va sem_op , +one of the following happens: +.\" XXX a *second* sublist? +.Bl -bullet +.It +If +.Dv IPC_NOWAIT +was specified, then +.Fn semop +returns immediately with a return value of +.Er EAGAIN . +.It +Otherwise, the calling process is put to sleep until one of the following +conditions is satisfied: +.\" XXX We already have two sublists, why not a third? +.Bl -bullet +.It +Some other process removes the semaphore with the +.Dv IPC_RMID +option of +.Xr semctl 2 . +In this case, +.Fn semop +returns immediately with a return value of +.Er EIDRM . +.It +The process receives a signal that is to be caught. +In this case, the process will resume execution as defined by +.Xr sigaction 2 . +.It +The semaphore's +value is greater than or equal to the absolute value of +.Va sem_op . +When this condition becomes true, the semaphore's value is decremented +by the absolute value of +.Va sem_op , +and the semaphore's adjust on exit value is incremented by the +absolute value of +.Va sem_op . +.El +.El +.El +.Pp +A negative value for +.Va sem_op +generally means that a process is waiting for a resource to become +available. +.It +When +.Va sem_op +is zero and the process has read permission, +one of the following will occur: +.Bl -bullet +.It +If the current value of the semaphore is equal to zero +then +.Fn semop +can return immediately. +.It +If +.Dv IPC_NOWAIT +was specified, then +.Fn semop +returns immediately with a return value of +.Er EAGAIN . 
+.It +Otherwise, the calling process is put to sleep until one of the following +conditions is satisfied: +.\" XXX Another nested sublist +.Bl -bullet +.It +Some other process removes the semaphore with the +.Dv IPC_RMID +option of +.Xr semctl 2 . +In this case, +.Fn semop +returns immediately with a return value of +.Er EIDRM . +.It +The process receives a signal that is to be caught. +In this case, the process will resume execution as defined by +.Xr sigaction 2 . +.It +The semaphore's value becomes zero. +.El +.El +.El +.Pp +For each semaphore a process has in use, the kernel maintains an +.Dq "adjust on exit" +value, as alluded to earlier. +When a process +exits, either voluntarily or involuntarily, the adjust on exit value +for each semaphore is added to the semaphore's value. +This can +be used to ensure that a resource is released if a process terminates +unexpectedly. +.Sh RETURN VALUES +.Rv -std semop +.Sh ERRORS +The +.Fn semop +system call will fail if: +.Bl -tag -width Er +.It Bq Er EINVAL +No semaphore set corresponds to +.Fa semid , +or the process would exceed the system-defined limit for the number of +per-process +.Dv SEM_UNDO +structures. +.It Bq Er EACCES +Permission denied due to mismatch between operation and mode of +semaphore set. +.It Bq Er EAGAIN +The semaphore's value would have resulted in the process being put to sleep +and +.Dv IPC_NOWAIT +was specified. +.It Bq Er E2BIG +Too many operations were specified. +.Bq Dv SEMOPM +.It Bq Er EFBIG +.\" +.\" I'd have thought this would be EINVAL, but the source says +.\" EFBIG. +.\" +.Va sem_num +was not in the range of valid semaphores for the set. +.It Bq Er EIDRM +The semaphore set was removed from the system. +.It Bq Er EINTR +The +.Fn semop +system call was interrupted by a signal. +.It Bq Er ENOSPC +The system +.Dv SEM_UNDO +pool +.Bq Dv SEMMNU +is full. 
+.It Bq Er ERANGE +The requested operation would cause either +the semaphore's current value +.Bq Dv SEMVMX +or its adjust on exit value +.Bq Dv SEMAEM +to exceed the system-imposed limits. +.El +.Sh SEE ALSO +.Xr semctl 2 , +.Xr semget 2 , +.Xr sigaction 2 +.Sh BUGS +The +.Fn semop +system call +may block waiting for memory even if +.Dv IPC_NOWAIT +was specified. diff -urN xnu-344.49/bsd/man/man2/setpgid.2 xnu-517/bsd/man/man2/setpgid.2 --- xnu-344.49/bsd/man/man2/setpgid.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/setpgid.2 Tue Oct 21 21:24:55 2003 @@ -69,7 +69,7 @@ .Fn Setpgid will fail and the process group will not be altered if: .Bl -tag -width Er -.It Bq Er EACCESS +.It Bq Er EACCES The value of the .Fa pid argument matches the process ID of a child process of the calling process, diff -urN xnu-344.49/bsd/man/man2/shmat.2 xnu-517/bsd/man/man2/shmat.2 --- xnu-344.49/bsd/man/man2/shmat.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/shmat.2 Tue Oct 21 21:24:55 2003 @@ -62,11 +62,9 @@ (SHMLBA is defined in .Aq Pa sys/shm.h ). - A shared memory segment can be mapped read-only by specifying the SHM_RDONLY flag in .Fa shmflg . - .Fn shmdt unmaps the shared memory segment that is currently mapped at .Fa shmaddr @@ -91,7 +89,7 @@ .Fn shmat will fail if: .Bl -tag -width Er -.It Bq Er EACCESS +.It Bq Er EACCES The calling process has no permission to access this shared memory segment. .It Bq Er ENOMEM There is not enough available data space for the calling process to @@ -99,19 +97,18 @@ .It Bq Er EINVAL .Fa shmid is not a valid shared memory identifier. - .Fa shmaddr specifies an illegal address. .It Bq Er EMFILE The number of shared memory segments has reached the system-wide limit. .El - .Fn shmdt will fail if: .Bl -tag -width Er .It Bq Er EINVAL .Fa shmaddr is not the start address of a mapped shared memory segment. 
+.El .Sh SEE ALSO .Xr shmctl 2 , .Xr shmget 2 , diff -urN xnu-344.49/bsd/man/man2/shmctl.2 xnu-517/bsd/man/man2/shmctl.2 --- xnu-344.49/bsd/man/man2/shmctl.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/shmctl.2 Tue Oct 21 21:24:55 2003 @@ -48,13 +48,11 @@ system call performs some control operations on the shared memory area specified by .Fa shmid . - Each shared memory segment has a data structure associated with it, parts of which may be altered by .Fn shmctl and parts of which determine the actions of .Fn shmctl . - This structure is defined as follows in .Aq Pa sys/shm.h : .Bd -literal @@ -92,7 +90,6 @@ key_t key; /* user specified msg/sem/shm key */ }; .Ed - The operation to be performed by .Fn shmctl is specified in @@ -120,7 +117,6 @@ or .Va shm_perm.uid in the data structure associated with the shared memory segment. - .It Dv IPC_RMID Remove the shared memory segment specified by .Fa shmid @@ -131,7 +127,6 @@ .Va shm_perm.uid values in the data structure associated with the queue can do this. .El - The read and write permissions on a shared memory identifier are determined by the .Va shm_perm.mode @@ -164,18 +159,16 @@ or .Va shm_perm.cuid fields of the data structure associated with the shared memory segment. - An attempt is made to increase the value of .Va shm_qbytes through IPC_SET but the caller is not the super-user. -.It Bq Er EACCESS +.It Bq Er EACCES The command is IPC_STAT and the caller has no read permission for this shared memory segment. .It Bq Er EINVAL .Fa shmid is not a valid shared memory segment identifier. - .Va cmd is not a valid command. .It Bq Er EFAULT diff -urN xnu-344.49/bsd/man/man2/sigaction.2 xnu-517/bsd/man/man2/sigaction.2 --- xnu-344.49/bsd/man/man2/sigaction.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/sigaction.2 Tue Oct 21 21:24:55 2003 @@ -1,5 +1,3 @@ -.\" $NetBSD: sigaction.2,v 1.7 1995/10/12 15:41:16 jtc Exp $ -.\" .\" Copyright (c) 1980, 1990, 1993 .\" The Regents of the University of California. 
All rights reserved. .\" @@ -31,7 +29,8 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.\" @(#)sigaction.2 8.2 (Berkeley) 4/3/94 +.\" From: @(#)sigaction.2 8.2 (Berkeley) 4/3/94 +.\" $FreeBSD: src/lib/libc/sys/sigaction.2,v 1.48 2003/03/24 16:07:19 charnier Exp $ .\" .Dd April 3, 1994 .Dt SIGACTION 2 @@ -39,24 +38,36 @@ .Sh NAME .Nm sigaction .Nd software signal facilities +.Sh LIBRARY +.Lb libc .Sh SYNOPSIS -.Fd #include +.In signal.h .Bd -literal -struct sigaction { - void (*sa_handler)(); - sigset_t sa_mask; - int sa_flags; +struct sigaction { + union { + void (*__sa_handler)(int); + void (*__sa_sigaction)(int, struct __siginfo *, void *); + } __sigaction_u; /* signal handler */ + int sa_flags; /* see signal options below */ + sigset_t sa_mask; /* signal mask to apply */ }; + +#define sa_handler __sigaction_u.__sa_handler +#define sa_sigaction __sigaction_u.__sa_sigaction .Ed .Ft int -.Fn sigaction "int sig" "const struct sigaction *act" "struct sigaction *oact" +.Fo sigaction +.Fa "int sig" +.Fa "const struct sigaction * restrict act" +.Fa "struct sigaction * restrict oact" +.Fc .Sh DESCRIPTION The system defines a set of signals that may be delivered to a process. Signal delivery resembles the occurrence of a hardware interrupt: -the signal is blocked from further occurrence, the current process +the signal is normally blocked from further occurrence, the current process context is saved, and a new one is built. A process may specify a .Em handler -to which a signal is delivered, or specify that a signal is to be +to which a signal is delivered, or specify that a signal is to be .Em ignored . A process may also specify that a default action is to be taken by the system when a signal occurs. @@ -71,11 +82,11 @@ so that signals are taken on a special .Em "signal stack" . 
.Pp -Signal routines execute with the signal that caused their +Signal routines normally execute with the signal that caused their invocation .Em blocked , but other signals may yet occur. -A global +A global .Em "signal mask" defines the set of signals currently blocked from delivery to a process. The signal mask for a process is initialized @@ -99,10 +110,10 @@ before their first instructions. The set of pending signals is returned by the .Xr sigpending 2 -function. +system call. When a caught signal is delivered, the current state of the process is saved, -a new signal mask is calculated (as described below), +a new signal mask is calculated (as described below), and the signal handler is invoked. The call to the handler is arranged so that if the signal handling routine returns normally the process will resume execution in the context @@ -113,31 +124,34 @@ When a signal is delivered to a process a new signal mask is installed for the duration of the process' signal handler (or until a -.Xr sigprocmask -call is made). +.Xr sigprocmask 2 +system call is made). This mask is formed by taking the union of the current signal mask set, -the signal to be delivered, and +the signal to be delivered, and the signal mask associated with the handler to be invoked. .Pp -.Fn Sigaction -assigns an action for a specific signal. +The +.Fn sigaction +system call +assigns an action for a signal specified by +.Fa sig . If .Fa act is non-zero, it specifies an action -.Pf ( Dv SIG_DFL , +.Dv ( SIG_DFL , .Dv SIG_IGN , or a handler routine) and mask to be used when delivering the specified signal. -If +If .Fa oact is non-zero, the previous handling information for the signal is returned to the user. .Pp -Once a signal handler is installed, it remains installed +Once a signal handler is installed, it normally remains installed until another .Fn sigaction -call is made, or an +system call is made, or an .Xr execve 2 is performed. 
A signal-specific default action may be reset by @@ -163,10 +177,11 @@ of the signal are ignored and discarded. .Pp Options may be specified by setting -.Em sa_flags . -If the -.Dv SA_NOCLDSTOP -bit is set when installing a catching function +.Va sa_flags . +The meaning of the various bits is as follows: +.Bl -tag -offset indent -width SA_RESETHANDXX +.It Dv SA_NOCLDSTOP +If this bit is set when installing a catching function for the .Dv SIGCHLD signal, @@ -174,40 +189,56 @@ .Dv SIGCHLD signal will be generated only when a child process exits, not when a child process stops. -Further, if the -.Dv SA_ONSTACK -bit is set in -.Em sa_flags , -the system will deliver the signal to the process on a +.It Dv SA_NOCLDWAIT +If this bit is set when calling +.Fn sigaction +for the +.Dv SIGCHLD +signal, the system will not create zombie processes when children of +the calling process exit. If the calling process subsequently issues +a +.Xr wait 2 +(or equivalent), it blocks until all of the calling process's child +processes terminate, and then returns a value of -1 with errno set to +.Er ECHILD . +.It Dv SA_ONSTACK +If this bit is set, the system will deliver the signal to the process +on a .Em "signal stack" , specified with -.Xr sigstack 2 . -.Pp -Finally, the -.Dv SA_SIGINFO -option causes the 2nd argument for the signal handler to be a pointer -to a -.Em siginfo_t -as described in -.Pa . -The -.Em siginfo_t -is a part of -.St -p1003.1b . -and provides much more information about the causes and -attributes of the signal that is being delivered. +.Xr sigaltstack 2 . +.It Dv SA_NODEFER +If this bit is set, further occurrences of the delivered signal are +not masked during the execution of the handler. +.It Dv SA_RESETHAND +If this bit is set, the handler is reset back to +.Dv SIG_DFL +at the moment the signal is delivered. +.It Dv SA_RESTART +See paragraph below. 
+.It Dv SA_SIGINFO +If this bit is set, the handler function is assumed to be pointed to by the +.Dv sa_sigaction +member of struct sigaction and should match the prototype shown above or as +below in +.Sx EXAMPLES . +This bit should not be set when assigning +.Dv SIG_DFL +or +.Dv SIG_IGN . +.El .Pp If a signal is caught during the system calls listed below, the call may be forced to terminate with the error -.Dv EINTR , +.Er EINTR , the call may return with a data transfer shorter than requested, or the call may be restarted. Restart of pending calls is requested by setting the .Dv SA_RESTART bit in -.Ar sa_flags . +.Va sa_flags . The affected system calls include .Xr open 2 , .Xr read 2 , @@ -233,8 +264,9 @@ all signals, the signal mask, the signal stack, and the restart/interrupt flags are inherited by the child. .Pp -.Xr Execve 2 -reinstates the default +The +.Xr execve 2 +system call reinstates the default action for all signals which were caught and resets all signals to be caught on the user stack. 
Ignored signals remain ignored; @@ -245,13 +277,13 @@ with names as in the include file .Aq Pa signal.h : .Bl -column SIGVTALARMXX "create core imagexxx" -.It Sy " NAME " " Default Action " " Description" +.It Sy "NAME Default Action Description" .It Dv SIGHUP No " terminate process" " terminal line hangup" .It Dv SIGINT No " terminate process" " interrupt program" .It Dv SIGQUIT No " create core image" " quit program" .It Dv SIGILL No " create core image" " illegal instruction" .It Dv SIGTRAP No " create core image" " trace trap" -.It Dv SIGABRT No " create core image" Xr abort 2 +.It Dv SIGABRT No " create core image" Ta Xr abort 3 call (formerly .Dv SIGIOT ) .It Dv SIGEMT No " create core image" " emulate instruction executed" @@ -259,7 +291,7 @@ .It Dv SIGKILL No " terminate process" " kill program" .It Dv SIGBUS No " create core image" " bus error" .It Dv SIGSEGV No " create core image" " segmentation violation" -.It Dv SIGSYS No " create core image" " system call given invalid argument" +.It Dv SIGSYS No " create core image" " non-existent system call invoked" .It Dv SIGPIPE No " terminate process" " write on a pipe with no reader" .It Dv SIGALRM No " terminate process" " real-time timer expired" .It Dv SIGTERM No " terminate process" " software termination signal" @@ -287,62 +319,232 @@ .It Dv SIGUSR2 No " terminate process" " User defined signal 2" .El .Sh NOTE -The mask specified in +The +.Fa sa_mask +field specified in .Fa act is not allowed to block .Dv SIGKILL or .Dv SIGSTOP . -This is done silently by the system. -.Sh RETURN VALUES -A 0 value indicated that the call succeeded. A \-1 return value -indicates an error occurred and +Any attempt to do so will be silently ignored. +.Pp +The following functions are either reentrant or not interruptible +by signals and are async-signal safe. 
+Therefore applications may +invoke them, without restriction, from signal-catching functions: +.Pp +Base Interfaces: +.Pp +.Fn _exit , +.Fn access , +.Fn alarm , +.Fn cfgetispeed , +.Fn cfgetospeed , +.Fn cfsetispeed , +.Fn cfsetospeed , +.Fn chdir , +.Fn chmod , +.Fn chown , +.Fn close , +.Fn creat , +.Fn dup , +.Fn dup2 , +.Fn execle , +.Fn execve , +.Fn fcntl , +.Fn fork , +.Fn fpathconf , +.Fn fstat , +.Fn fsync , +.Fn getegid , +.Fn geteuid , +.Fn getgid , +.Fn getgroups , +.Fn getpgrp , +.Fn getpid , +.Fn getppid , +.Fn getuid , +.Fn kill , +.Fn link , +.Fn lseek , +.Fn mkdir , +.Fn mkfifo , +.Fn open , +.Fn pathconf , +.Fn pause , +.Fn pipe , +.Fn raise , +.Fn read , +.Fn rename , +.Fn rmdir , +.Fn setgid , +.Fn setpgid , +.Fn setsid , +.Fn setuid , +.Fn sigaction , +.Fn sigaddset , +.Fn sigdelset , +.Fn sigemptyset , +.Fn sigfillset , +.Fn sigismember , +.Fn signal , +.Fn sigpending , +.Fn sigprocmask , +.Fn sigsuspend , +.Fn sleep , +.Fn stat , +.Fn sysconf , +.Fn tcdrain , +.Fn tcflow , +.Fn tcflush , +.Fn tcgetattr , +.Fn tcgetpgrp , +.Fn tcsendbreak , +.Fn tcsetattr , +.Fn tcsetpgrp , +.Fn time , +.Fn times , +.Fn umask , +.Fn uname , +.Fn unlink , +.Fn utime , +.Fn wait , +.Fn waitpid , +.Fn write . +.Pp +Realtime Interfaces: +.Pp +.Fn aio_error , +.Fn clock_gettime , +.Fn sigpause , +.Fn timer_getoverrun , +.Fn aio_return , +.Fn fdatasync , +.Fn sigqueue , +.Fn timer_gettime , +.Fn aio_suspend , +.Fn sem_post , +.Fn sigset , +.Fn timer_settime . +.Pp +ANSI C Interfaces: +.Pp +.Fn strcpy , +.Fn strcat , +.Fn strncpy , +.Fn strncat , +and perhaps some others. +.Pp +Extension Interfaces: +.Pp +.Fn strlcpy , +.Fn strlcat . +.Pp +All functions not in the above lists are considered to be unsafe +with respect to signals. That is to say, the behaviour of such +functions when called from a signal handler is undefined. +In general though, signal handlers should do little more than set a +flag; most other actions are not safe. 
+.Pp +Also, it is good practice to make a copy of the global variable .Va errno -is set to indicated the reason. -.Sh EXAMPLE -The handler routine can be declared: -.Bd -literal -offset indent -void handler(sig, sip, scp) -int sig; -siginfo_t *sip; -struct sigcontext *scp; -.Ed +and restore it before returning from the signal handler. +This protects against the side effect of +.Va errno +being set by functions called from inside the signal handler. +.Sh RETURN VALUES +.Rv -std sigaction +.Sh EXAMPLES +There are three possible prototypes the handler may match: +.Bl -tag -offset indent -width short +.It ANSI C: +.Ft void +.Fn handler int ; +.It POSIX SA_SIGINFO: +.Ft void +.Fn handler int "siginfo_t *info" "ucontext_t *uap" ; +.El +.Pp +The handler function should match the SA_SIGINFO prototype if the +SA_SIGINFO bit is set in flags. +It then should be pointed to by the +.Dv sa_sigaction +member of +.Dv struct sigaction . +Note that you should not assign SIG_DFL or SIG_IGN this way. +.Pp +If the SA_SIGINFO flag is not set, the handler function should match +either the ANSI C or traditional +.Bx +prototype and be pointed to by +the +.Dv sa_handler +member of +.Dv struct sigaction . +In practice, +.Fx +always sends the three arguments of the latter and since the ANSI C +prototype is a subset, both will work. +The +.Dv sa_handler +member declaration in +.Fx +include files is that of ANSI C (as required by POSIX), +so a function pointer of a +.Bx Ns -style +function needs to be cast to +compile without warning. +The traditional +.Bx +style is not portable and since its capabilities +are a full subset of a SA_SIGINFO handler, +its use is deprecated. .Pp -Here +The .Fa sig -is the signal number, into which the hardware faults and traps are -mapped. -If the -.Dv SA_SIGINFO -option is set, -.Fa sip -is a pointer to a -.Dv siginfo_t -as described in -.Pa . -If -.Dv SA_SIGINFO -is not set, this is NULL.
-.Fa Scp -is a pointer to the -.Fa sigcontext -structure (defined in -.Aq Pa signal.h ) , -used to restore the context from before the signal. +argument is the signal number, one of the +.Dv SIG... +values from <signal.h>. +.Pp +The +.Fa code +argument of the +.Bx Ns -style +handler and the +.Dv si_code +member of the +.Dv info +argument to a SA_SIGINFO handler contain a numeric code explaining the +cause of the signal, usually one of the +.Dv SI_... +values from +<signal.h> or codes specific to a signal, i.e. one of the +.Dv FPE_... +values for SIGFPE. +.Pp +The +.Fa uap +argument to a POSIX SA_SIGINFO handler points to an instance of +ucontext_t. .Sh ERRORS -.Fn Sigaction +The +.Fn sigaction +system call will fail and no new signal handler will be installed if one of the following occurs: .Bl -tag -width Er .It Bq Er EFAULT Either .Fa act -or +or .Fa oact points to memory that is not a valid part of the process address space. .It Bq Er EINVAL -.Fa Sig +The +.Fa sig +argument is not a valid signal number. .It Bq Er EINVAL An attempt is made to ignore or supply a handler for @@ -352,9 +554,9 @@ .El .Sh STANDARDS The -.Nm sigaction -function is defined by -.St -p1003.1-88 . +.Fn sigaction +system call is expected to conform to +.St -p1003.1-90 . The .Dv SA_ONSTACK and @@ -375,21 +577,36 @@ and .Dv SIGINFO . Those signals are available on most -.Tn BSD Ns \-derived +.Bx Ns \-derived systems. +The +.Dv SA_NODEFER +and +.Dv SA_RESETHAND +flags are intended for backwards compatibility with other operating +systems. The +.Dv SA_NOCLDSTOP , +and +.Dv SA_NOCLDWAIT +.\" and +.\" SA_SIGINFO +flags are options commonly found in other operating systems.
.Sh SEE ALSO .Xr kill 1 , -.Xr ptrace 2 , .Xr kill 2 , -.Xr sigaction 2 , -.Xr sigprocmask 2 , -.Xr sigsuspend 2 , +.Xr ptrace 2 , +.Xr sigaltstack 2 , .Xr sigblock 2 , -.Xr sigsetmask 2 , .Xr sigpause 2 , -.Xr sigstack 2 , -.Xr sigvec 3 , +.Xr sigpending 2 , +.Xr sigprocmask 2 , +.Xr sigsetmask 2 , +.Xr sigsuspend 2 , +.Xr sigvec 2 , +.Xr wait 2 , +.Xr fpsetmask 3 , .Xr setjmp 3 , .Xr siginterrupt 3 , .Xr sigsetops 3 , +.Xr ucontext 3 , .Xr tty 4 diff -urN xnu-344.49/bsd/man/man2/socket.2 xnu-517/bsd/man/man2/socket.2 --- xnu-344.49/bsd/man/man2/socket.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/socket.2 Tue Oct 21 21:24:55 2003 @@ -221,7 +221,7 @@ The per-process descriptor table is full. .It Bq Er ENFILE The system file table is full. -.It Bq Er EACCESS +.It Bq Er EACCES Permission to create a socket of the specified type and/or protocol is denied. .It Bq Er ENOBUFS diff -urN xnu-344.49/bsd/man/man2/statfs.2 xnu-517/bsd/man/man2/statfs.2 --- xnu-344.49/bsd/man/man2/statfs.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/statfs.2 Tue Oct 21 21:24:55 2003 @@ -56,12 +56,12 @@ .Bd -literal typedef struct { int32_t val[2]; } fsid_t; -#define MFSNAMELEN 16 /* length of fs type name, including nul */ -#define MNAMELEN 32 /* length of buffer for returned name */ +#define MFSNAMELEN 15 /* length of fs type name, not inc. 
nul */ +#define MNAMELEN 90 /* length of buffer for returned name */ struct statfs { - short f_type; /* type of file system (unused; zero) */ - short f_flags; /* copy of mount flags */ + short f_otype; /* type of file system (reserved: zero) */ + short f_oflags; /* copy of mount flags (reserved: zero) */ long f_bsize; /* fundamental file system block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in file system */ @@ -71,10 +71,15 @@ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* file system id (super-user only) */ uid_t f_owner; /* user that mounted the file system */ - long f_spare[4]; /* spare for later */ + short f_reserved1; /* reserved for future use */ + short f_type; /* type of file system (reserved) */ + long f_flags; /* copy of mount flags (reserved) */ + long f_reserved2[2]; /* reserved for future use */ char f_fstypename[MFSNAMELEN]; /* fs type name */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ char f_mntfromname[MNAMELEN]; /* mounted file system */ + char f_reserved3; /* reserved for future use */ + long f_reserved4[4]; /* reserved for future use */ }; .Ed .Pp diff -urN xnu-344.49/bsd/man/man2/wait.2 xnu-517/bsd/man/man2/wait.2 --- xnu-344.49/bsd/man/man2/wait.2 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man2/wait.2 Tue Oct 21 21:24:55 2003 @@ -158,7 +158,7 @@ True if the process terminated normally by a call to .Xr _exit 2 or -.Xr exit 2 . +.Xr exit 3 . .It Fn WIFSIGNALED status True if the process terminated due to receipt of a signal. .It Fn WIFSTOPPED status @@ -180,7 +180,7 @@ of the argument passed to .Xr _exit 2 or -.Xr exit 2 +.Xr exit 3 by the child. .It Fn WTERMSIG status If @@ -293,7 +293,7 @@ .Fn wait call are extensions to the POSIX interface. 
.Sh SEE ALSO -.Xr exit 2 , +.Xr exit 3 , .Xr sigaction 2 .Sh HISTORY A diff -urN xnu-344.49/bsd/man/man4/Makefile xnu-517/bsd/man/man4/Makefile --- xnu-344.49/bsd/man/man4/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man4/Makefile Tue Oct 21 21:24:55 2003 @@ -29,7 +29,6 @@ pty.4 \ random.4 \ route.4 \ - scsi.4 \ stderr.4 \ stdin.4 \ stdout.4 \ diff -urN xnu-344.49/bsd/man/man4/icmp.4 xnu-517/bsd/man/man4/icmp.4 --- xnu-344.49/bsd/man/man4/icmp.4 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man4/icmp.4 Tue Oct 21 21:24:55 2003 @@ -94,7 +94,7 @@ .Dv SOCK_DGRAM socket type without requiring root privileges. The synopsis is the following: .Pp -.Fn socket AF_INET SOCK_DGRAM IPPROTO_IP +.Fn socket AF_INET SOCK_DGRAM IPPROTO_ICMP .Pp This can be used by non root privileged processes to send .Tn ICMP diff -urN xnu-344.49/bsd/man/man4/scsi.4 xnu-517/bsd/man/man4/scsi.4 --- xnu-344.49/bsd/man/man4/scsi.4 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man4/scsi.4 Thu Jan 1 01:00:00 1970 @@ -1,156 +0,0 @@ -.\" $OpenBSD: scsi.4,v 1.1 1996/08/04 20:28:20 tholo Exp $ -.\" -.Dd August 4, 1996 -.Dt SD 4 -.Os OpenBSD -.Sh NAME -.Nm scsi -.Nd scsi system -.Sh SYNOPSIS -.Nm scsibus* at aha? -.Nm scsibus* at ncr? -.Nm device cd* at scsibus? target ? lun ? -.Nm device ch* at scsibus? target ? lun ? -.Nm device sd* at scsibus? target ? lun ? -.Nm device st* at scsibus? target ? lun ? -.Nm device ss* at scsibus? target ? lun ? -.Nm device su* at scsibus? target ? lun ? -.Nm device uk* at scsibus? target ? lun ? -.Sh DESCRIPTION -The -.Em scsi -system provides a uniform and modular system for the implementation -of drivers to control various scsi devices, and to utilize different -scsi host adapters through host adapter drivers. When the system probes the -.Em SCSI -busses, it attaches any devices it finds to the appropriate -drivers. 
If no driver seems appropriate, then it attaches the device to the -uk (unknown) driver so that user level scsi ioctls may -still be performed against the device. -.Sh KERNEL CONFIGURATION -The option SCSIDEBUG enables the debug ioctl. -.Pp -All devices and the SCSI busses support boot time allocation so that -an upper number of devices and controllers does not need to be configured; -.Em "device sd* at scsibus? target ? lun ?" -will suffice for any number of disk drivers. -.Pp -The devices are either -.Em wired -so they appear as a particular device unit or -.Em counted -so that they appear as the next available unused unit. -.Pp -To configure a driver in the kernel without wiring down the device use a -config line similar to -.Em "device ch* at scsibus? target ? lun ?" -to include the changer driver. -.Pp -To wire down a unit use a config line similar to -.Em "device ch1 at scsibus0 target 4 lun 0" -to assign changer 1 as the changer with SCSI ID 4, -SCSI logical unit 0 on SCSI bus 0. -Individual scsibuses can be wired down to specific controllers with -a config line similar to -.Em "scsibus0 at ahc0" -which assigns scsi bus 0 to the first unit using the ahc driver. -For controllers supporting more than one bus, -the particular bus can be specified as in -.Em "scsibus3 at ahc1 bus 1" -which assigns scsibus 1 to the second bus probed on the ahc1 device. -.Pp -When you have a mixture of wired down and counted devices then the -counting begins with the first non-wired down unit for a particular -type. That is, if you have a disk wired down as -.Em "disk sd1 at scsibus? target ? lun ?" , -then the first non-wired disk shall come on line as -.Em sd2 . -.Sh IOCTLS -There are a number of ioctls that work on any -.Em SCSI -device. They are defined in -.Em sys/scsiio.h -and can be applied against any scsi device that permits them. -For the tape, it must be applied against the control -device. 
See the manual page for each device type for more information about -how generic scsi ioctls may be applied to a specific device. -.Bl -tag -width DIOCSDINFO____ -.It Dv SCIOCRESET* -reset a device. -.It Dv SCIOCDEBUG -Turn on debugging.. All scsi operations originating from this device's driver -will be traced to the console, along with other information. Debugging is -controlled by four bits, described in the header file. If no debugging is -configured into the kernel, debugging will have no effect. -.Em SCSI -debugging is controlled by the configuration option -.Em SCSIDEBUG. -.It Dv SCIOCCOMMAND -Take a scsi command and data from a user process and apply them to the scsi -device. Return all status information and return data to the process. The -ioctl will return a successful status even if the device rejected the -command. As all status is returned to the user, it is up to the user -process to examine this information to decide the success of the command. -.It Dv SCIOCREPROBE -Ask the system to probe the scsi busses for any new devices. If it finds -any, they will be attached to the appropriate drivers. The search can be -narrowed to a specific bus, target or lun. The new device may or may not -be related to the device on which the ioctl was performed. -.It Dv SCIOCIDENTIFY -Ask the driver what it's bus, target and lun are. -.It Dv SCIOCDECONFIG -Ask the device to disappear. This may not happen if the device is in use. -.El -.Sh NOTES -the generic scsi part of the system is still being mapped out. -Watch this space for changes. -.Pp - A device by the name of su (scsi_user) -(e.g su0-0-0) will map bus, target and lun to minor numbers. It has not -yet decided yet whether this device will be able to open a device that is -already controlled by an explicit driver. -.Sh ADAPTERS -The system allows common device drivers to work through many different -types of adapters. 
The adapters take requests from the upper layers and do -all IO between the -.Em SCSI -bus and the system. The maximum size of a transfer is governed by the -adapter. Most adapters can transfer 64KB in a single operation, however -many can transfer larger amounts. -.Sh TARGET MODE -Some adapters support -.Em target mode -in which the system is capable of operating as a device, responding to -operations initiated by another system. Target mode will be supported for -some adapters, but is not yet complete for this version of the scsi system. -.Sh DIAGNOSTICS -When the kernel is compiled with option SCSIDEBUG, the SCIOCDEBUG ioctl -can be used to enable various amounts of tracing information on any -specific device. Devices not being traced will not produce trace information. -The four bits that make up the debug level, each control certain types -of debugging information. -.Bl -tag -width "Bit 0" -.It Dv Bit 0 -shows all scsi bus operations including scsi commands, -error information and the first 48 bytes of any data transferred. -.It Dv Bit 1 -shows routines called. -.It Dv Bit 2 -shows information about what branches are taken and often some -of the return values of functions. -.It Dv Bit 3 -shows more detailed information including DMA scatter-gather logs. -.El -.Sh SEE ALSO -.Xr ch 4 , -.Xr cd 4 , -.Xr sd 4 , -.Xr ss 4 , -.Xr st 4 , -.Xr su 4 -and -.Xr uk 4 -.Sh HISTORY -This -.Nm -system appeared in MACH 2.5 at TRW. diff -urN xnu-344.49/bsd/man/man5/core.5 xnu-517/bsd/man/man5/core.5 --- xnu-344.49/bsd/man/man5/core.5 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man5/core.5 Tue Oct 21 21:24:55 2003 @@ -41,17 +41,25 @@ The core file consists of the .Pa Xr Mach-O 5 header as described in the -.Aq Pa sys/loader.h +.Aq Pa mach-o/loader.h file. The remainder of the core file consists of various sections described in the .Xr Mach-O 5 header. +.Sh NOTE +Core dumps are disabled by default under Darwin/Mac OS X. 
To re-enable core dumps, a +privileged user must edit +.Pa /etc/hostconfig +to contain the line: +.Bd -literal +COREDUMPS=-YES- +.Ed .Sh SEE ALSO .Xr gdb 1 , .Xr setrlimit 2 , .Xr sigaction 2 , -.Xr Mach-O 5, +.Xr Mach-O 5 , .Xr sysctl 8 .Sh HISTORY A diff -urN xnu-344.49/bsd/man/man5/dir.5 xnu-517/bsd/man/man5/dir.5 --- xnu-344.49/bsd/man/man5/dir.5 Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man5/dir.5 Tue Oct 21 21:24:55 2003 @@ -82,64 +82,120 @@ .Xr mount 8 . ) .Pp The directory entry format is defined in the file +.Aq sys/dirent.h +and further in the file .Aq dirent.h : .Bd -literal -#ifndef _DIRENT_H_ -#define _DIRENT_H_ - +/*** Excerpt from <sys/dirent.h> ***/ /* -* A directory entry has a struct dirent at the front of it, containing its -* inode number, the length of the entry, and the length of the name -* contained in the entry. These are followed by the name padded to a 4 -* byte boundary with null bytes. All names are guaranteed null terminated. -* The maximum length of a name in a directory is MAXNAMLEN. -*/ + * The dirent structure defines the format of directory entries returned by + * the getdirentries(2) system call. + * + * A directory entry has a struct dirent at the front of it, containing its + * inode number, the length of the entry, and the length of the name + * contained in the entry. These are followed by the name padded to a 4 + * byte boundary with null bytes. All names are guaranteed null terminated. + * The maximum length of a name in a directory is MAXNAMLEN. + * The dirent structure defines the format of directory entries returned by + * the getdirentries(2) system call.
+ */ + +#ifndef _SYS_DIRENT_H +#define _SYS_DIRENT_H struct dirent { - u_long d_fileno; /* file number of entry */ - u_short d_reclen; /* length of this record */ - u_short d_namlen; /* length of string in d_name */ + u_int32_t d_fileno; /* file number of entry */ + u_int16_t d_reclen; /* length of this record */ + u_int8_t d_type; /* file type, see below */ + u_int8_t d_namlen; /* length of string in d_name */ #ifdef _POSIX_SOURCE - char d_name[MAXNAMLEN + 1]; /* maximum name length */ + char d_name[255 + 1]; /* name must be no longer than this */ #else #define MAXNAMLEN 255 - char d_name[MAXNAMLEN + 1]; /* maximum name length */ + char d_name[MAXNAMLEN + 1]; /* name must be no longer than this */ #endif - }; +/* + * File types + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +#endif /* !_SYS_DIRENT_H_ */ + +.Ed +----------------------------------------- +.Bd -literal +/*** Excerpt from <dirent.h> ***/ + +#ifndef _DIRENT_H +#define _DIRENT_H + #ifdef _POSIX_SOURCE -typedef void * DIR; +typedef void * DIR; #else -#define d_ino d_fileno /* backward compatibility */ +#define d_ino d_fileno /* backward compatibility */ /* definitions for library routines operating on directories. */ -#define DIRBLKSIZ 1024 +#define DIRBLKSIZ 1024 + +struct _telldir; /* see telldir.h */ /* structure describing an open directory.
*/ typedef struct _dirdesc { - int dd_fd; /* file descriptor associated with directory */ - long dd_loc; /* offset in current buffer */ - long dd_size; /* amount of data returned by getdirentries */ - char *dd_buf; /* data buffer */ - int dd_len; /* size of data buffer */ - long dd_seek; /* magic cookie returned by getdirentries */ + int dd_fd; /* file descriptor associated with directory */ + long dd_loc; /* offset in current buffer */ + long dd_size; /* amount of data returned by getdirentries */ + char *dd_buf; /* data buffer */ + int dd_len; /* size of data buffer */ + long dd_seek; /* magic cookie returned by getdirentries */ + long dd_rewind; /* magic cookie for rewinding */ + int dd_flags; /* flags for readdir */ + pthread_mutex_t dd_lock; /* for thread locking */ + struct _telldir *dd_td; /* telldir position recording */ } DIR; -#define dirfd(dirp) ((dirp)->dd_fd) +#define dirfd(dirp) ((dirp)->dd_fd) -#ifndef NULL -#define NULL 0 -#endif +/* flags for opendir2 */ +#define DTF_HIDEW 0x0001 /* hide whiteout entries */ +#define DTF_NODUP 0x0002 /* don't return duplicate names */ +/* structure describing an open directory. 
*/ +typedef struct _dirdesc { + int dd_fd; /* file descriptor associated with directory */ + long dd_loc; /* offset in current buffer */ + long dd_size; /* amount of data returned by getdirentries */ + char *dd_buf; /* data buffer */ + int dd_len; /* size of data buffer */ + long dd_seek; /* magic cookie returned by getdirentries */ + long dd_rewind; /* magic cookie for rewinding */ + int dd_flags; /* flags for readdir */ + pthread_mutex_t dd_lock; /* for thread locking */ + struct _telldir *dd_td; /* telldir position recording */ +} DIR; -#endif /* _POSIX_SOURCE */ +#define dirfd(dirp) ((dirp)->dd_fd) -#ifndef _KERNEL +/* flags for opendir2 */ +#define DTF_HIDEW 0x0001 /* hide whiteout entries */ +#define DTF_NODUP 0x0002 /* don't return duplicate names */ +#define DTF_REWIND 0x0004 /* rewind after reading union stack */ +#define __DTF_READALL 0x0008 /* everything has been read */ -#include +#ifndef NULL +#define NULL 0 +#endif -#endif /* !_KERNEL */ +#endif /* _POSIX_SOURCE */ #endif /* !_DIRENT_H_ */ .Ed diff -urN xnu-344.49/bsd/man/man9/Makefile xnu-517/bsd/man/man9/Makefile --- xnu-344.49/bsd/man/man9/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/man/man9/Makefile Tue Oct 21 21:24:55 2003 @@ -11,6 +11,7 @@ fetch.9 \ store.9 \ style.9 \ + intro.9 INSTALL_MAN_LIST = ${DATAFILES} diff -urN xnu-344.49/bsd/man/man9/intro.9 xnu-517/bsd/man/man9/intro.9 --- xnu-344.49/bsd/man/man9/intro.9 Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/man/man9/intro.9 Tue Oct 21 21:24:55 2003 @@ -0,0 +1,109 @@ +.\" Copyright (c) 1983, 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD: src/share/man/man9/intro.9,v 1.15 2001/07/14 19:41:16 schweikh Exp $ +.\" +.Dd December 13, 1995 +.Dt INTRO 9 +.Os +.Sh NAME +.Nm intro +.Nd "introduction to system kernel interfaces" +.Sh DESCRIPTION +This section contains information about the interfaces and +subroutines in the kernel. +.Sh PROTOTYPES ANSI-C AND ALL THAT +Yes please. +.Pp +We would like all code to be fully prototyped. +.Pp +If your code compiles cleanly with +.Nm cc +.Ar -Wall +we would feel happy about it. 
+It is important to understand that this isn't a question of just shutting up +.Nm cc , +it is a question about avoiding the things it complains about. +To put it bluntly, don't hide the problem by casting and other +obfuscating practices, solve the problem. +.Sh INDENTATION AND STYLE +Believe it or not, there actually exists a guide for indentation and style. +It isn't generally applied though. +.Pp +We would appreciate if people would pay attention to it, and at least not +violate it blatantly. +.Pp +We don't mind it too badly if you have your own style, but please make +sure we can read it too. +.Pp +Please take time to read +.Xr style 9 +for more information. +.Sh NAMING THINGS +Some general rules exist: +.Bl -enum +.It +If a function is meant as a debugging aid in DDB, it should be enclosed +in +.Bd -literal -offset indent +#ifdef DDB + +#endif /* DDB */ +.Ed +.Pp +And the name of the procedure should start with the prefix +.Li DDB_ +to clearly identify the procedure as a debugger routine. +.El +.Sh SCOPE OF SYMBOLS +It is important to carefully consider the scope of symbols in the kernel. +The default is to make everything static, unless some reason requires +the opposite. +.Pp +There are several reasons for this policy, +the main one is that the kernel is one monolithic name-space, +and pollution is not a good idea here either. +.Pp +For device drivers and other modules that don't add new internal interfaces +to the kernel, the entire source should be in one file if possible. +That way all symbols can be made static. +.Pp +If for some reason a module is split over multiple source files, then try +to split the module along some major fault-line and consider using the +number of global symbols as your guide. +The fewer the better. +.Sh SEE ALSO +.Xr style 9 +.Sh HISTORY +The +.Nm +section manual page appeared in +.Fx 2.2 . 
diff -urN xnu-344.49/bsd/miscfs/Makefile xnu-517/bsd/miscfs/Makefile --- xnu-344.49/bsd/miscfs/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/miscfs/Makefile Tue Dec 30 19:36:22 2003 @@ -11,7 +11,7 @@ devfs \ fdesc \ specfs \ - union + union INSTINC_SUBDIRS_PPC = \ @@ -21,7 +21,7 @@ devfs \ fdesc \ specfs \ - union + union EXPINC_SUBDIRS_PPC = \ diff -urN xnu-344.49/bsd/miscfs/devfs/devfs_tree.c xnu-517/bsd/miscfs/devfs/devfs_tree.c --- xnu-344.49/bsd/miscfs/devfs/devfs_tree.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/devfs/devfs_tree.c Sat Oct 25 00:25:25 2003 @@ -99,7 +99,7 @@ #ifdef HIDDEN_MOUNTPOINT static struct mount *devfs_hidden_mount; -#endif HIDDEN_MOINTPOINT +#endif /* HIDDEN_MOINTPOINT */ static int devfs_ready = 0; @@ -137,7 +137,7 @@ devfs_mount(devfs_hidden_mount,"dummy",NULL,NULL,NULL); dev_root->de_dnp->dn_dvm = (struct devfsmount *)devfs_hidden_mount->mnt_data; -#endif HIDDEN_MOUNTPOINT +#endif /* HIDDEN_MOUNTPOINT */ devfs_ready = 1; return (0); } @@ -287,7 +287,7 @@ return 0; } } -#endif 0 +#endif /***********************************************************************\ * Given a starting node (0 for root) and a pathname, return the node * * for the end item on the path. It MUST BE A DIRECTORY. If the 'CREATE' * @@ -338,6 +338,7 @@ scan++; strncpy(component, start, scan - start); + component[ scan - start ] = '\0'; if (*scan == '/') scan++; @@ -670,14 +671,14 @@ if (dnp->dn_vn == NULL) { #if 0 printf("devfs_dn_free: free'ing %x\n", (unsigned int)dnp); -#endif 0 +#endif devnode_free(dnp); /* no accesses/references */ } else { #if 0 printf("devfs_dn_free: marking %x for deletion\n", (unsigned int)dnp); -#endif 0 +#endif dnp->dn_delete = TRUE; } } diff -urN xnu-344.49/bsd/miscfs/devfs/devfs_vfsops.c xnu-517/bsd/miscfs/devfs/devfs_vfsops.c --- xnu-344.49/bsd/miscfs/devfs/devfs_vfsops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/devfs/devfs_vfsops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. 
All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -90,7 +90,6 @@ if (devfs_sinit()) return (EOPNOTSUPP); - printf("devfs enabled\n"); devfs_make_node(makedev(0, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0622, "console"); devfs_make_node(makedev(2, 0), DEVFS_CHAR, @@ -379,7 +378,8 @@ /* * Allocate and initialize the filesystem. */ - mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + MALLOC_ZONE(mp, struct mount *, (u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); /* Initialize the default IO constraints */ @@ -406,12 +406,15 @@ if (error) { printf("devfs_kernel_mount: mount %s failed: %d", mntname, error); mp->mnt_vfc->vfc_refcount--; + + if (mp->mnt_kern_flag & MNTK_IO_XINFO) + FREE(mp->mnt_xinfo_ptr, M_TEMP); vfs_unbusy(mp, procp); - _FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); + + FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); vput(vp); return (error); } - printf("devfs on %s\n", mntname); simple_lock(&mountlist_slock); CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); simple_unlock(&mountlist_slock); diff -urN xnu-344.49/bsd/miscfs/devfs/devfs_vnops.c xnu-517/bsd/miscfs/devfs/devfs_vnops.c --- xnu-344.49/bsd/miscfs/devfs/devfs_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/devfs/devfs_vnops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -1153,7 +1153,7 @@ char *a_target; } */ { - struct componentname * cnp = ap->a_cnp; + struct componentname * cnp = ap->a_cnp; struct vnode *vp = NULL; int error = 0; devnode_t * dir_p; @@ -1186,9 +1186,13 @@ goto failure; vp = *vpp; vput(vp); - failure: - if ((cnp->cn_flags & SAVESTART) == 0) - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); +failure: + if ((cnp->cn_flags & SAVESTART) == 0) { + char *tmp = cnp->cn_pnbuf; + cnp->cn_pnbuf = NULL; + cnp->cn_flags &= ~HASBUF; + FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); + } vput(ap->a_dvp); return error; } @@ -1239,13 +1243,17 @@ dev_p->dn_uid = cnp->cn_cred->cr_uid; dev_p->dn_gid = dir_p->dn_gid; dev_p->dn_mode = vap->va_mode; - failure: +failure: if (*vpp) { vput(*vpp); *vpp = 0; } - if ((cnp->cn_flags & SAVESTART) == 0) - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); + if ((cnp->cn_flags & SAVESTART) == 0) { + char *tmp = cnp->cn_pnbuf; + cnp->cn_pnbuf = NULL; + cnp->cn_flags &= ~HASBUF; + FREE_ZONE(tmp, cnp->cn_pnlen, M_NAMEI); + } vput(dvp); return (error); } @@ -1383,20 +1391,6 @@ } static int -devfs_abortop(struct vop_abortop_args *ap) - /*struct vop_abortop_args { - struct vnode *a_dvp; - struct componentname *a_cnp; - } */ -{ - if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) { - FREE_ZONE(ap->a_cnp->cn_pnbuf, ap->a_cnp->cn_pnlen, M_NAMEI); - } - return 0; -} - - -static int devfs_reclaim(struct vop_reclaim_args *ap) /*struct vop_reclaim_args { struct vnode *a_vp; @@ -1519,7 +1513,7 @@ { &vop_symlink_desc, (VOPFUNC)devfs_symlink }, /* symlink */ { &vop_readdir_desc, (VOPFUNC)devfs_readdir }, /* readdir */ { &vop_readlink_desc, (VOPFUNC)devfs_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)devfs_abortop }, /* abortop */ + { &vop_abortop_desc, (VOPFUNC)nop_abortop }, /* abortop */ { &vop_inactive_desc, (VOPFUNC)devfs_inactive }, /* inactive */ { &vop_reclaim_desc, (VOPFUNC)devfs_reclaim }, /* reclaim */ { &vop_lock_desc, (VOPFUNC)nop_lock }, /* lock 
*/ diff -urN xnu-344.49/bsd/miscfs/devfs/devfsdefs.h xnu-517/bsd/miscfs/devfs/devfsdefs.h --- xnu-344.49/bsd/miscfs/devfs/devfsdefs.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/devfs/devfsdefs.h Sat Oct 25 00:25:25 2003 @@ -71,7 +71,7 @@ DEV_DIR, DEV_BDEV, DEV_CDEV, - DEV_SLNK, + DEV_SLNK } devfstype_t; extern int (**devfs_vnodeop_p)(void *); /* our own vector array for dirs */ @@ -180,19 +180,7 @@ #define M_DEVFSNODE M_DEVFS #define M_DEVFSMNT M_DEVFS -static __inline__ void -getnanotime(struct timespec * t_p) -{ - struct timeval tv; - - microtime(&tv); - t_p->tv_sec = tv.tv_sec; - t_p->tv_nsec = tv.tv_usec * 1000; - return; -} - #define VTODN(vp) ((devnode_t *)(vp)->v_data) -extern void cache_purge(struct vnode *vp); /* vfs_cache.c */ static __inline__ int DEVFS_LOCK(struct proc * p) diff -urN xnu-344.49/bsd/miscfs/fdesc/fdesc.h xnu-517/bsd/miscfs/fdesc/fdesc.h --- xnu-344.49/bsd/miscfs/fdesc/fdesc.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/fdesc/fdesc.h Sat Oct 25 00:25:25 2003 @@ -85,7 +85,7 @@ Froot, Fdevfd, Fdesc, - Flink, + Flink } fdntype; struct fdescnode { diff -urN xnu-344.49/bsd/miscfs/fdesc/fdesc_vfsops.c xnu-517/bsd/miscfs/fdesc/fdesc_vfsops.c --- xnu-344.49/bsd/miscfs/fdesc/fdesc_vfsops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/fdesc/fdesc_vfsops.c Sat Oct 25 00:25:25 2003 @@ -91,7 +91,7 @@ struct proc *p; { int error = 0; - u_int size; + size_t size; struct fdescmount *fmp; struct vnode *rvp; @@ -253,7 +253,7 @@ struct proc *)))eopnotsupp) #define fdesc_sysctl ((int (*) __P((int *, u_int, void *, size_t *, void *, \ size_t, struct proc *)))eopnotsupp) -#define fdesc_vget ((int (*) __P((struct mount *, ino_t, struct vnode **))) \ +#define fdesc_vget ((int (*) __P((struct mount *, void *, struct vnode **))) \ eopnotsupp) #define fdesc_vptofh ((int (*) __P((struct vnode *, struct fid *)))eopnotsupp) diff -urN xnu-344.49/bsd/miscfs/fdesc/fdesc_vnops.c xnu-517/bsd/miscfs/fdesc/fdesc_vnops.c --- 
xnu-344.49/bsd/miscfs/fdesc/fdesc_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/fdesc/fdesc_vnops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -823,7 +823,7 @@ #define fdesc_mkdir ((int (*) __P((struct vop_mkdir_args *)))eopnotsupp) #define fdesc_rmdir ((int (*) __P((struct vop_rmdir_args *)))eopnotsupp) #define fdesc_symlink ((int (*) __P((struct vop_symlink_args *)))eopnotsupp) -#define fdesc_abortop ((int (*) __P((struct vop_abortop_args *)))nullop) +#define fdesc_abortop ((int (*) __P((struct vop_abortop_args *)))nop_abortop) #define fdesc_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) #define fdesc_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) #define fdesc_bmap ((int (*) __P((struct vop_bmap_args *)))fdesc_badop) diff -urN xnu-344.49/bsd/miscfs/fifofs/fifo_vnops.c xnu-517/bsd/miscfs/fifofs/fifo_vnops.c --- xnu-344.49/bsd/miscfs/fifofs/fifo_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/fifofs/fifo_vnops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -179,22 +179,22 @@ int error; if ((fip = vp->v_fifoinfo) == NULL) { - MALLOC_ZONE(fip, struct fifoinfo *, - sizeof(*fip), M_VNODE, M_WAITOK); + MALLOC(fip, struct fifoinfo *, + sizeof(*fip), M_TEMP, M_WAITOK); vp->v_fifoinfo = fip; thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); if (error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0)) { thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - _FREE_ZONE(fip, sizeof *fip, M_VNODE); vp->v_fifoinfo = NULL; + FREE(fip, M_TEMP); return (error); } fip->fi_readsock = rso; if (error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0)) { (void)soclose(rso); thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - _FREE_ZONE(fip, sizeof *fip, M_VNODE); vp->v_fifoinfo = NULL; + FREE(fip, M_TEMP); return (error); } fip->fi_writesock = wso; @@ -202,8 +202,8 @@ (void)soclose(wso); (void)soclose(rso); thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - _FREE_ZONE(fip, sizeof *fip, M_VNODE); vp->v_fifoinfo = NULL; + FREE(fip, M_TEMP); return (error); } wso->so_state |= SS_CANTRCVMORE; @@ -479,8 +479,8 @@ error1 = soclose(fip->fi_readsock); error2 = soclose(fip->fi_writesock); thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - FREE_ZONE(fip, sizeof *fip, M_VNODE); vp->v_fifoinfo = NULL; + FREE(fip, M_TEMP); if (error1) return (error1); return (error2); diff -urN xnu-344.49/bsd/miscfs/specfs/spec_vnops.c xnu-517/bsd/miscfs/specfs/spec_vnops.c --- xnu-344.49/bsd/miscfs/specfs/spec_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/specfs/spec_vnops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -72,7 +72,7 @@ #include #include #include -#include +#include #include #include @@ -275,7 +275,30 @@ return (error); error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p); if (!error) { + u_int64_t blkcnt; + u_int32_t blksize; + set_blocksize(vp, dev); + + /* + * Cache the size in bytes of the block device for later + * use by spec_write(). + */ + vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */ + if (!VOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, NOCRED, p)) { + /* Switch to 512 byte sectors (temporarily) */ + u_int32_t size512 = 512; + + if (!VOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, NOCRED, p)) { + /* Get the number of 512 byte physical blocks. */ + if (!VOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, NOCRED, p)) { + vp->v_specdevsize = blkcnt * (u_int64_t)size512; + } + } + /* If it doesn't set back, we can't recover */ + if (VOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, NOCRED, p)) + error = ENXIO; + } } return(error); } @@ -439,11 +462,35 @@ n = min((unsigned)(bsize - on), uio->uio_resid); + /* + * Use getblk() as an optimization IFF: + * + * 1) We are reading exactly a block on a block + * aligned boundary + * 2) We know the size of the device from spec_open + * 3) The read doesn't span the end of the device + * + * Otherwise, we fall back on bread(). 
+ */ + if (n == bsize && + vp->v_specdevsize != (u_int64_t)0 && + (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) { + /* reduce the size of the read to what is there */ + n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize; + } + if (n == bsize) bp = getblk(vp, bn, bsize, 0, 0, BLK_WRITE); else error = bread(vp, bn, bsize, NOCRED, &bp); + /* Translate downstream error for upstream, if needed */ + if (!error) { + error = bp->b_error; + if (!error && (bp->b_flags & B_ERROR) != 0) { + error = EIO; + } + } if (error) { brelse(bp); return (error); @@ -595,6 +642,7 @@ } */ *ap; { struct buf *bp; + extern int hard_throttle_on_root; bp = ap->a_bp; @@ -612,8 +660,11 @@ code |= DKIO_PAGING; KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, - bp, bp->b_dev, bp->b_blkno, bp->b_bcount, 0); + (unsigned int)bp, bp->b_dev, bp->b_blkno, bp->b_bcount, 0); } + if ((bp->b_flags & B_PGIN) && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) + hard_throttle_on_root = 1; + (*bdevsw[major(bp->b_dev)].d_strategy)(bp); return (0); } diff -urN xnu-344.49/bsd/miscfs/specfs/specdev.h xnu-517/bsd/miscfs/specfs/specdev.h --- xnu-344.49/bsd/miscfs/specfs/specdev.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/specfs/specdev.h Sat Oct 25 00:25:25 2003 @@ -76,7 +76,8 @@ struct vnode *si_specnext; long si_flags; dev_t si_rdev; - daddr_t si_size; /* block device size in bytes */ + daddr_t si_size; /* device block size in bytes */ + u_int64_t si_devsize; /* actual device size in bytes */ }; /* * Exported shorthand @@ -86,6 +87,7 @@ #define v_specnext v_specinfo->si_specnext #define v_specflags v_specinfo->si_flags #define v_specsize v_specinfo->si_size +#define v_specdevsize v_specinfo->si_devsize /* * Flags for specinfo diff -urN xnu-344.49/bsd/miscfs/synthfs/synthfs_util.c xnu-517/bsd/miscfs/synthfs/synthfs_util.c --- xnu-344.49/bsd/miscfs/synthfs/synthfs_util.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/synthfs/synthfs_util.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 
@@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -194,27 +194,25 @@ struct synthfsnode *source_sp = VTOS(source_vp); struct synthfsnode *parent_sp = VTOS(newparent_vp); char *new_name_ptr; - int result; - - if (parent_sp == source_sp->s_parent) return 0; + int result = 0; /* Unlink the entry from its current place: */ result = synthfs_remove_entry(source_vp); - if (result) return result; + if (result) goto err_exit; /* Change the name as necessary: */ - FREE(source_sp->s_name, M_TEMP); - if (new_name == NULL) { - MALLOC(new_name_ptr, char *, 1, M_TEMP, M_WAITOK); - new_name_ptr[0] = 0; - } else { - MALLOC(new_name_ptr, char *, strlen(new_name) + 1, M_TEMP, M_WAITOK); - strcpy(new_name_ptr, new_name); - }; - source_sp->s_name = new_name_ptr; - + if (new_name) { + FREE(source_sp->s_name, M_TEMP); + MALLOC(new_name_ptr, char *, strlen(new_name) + 1, M_TEMP, M_WAITOK); + strcpy(new_name_ptr, new_name); + source_sp->s_name = new_name_ptr; + }; + /* Insert the entry in its new home: */ - return synthfs_insertnode(source_sp, parent_sp); + result = synthfs_insertnode(source_sp, parent_sp); + +err_exit: + return result; } @@ -320,7 +318,7 @@ long padtext = 0; unsigned short direntrylength; - namelength = ((name == NULL) ? 0 : strlen(name)); + namelength = ((name == NULL) ? 0 : strlen(name) + 1); padding = (4 - (namelength & 3)) & 3; direntrylength = sizeof(struct synthfs_direntry_head) + namelength + padding; diff -urN xnu-344.49/bsd/miscfs/synthfs/synthfs_vfsops.c xnu-517/bsd/miscfs/synthfs/synthfs_vfsops.c --- xnu-344.49/bsd/miscfs/synthfs/synthfs_vfsops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/synthfs/synthfs_vfsops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include diff -urN xnu-344.49/bsd/miscfs/synthfs/synthfs_vnops.c xnu-517/bsd/miscfs/synthfs/synthfs_vnops.c --- xnu-344.49/bsd/miscfs/synthfs/synthfs_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/synthfs/synthfs_vnops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -65,13 +65,6 @@ #include #endif -/* external routines defined in vfs_cache.c */ -extern void cache_purge (struct vnode *vp); -extern int cache_lookup (struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); -extern void cache_enter (struct vnode *dvp, struct vnode *vpp, struct componentname *cnp); - -//extern void vnode_uncache(struct vnode *); - extern int groupmember(gid_t gid, struct ucred* cred); #define VOPFUNC int (*)(void *) @@ -185,7 +178,7 @@ Debugger(debugmsg); #endif - return EOPNOTSUPP; + return err_create(ap); } diff -urN xnu-344.49/bsd/miscfs/union/union_vfsops.c xnu-517/bsd/miscfs/union/union_vfsops.c --- xnu-344.49/bsd/miscfs/union/union_vfsops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/union/union_vfsops.c Sat Oct 25 00:25:25 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -226,7 +226,8 @@ mp->mnt_data = (qaddr_t) um; vfs_getnewfsid(mp); - (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + (void) copyinstr(path, mp->mnt_stat.f_mntonname, + MNAMELEN - 1, (size_t *)&size); bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); switch (um->um_op) { @@ -246,7 +247,7 @@ cp = mp->mnt_stat.f_mntfromname + len; len = MNAMELEN - len; - (void) copyinstr(args.target, cp, len - 1, &size); + (void) copyinstr(args.target, cp, len - 1, (size_t *)&size); bzero(cp + size, len - size); #ifdef UNION_DIAGNOSTIC @@ -507,7 +508,7 @@ struct proc *)))eopnotsupp) #define union_sysctl ((int (*) __P((int *, u_int, void *, size_t *, void *, \ size_t, struct proc *)))eopnotsupp) -#define union_vget ((int (*) __P((struct mount *, ino_t, struct vnode **))) \ +#define union_vget ((int (*) __P((struct mount *, void *, struct vnode **))) \ eopnotsupp) #define union_vptofh ((int (*) __P((struct vnode *, struct fid *)))eopnotsupp) diff -urN xnu-344.49/bsd/miscfs/union/union_vnops.c xnu-517/bsd/miscfs/union/union_vnops.c --- xnu-344.49/bsd/miscfs/union/union_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/union/union_vnops.c Sat Oct 25 00:25:25 2003 @@ -154,10 +154,10 @@ */ while (dvp != udvp && (dvp->v_type == VDIR) && (mp = dvp->v_mountedhere)) { - - if (vfs_busy(mp, 0, 0, p)) - continue; - + if (vfs_busy(mp, LK_NOWAIT, 0, p)) { + vput(dvp); + return(ENOENT); + } error = VFS_ROOT(mp, &tdvp); vfs_unbusy(mp, p); if (error) { diff -urN xnu-344.49/bsd/miscfs/volfs/volfs.h xnu-517/bsd/miscfs/volfs/volfs.h --- xnu-344.49/bsd/miscfs/volfs/volfs.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/volfs/volfs.h Sat Oct 25 00:25:25 2003 @@ -61,6 +61,9 @@ #define ROOT_DIRID 2 +#define MAXPLCENTRIES 250 +#define PLCHASHSIZE 128 + extern int (**volfs_vnodeop_p)(void *); __BEGIN_DECLS diff -urN xnu-344.49/bsd/miscfs/volfs/volfs_vfsops.c xnu-517/bsd/miscfs/volfs/volfs_vfsops.c --- 
xnu-344.49/bsd/miscfs/volfs/volfs_vfsops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/volfs/volfs_vfsops.c Sat Oct 25 00:25:25 2003 @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include @@ -265,6 +265,8 @@ root_vp->v_data = priv_vn_data; priv_mnt_data->volfs_rootvp = root_vp; + + mp->mnt_flag &= ~MNT_RDONLY; return (0); } @@ -403,6 +405,14 @@ struct proc *p; { // DBG_VOP(("volfs_sync called\n")); + + /* Release a few entries from the permissions cache to keep them from getting stale. + * Since sync is called at least every 30 seconds or so, releasing 1/20 of the cache + * every time through should free all entries in no less than 10 minutes, which should + * be adequate to prevent pid-wrapping from mis-associating PLC entries: + */ + volfs_PLC_reclaim_entries(MAXPLCENTRIES / 20); + return 0; } /* @@ -462,6 +472,9 @@ struct vfsconf *vfsp; { DBG_VOP(("volfs_init called\n")); + + volfs_PLChashinit(); + return (0); } diff -urN xnu-344.49/bsd/miscfs/volfs/volfs_vnops.c xnu-517/bsd/miscfs/volfs/volfs_vnops.c --- xnu-344.49/bsd/miscfs/volfs/volfs_vnops.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/miscfs/volfs/volfs_vnops.c Sat Oct 25 00:25:25 2003 @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,8 @@ #include #include #include +#include +#include #include #include @@ -168,15 +171,165 @@ struct vnodeopv_desc volfs_vnodeop_opv_desc = {&volfs_vnodeop_p, volfs_vnodeop_entries}; +static char gDot[] = "."; +static char gDotDot[] = ".."; + +struct finfo { + fsobj_id_t parID; +}; + +struct finfoattrbuf { + unsigned long length; + struct finfo fi; +}; static int validfsnode(struct mount *fsnode); +struct volfs_PLCEntry +{ + LIST_ENTRY(volfs_PLCEntry) vplc_hash_link; /* entry's hash chain */ + TAILQ_ENTRY(volfs_PLCEntry) vplc_lru_link; /* entry's LRU chain link */ + int32_t vplc_fsid; + u_int vplc_item_id; + uid_t vplc_uid; + pid_t vplc_pid; +}; + +#define VOLFSPLCHASH(fsid, inum) ((((unsigned 
long)fsid) + (unsigned long)(inum)) & volfs_PLCHashMask) + +static struct slock volfs_PLChashtable_slock; +static TAILQ_HEAD(volfs_PLCLRUListHead, volfs_PLCEntry) volfs_PLCLRUList; +static TAILQ_HEAD(volfs_PLCFreeListHead, volfs_PLCEntry) volfs_PLCFreeList; +static LIST_HEAD(, volfs_PLCEntry) *volfs_PLCHashTable; +static u_long volfs_PLCHashMask; /* size of hash table - 1 */ +static u_long volfs_PLCEntryCount; + #if DBG_VOP_TEST_LOCKS static void DbgVopTest (int max, int error, VopDbgStoreRec *VopDbgStore, char *funcname); #endif /* DBG_VOP_TEST_LOCKS */ /* + * volfs_PLChashinit + */ +__private_extern__ void +volfs_PLChashinit(void) +{ + int i; + + TAILQ_INIT(&volfs_PLCLRUList); + TAILQ_INIT(&volfs_PLCFreeList); + simple_lock_init(&volfs_PLChashtable_slock); +#if MAXPLCENTRIES + volfs_PLCHashTable = hashinit(PLCHASHSIZE, M_TEMP, &volfs_PLCHashMask); + + for (i = 0; i < PLCHASHSIZE; ++i) { + LIST_INIT(&volfs_PLCHashTable[i]); + }; +#endif + volfs_PLCEntryCount = 0; +} + + + +__private_extern__ void +volfs_PLC_reclaim_entries(int entrycount) +{ +#if MAXPLCENTRIES + int i; + struct volfs_PLCEntry *reclaim_target; + + simple_lock(&volfs_PLChashtable_slock); + + for (i = entrycount; i > 0; --i) { + if (TAILQ_EMPTY(&volfs_PLCLRUList)) break; + + /* Pick the next entry to be recycled and free it: */ + reclaim_target = TAILQ_FIRST(&volfs_PLCLRUList); + TAILQ_REMOVE(&volfs_PLCLRUList, reclaim_target, vplc_lru_link); + LIST_REMOVE(reclaim_target, vplc_hash_link); + TAILQ_INSERT_TAIL(&volfs_PLCFreeList, reclaim_target, vplc_lru_link); + }; + + simple_unlock(&volfs_PLChashtable_slock); +#endif +} + + + +#if MAXPLCENTRIES +/* + * volfs_PLCLookup + * + * Look up a PLC entry in the hash + */ +static int +volfs_PLCLookup(int32_t fsid, u_int target_id, uid_t uid, pid_t pid) +{ + struct volfs_PLCEntry *hash_entry; + int result = 0; + + simple_lock(&volfs_PLChashtable_slock); + LIST_FOREACH(hash_entry, &volfs_PLCHashTable[VOLFSPLCHASH(fsid, target_id)], vplc_hash_link) { + if 
((hash_entry->vplc_item_id == target_id) && + (hash_entry->vplc_pid == pid) && + (hash_entry->vplc_uid == uid) && + (hash_entry->vplc_fsid == fsid)) { + result = 1; +#if 0 + if (hash_entry != TAILQ_LAST(&volfs_PLCLRUList, volfs_PLCLRUListHead)) { + TAILQ_REMOVE(&volfs_PLCLRUList, hash_entry, vplc_lru_link); + TAILQ_INSERT_TAIL(&volfs_PLCLRUList, hash_entry, vplc_lru_link); + }; +#endif + break; + }; + }; + simple_unlock(&volfs_PLChashtable_slock); + return result; +} + + +static void +volfs_PLCEnter(int32_t fsid, u_int target_id, uid_t uid, pid_t pid) +{ + struct volfs_PLCEntry *new_entry; + + simple_lock(&volfs_PLChashtable_slock); + if (!TAILQ_EMPTY(&volfs_PLCFreeList)) { + new_entry = TAILQ_FIRST(&volfs_PLCFreeList); + TAILQ_REMOVE(&volfs_PLCFreeList, new_entry, vplc_lru_link); + } else { + /* + * Allocate up to the predetermined maximum number of new entries: + * [must be done now to avoid blocking in MALLOC() with volfs_PLChashtable_slock held locked] + */ + if (volfs_PLCEntryCount < MAXPLCENTRIES) { + simple_unlock(&volfs_PLChashtable_slock); + new_entry = MALLOC(new_entry, struct volfs_PLCEntry *, sizeof(struct volfs_PLCEntry), M_TEMP, M_WAITOK); + simple_lock(&volfs_PLChashtable_slock); + ++volfs_PLCEntryCount; + } else { + new_entry = TAILQ_FIRST(&volfs_PLCLRUList); + TAILQ_REMOVE(&volfs_PLCLRUList, new_entry, vplc_lru_link); + LIST_REMOVE(new_entry, vplc_hash_link); + }; + }; + + new_entry->vplc_fsid = fsid; + new_entry->vplc_item_id = target_id; + new_entry->vplc_uid = uid; + new_entry->vplc_pid = pid; + + /* Link the new entry on the hash list for the fsid/target_id as well as the tail of the LRU list: */ + LIST_INSERT_HEAD(&volfs_PLCHashTable[VOLFSPLCHASH(fsid, target_id)], new_entry, vplc_hash_link); + TAILQ_INSERT_TAIL(&volfs_PLCLRUList, new_entry, vplc_lru_link); + simple_unlock(&volfs_PLChashtable_slock); +} +#endif + + +/* * volfs_reclaim - Reclaim a vnode so that it can be used for other purposes. 
* * Locking policy: ignored @@ -222,7 +375,7 @@ /* * We don't need to check credentials! FS is read-only for everyone */ - if (ap->a_mode == VREAD || ap->a_mode == VEXEC) + if ((ap->a_mode & ~(VREAD | VEXEC)) == 0) ret_err = 0; else ret_err = EACCES; @@ -555,18 +708,26 @@ volfs_lock(ap) struct vop_lock_args /* { struct vnode *a_vp; int a_flags; struct proc *a_p; } */ *ap; -{ +{ int retval; struct volfs_vndata *priv_data; DBG_FUNC_NAME("volfs_lock"); DBG_VOP_LOCKS_DECL(1); +#if 0 + KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 0)) | DBG_FUNC_START, + (unsigned int)ap->a_vp, (unsigned int)ap->a_flags, (unsigned int)ap->a_p, 0, 0); +#endif DBG_VOP_PRINT_FUNCNAME();DBG_VOP_PRINT_VNODE_INFO(ap->a_vp);DBG_VOP(("\n")); DBG_VOP_LOCKS_INIT(0,ap->a_vp, VOPDBG_UNLOCKED, VOPDBG_LOCKED, VOPDBG_UNLOCKED, VOPDBG_ZERO); - + priv_data = (struct volfs_vndata *) ap->a_vp->v_data; retval = lockmgr(&priv_data->lock, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p); DBG_VOP_LOCKS_TEST(retval); +#if 0 + KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 0)) | DBG_FUNC_END, + (unsigned int)ap->a_vp, (unsigned int)ap->a_flags, (unsigned int)ap->a_p, retval, 0); +#endif return (retval); } @@ -584,6 +745,10 @@ struct volfs_vndata *priv_data; DBG_FUNC_NAME("volfs_unlock"); DBG_VOP_LOCKS_DECL(1); +#if 0 + KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 4)) | DBG_FUNC_START, + (unsigned int)ap->a_vp, (unsigned int)ap->a_flags, (unsigned int)ap->a_p, 0, 0); +#endif DBG_VOP_PRINT_FUNCNAME();DBG_VOP_PRINT_VNODE_INFO(ap->a_vp);DBG_VOP(("\n")); DBG_VOP_LOCKS_INIT(0,ap->a_vp, VOPDBG_LOCKED, VOPDBG_UNLOCKED, VOPDBG_LOCKED, VOPDBG_ZERO); @@ -593,6 +758,10 @@ &ap->a_vp->v_interlock, ap->a_p); DBG_VOP_LOCKS_TEST(retval); +#if 0 + KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 4)) | DBG_FUNC_END, + (unsigned int)ap->a_vp, (unsigned int)ap->a_flags, (unsigned int)ap->a_p, retval, 0); +#endif return (retval); } @@ -658,6 +827,237 @@ /* NOTREACHED */ } + +/* + * Call VOP_GETATTRLIST on a given vnode + */ +static int +vp_getattrlist(struct vnode *vp, struct 
attrlist alist, void *attrbufptr, size_t bufsize, unsigned long options, struct proc *p) { + struct iovec iov; + struct uio bufuio; + + iov.iov_base = (char *)attrbufptr; + iov.iov_len = bufsize; + + bufuio.uio_iov = &iov; + bufuio.uio_iovcnt = 1; + bufuio.uio_offset = 0; + bufuio.uio_resid = iov.iov_len; + bufuio.uio_segflg = UIO_SYSSPACE; + bufuio.uio_rw = UIO_READ; + bufuio.uio_procp = p; + + return VOP_GETATTRLIST(vp, &alist, &bufuio, p->p_ucred, p); +} + +/* + * get_parentvp() - internal routine that tries to lookup the parent of vpp. + * On success, *vpp is the parent vp and is returned locked and the original child + * is left unlocked. On failure, the original child will be locked upon return. + */ +static int +get_parentvp(struct vnode **vpp, struct mount *mp, struct proc *p) +{ + int result; + struct attrlist alist; + struct finfoattrbuf finfobuf; + struct vnode *child_vp = *vpp; + + alist.bitmapcount = 5; + alist.reserved = 0; + alist.commonattr = ATTR_CMN_PAROBJID; + alist.volattr = 0; + alist.dirattr = 0; + alist.fileattr = 0; + alist.forkattr = 0; + result = vp_getattrlist(child_vp, alist, &finfobuf, sizeof(finfobuf), 0, p); + if (result) + return result; + + /* Release the child vnode before trying to acquire its parent + to avoid vnode deadlock problems with parsing code + coming top-down through the directory hierarchy: */ + VOP_UNLOCK(child_vp, 0, p); + + /* Shift attention to the parent directory vnode: */ + result = VFS_VGET(mp, &finfobuf.fi.parID.fid_objno, vpp); + if (result) { + /* Make sure child_vp is still locked on exit: */ + vn_lock(child_vp, LK_EXCLUSIVE | LK_RETRY, p); + } + + return result; +} + + +/* + * Look up the parent directory of a given vnode. 
+ */ +static int +lookup_parent(u_int id, struct vnode *child_vp, struct vnode **parent_vp, struct proc *p) +{ + struct nameidata nd; + struct componentname *cnp = &nd.ni_cnd; + struct filedesc *fdp = p->p_fd; + int error; + + *parent_vp = NULL; + + /* + * Special case lookups for root's parent directory, + * recognized by its special id of "1": + */ + if (id != 1) { + VREF(child_vp); + nd.ni_startdir = child_vp; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, (caddr_t)&gDotDot, p); + } else { + struct vnode *root_vp; + + error = VFS_ROOT(child_vp->v_mount, &root_vp); + if (error) return error; + VOP_UNLOCK(root_vp, 0, p); /* Hold on to the reference */ + nd.ni_startdir = root_vp; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, (caddr_t)&gDot, p); + }; + nd.ni_cnd.cn_cred = nd.ni_cnd.cn_proc->p_ucred; + + /* Since we can't hit any symlinks, use the source path string directly: */ + cnp->cn_pnbuf = nd.ni_dirp; + nd.ni_pathlen = strlen(cnp->cn_pnbuf); + cnp->cn_pnlen = nd.ni_pathlen + 1; + cnp->cn_flags |= (HASBUF | SAVENAME); + + nd.ni_loopcnt = 0; + + if ((nd.ni_rootdir = fdp->fd_rdir) == NULL) nd.ni_rootdir = rootvnode; + cnp->cn_nameptr = cnp->cn_pnbuf; + if (error = lookup(&nd)) { + cnp->cn_pnbuf = NULL; + return (error); + } + /* + * Check for symbolic link + */ + if (cnp->cn_flags & ISSYMLINK) return ENOENT; + if (nd.ni_vp == child_vp) return ELOOP; + + *parent_vp = nd.ni_vp; + return 0; +} + + + +/* + * verify_fullpathaccess(ret_vnode); + */ + +static int +verify_fullpathaccess(u_int id, struct vnode *targetvp, struct proc *p) { + struct vnode *vp, *parent_vp; + struct mount *mp = targetvp->v_mount; + struct attrlist alist; + struct finfoattrbuf finfobuf; + int result; + struct filedesc *fdp = p->p_fd; /* pointer to file descriptor state */ + u_int target_id; + u_long vp_id; + +#if 0 + KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 12)) | DBG_FUNC_START, + (unsigned int)targetvp, (unsigned int)mp, (unsigned int)p, 0, 0); +#endif + + vp = targetvp; + vp_id = 
vp->v_id; + if (vp->v_type != VDIR) { + + /* The target is a file: get the parent directory. */ + result = get_parentvp(&vp, mp, p); + if (result) goto err_exit; + + /* At this point, targetvp is unlocked (but still referenced), and + vp is the parent directory vnode, held locked */ + }; + + +#if MAXPLCENTRIES + if (volfs_PLCLookup(mp->mnt_stat.f_fsid.val[0], id, p->p_ucred->cr_uid, p->p_pid)) goto lookup_success; +#endif + /* Keep going up until either the process's root or the process's working directory is hit, + either one of which are potential valid starting points for a full pathname: */ + target_id = id; + while (vp != NULL && (!((vp->v_flag & VROOT) || /* Hit "/" */ + (vp == fdp->fd_cdir) || /* Hit process's working directory */ + (vp == fdp->fd_rdir)))) { /* Hit process chroot()-ed root */ + + /* At this point, vp is some directory node and it's always locked */ + /* Unlock the starting directory for namei(), retaining a reference... */ + VOP_UNLOCK(vp, 0, p); + + if (result = lookup_parent(target_id, vp, &parent_vp, p)) { + /* + * If the lookup fails with EACCES and the targetvp is a directory, + * we should try again using get_parentvp(). Without this check, + * directories that you can navigate to but not traverse will + * disappear when clicked in the Finder. + */ + if (result == EACCES && vp == targetvp && vp->v_type == VDIR && (vp->v_flag & VROOT) == 0) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + parent_vp = vp; + if (get_parentvp(&parent_vp, mp, p)) { + /* on error, vp is still locked... unlock for lookup_err_exit path */ + VOP_UNLOCK(vp, 0, p); + } else { + /* on success, vp is returned unlocked, parent_vp is returned locked */ + result = 0; + } + }; + if (result) goto lookup_err_exit; + }; + + if (vp != targetvp) { + vrele(vp); /* Completely done with that vp now... 
*/ + }; + + vp = parent_vp; + target_id = 0; /* It's unknown at this point */ + + if (((result = VOP_ACCESS(vp, VEXEC, p->p_ucred, p)) != 0) && + ((result = VOP_ACCESS(vp, VREAD, p->p_ucred, p)) != 0)) { + VOP_UNLOCK(vp, 0, p); + goto lookup_err_exit; + }; + }; + +#if MAXPLCENTRIES + volfs_PLCEnter(mp->mnt_stat.f_fsid.val[0], id, p->p_ucred->cr_uid, p->p_pid); +#endif + +lookup_success: + /* Success: the caller has complete access to the initial vnode: */ + result = 0; + + if (vp && vp != targetvp) VOP_UNLOCK(vp, 0, p); + +lookup_err_exit: + if (vp && vp != targetvp) { + vrele(vp); + vn_lock(targetvp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp_id != targetvp->v_id || targetvp->v_type == VBAD) { + result = EAGAIN; /* vnode was recycled */ + } + }; + +err_exit: +#if 0 + KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 12)) | DBG_FUNC_END, + (unsigned int)targetvp, (unsigned int)mp, (unsigned int)p, result, 0); +#endif + return result; +}; + + /* * get_fsvnode - internal routine to create a vnode for a file system. Called with mount pointer, * id of filesystem to lookup and pointer to vnode pointer to fill in @@ -769,15 +1169,15 @@ * to a vnode pointer */ static int -get_filevnode(parent_fs, id, ret_vnode) +get_filevnode(parent_fs, id, ret_vnode, p) struct mount *parent_fs; u_int id; struct vnode **ret_vnode; + struct proc *p; { int retval; - DBG_VOP(("get_filevnode called for ID %d\n", id)); - +again: /* * Special case 2 to mean the root of a file system */ @@ -785,7 +1185,23 @@ retval = VFS_ROOT(parent_fs, ret_vnode); else retval = VFS_VGET(parent_fs, &id, ret_vnode); + if (retval) goto error; + + retval = verify_fullpathaccess(id, *ret_vnode, p); + if (retval) { + /* An error was encountered verifying that the caller has, + in fact, got access all the way from "/" or their working + directory to the specified item... + */ + vput(*ret_vnode); + *ret_vnode = NULL; + /* vnode was recycled during access verification. 
*/ + if (retval == EAGAIN) { + goto again; + } + }; +error: return (retval); } @@ -799,11 +1215,16 @@ char *cnp; long namelen; struct mount *parent_fs; - int unlocked_parent = 0; + int unlocked_parent = 0, isdot_or_dotdot = 0; int ret_err = ENOENT; DBG_FUNC_NAME("volfs_lookup"); DBG_VOP_LOCKS_DECL(2); +#if 0 + KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 8)) | DBG_FUNC_START, + (unsigned int)ap->a_dvp, (unsigned int)ap->a_cnp, (unsigned int)p, 0, 0); +#endif + DBG_VOP(("volfs_lookup called, name = %s, namelen = %ld\n", ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen)); DBG_VOP_LOCKS_INIT(0,ap->a_dvp, VOPDBG_LOCKED, VOPDBG_IGNORE, VOPDBG_IGNORE, VOPDBG_POS); @@ -851,14 +1272,16 @@ if (namelen == 1) { /* "." requested */ + isdot_or_dotdot = 1; *ap->a_vpp = ap->a_dvp; VREF(*ap->a_vpp); DBG_VOP_LOCKS_TEST(0); - return (0); + ret_err = 0; } else if (cnp[1] == '.' && namelen == 2) { /* ".." requested */ + isdot_or_dotdot = 1; ret_err = volfs_root(ap->a_dvp->v_mount, ap->a_vpp); } } @@ -901,14 +1324,22 @@ ret_err = get_fsvnode(ap->a_dvp->v_mount, id, ap->a_vpp); else { parent_fs = priv_data->fs_mount; - if (!(ap->a_cnp->cn_flags & LOCKPARENT) || !(ap->a_cnp->cn_flags & ISLASTCN)) { - VOP_UNLOCK(ap->a_dvp, 0, ap->a_cnp->cn_proc); - unlocked_parent = 1; - }; - ret_err = get_filevnode(parent_fs, id, ap->a_vpp); + if (!(ap->a_cnp->cn_flags & LOCKPARENT) || !(ap->a_cnp->cn_flags & ISLASTCN)) { + VOP_UNLOCK(ap->a_dvp, 0, ap->a_cnp->cn_proc); + unlocked_parent = 1; + }; + ret_err = get_filevnode(parent_fs, id, ap->a_vpp, ap->a_cnp->cn_proc); } } + } + if (!isdot_or_dotdot && *ap->a_vpp && VPARENT(*ap->a_vpp) == NULL && ap->a_dvp != *ap->a_vpp) { + if (VPARENT(ap->a_dvp) == *ap->a_vpp) { + panic("volfs: ap->a_dvp 0x%x has parent == a_vpp 0x%x\n", + ap->a_dvp, *ap->a_vpp); + } + vget(ap->a_dvp, 0, ap->a_cnp->cn_proc); + VPARENT(*ap->a_vpp) = ap->a_dvp; } if (!unlocked_parent && (!(ap->a_cnp->cn_flags & LOCKPARENT) || !(ap->a_cnp->cn_flags & ISLASTCN))) { @@ -922,6 +1353,10 @@ 
DBG_VOP_UPDATE_VP(1, *ap->a_vpp); DBG_VOP_LOCKS_TEST(ret_err); +#if 0 + KERNEL_DEBUG((FSDBG_CODE(DBG_FSVN, 8)) | DBG_FUNC_START, + (unsigned int)ap->a_dvp, (unsigned int)ap->a_cnp, (unsigned int)p, ret_err, 0); +#endif return (ret_err); } diff -urN xnu-344.49/bsd/net/Makefile xnu-517/bsd/net/Makefile --- xnu-344.49/bsd/net/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/net/Makefile Tue Oct 21 21:24:55 2003 @@ -23,14 +23,14 @@ bpf.h bpf_compat.h bpfdesc.h dlil.h dlil_pvt.h \ etherdefs.h ethernet.h if.h if_arp.h \ if_dl.h if_llc.h if_media.h if_mib.h \ - if_ppp.h if_slvar.h \ + if_slvar.h \ if_types.h if_var.h iso88025.h \ kext_net.h ndrv.h net_osdep.h netisr.h pfkeyv2.h \ - ppp_defs.h radix.h raw_cb.h route.h slcompress.h slip.h + radix.h raw_cb.h route.h slcompress.h slip.h PRIVATE_DATAFILES = \ ndrv_var.h zlib.h if_pppvar.h if_sppp.h ppp_comp.h if_atm.h \ - if_tun.h if_vlan_var.h + if_tun.h if_vlan_var.h if_ppp.h firewire.h ppp_defs.h INSTALL_MI_LIST = ${DATAFILES} diff -urN xnu-344.49/bsd/net/bpf.c xnu-517/bsd/net/bpf.c --- xnu-344.49/bsd/net/bpf.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/bpf.c Sat Oct 25 00:25:25 2003 @@ -101,8 +101,9 @@ #include #include #include +#include - +#include #include #include @@ -122,6 +123,7 @@ #define UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio) #endif + #define PRINET 26 /* interruptible */ /* @@ -136,7 +138,7 @@ /* * bpf_iflist is the list of interfaces; each corresponds to an ifnet - * bpf_dtab holds the descriptors, indexed by minor device # + * bpf_dtab holds pointer to the descriptors, indexed by minor device # */ static struct bpf_if *bpf_iflist; #ifdef __APPLE__ @@ -145,10 +147,19 @@ * on their system. Our dev_t is an int, so we still store * the bpf_d in a separate table indexed by minor device #. 
*/ -static struct bpf_d bpf_dtab[NBPFILTER]; -static int bpf_dtab_init; -static int nbpfilter = NBPFILTER; -#endif +static struct bpf_d **bpf_dtab = NULL; +static int bpf_dtab_size = 0; +static int nbpfilter = 0; + +/* + * Mark a descriptor free by making it point to itself. + * This is probably cheaper than marking with a constant since + * the address should be in a register anyway. + */ +#define D_ISFREE(d) ((d) == (d)->bd_next) +#define D_MARKFREE(d) ((d)->bd_next = (d)) +#define D_MARKUSED(d) ((d)->bd_next = 0) +#endif /* __APPLE__ */ static int bpf_allocbufs __P((struct bpf_d *)); static void bpf_attachd __P((struct bpf_d *d, struct bpf_if *bp)); @@ -165,6 +176,13 @@ static void reset_d __P((struct bpf_d *)); static int bpf_setf __P((struct bpf_d *, struct bpf_program *)); +/*static void *bpf_devfs_token[MAXBPFILTER];*/ + +static int bpf_devsw_installed; + +void bpf_init __P((void *unused)); + + /* * Darwin differs from BSD here, the following are static * on BSD and not static on Darwin. 
@@ -202,6 +220,7 @@ /* type */ 0 }; +#define SOCKADDR_HDR_LEN offsetof(struct sockaddr, sa_data) static int bpf_movein(uio, linktype, mp, sockp, datlen) @@ -270,10 +289,17 @@ hlen = 4; /* This should match PPP_HDRLEN */ break; + case DLT_APPLE_IP_OVER_IEEE1394: + sockp->sa_family = AF_UNSPEC; + hlen = sizeof(struct firewire_header); + break; + default: return (EIO); } - + if ((hlen + SOCKADDR_HDR_LEN) > sockp->sa_len) { + return (EIO); + } len = uio->uio_resid; *datlen = len - hlen; if ((unsigned)len > MCLBYTES) @@ -340,6 +366,62 @@ thread_funnel_set(network_flock, funnel_state); return 0; } + +/* + * Returns 1 on sucess, 0 on failure + */ +static int +bpf_dtab_grow(int increment) +{ + struct bpf_d **new_dtab = NULL; + + new_dtab = (struct bpf_d **)_MALLOC(sizeof(struct bpf_d *) * (bpf_dtab_size + increment), M_DEVBUF, M_WAIT); + if (new_dtab == NULL) + return 0; + + if (bpf_dtab) { + struct bpf_d **old_dtab; + + bcopy(bpf_dtab, new_dtab, sizeof(struct bpf_d *) * bpf_dtab_size); + /* + * replace must be atomic with respect to free do bpf_dtab + * is always valid. + */ + old_dtab = bpf_dtab; + bpf_dtab = new_dtab; + _FREE(old_dtab, M_DEVBUF); + } + else bpf_dtab = new_dtab; + + bzero(bpf_dtab + bpf_dtab_size, sizeof(struct bpf_d *) * increment); + + bpf_dtab_size += increment; + + return 1; +} + +static struct bpf_d * +bpf_make_dev_t(int maj) +{ + struct bpf_d *d; + + if (nbpfilter >= bpf_dtab_size && bpf_dtab_grow(NBPFILTER) == 0) + return NULL; + + d = (struct bpf_d *)_MALLOC(sizeof(struct bpf_d), M_DEVBUF, M_WAIT); + if (d != NULL) { + int i = nbpfilter++; + + bzero(d, sizeof(struct bpf_d)); + bpf_dtab[i] = d; + D_MARKFREE(bpf_dtab[i]); + /*bpf_devfs_token[i] = */devfs_make_node(makedev(maj, i), + DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0600, + "bpf%d", i); + } + return d; +} + #endif /* @@ -420,16 +502,6 @@ } -#ifdef __APPLE__ -/* - * Mark a descriptor free by making it point to itself. 
- * This is probably cheaper than marking with a constant since - * the address should be in a register anyway. - */ -#define D_ISFREE(d) ((d) == (d)->bd_next) -#define D_MARKFREE(d) ((d)->bd_next = (d)) -#define D_MARKUSED(d) ((d)->bd_next = 0) -#endif /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. @@ -445,12 +517,16 @@ register struct bpf_d *d; #ifdef __APPLE__ + /* new device nodes on demand when opening the last one */ + if (minor(dev) == nbpfilter - 1) + bpf_make_dev_t(major(dev)); + if (minor(dev) >= nbpfilter) return (ENXIO); + + d = bpf_dtab[minor(dev)]; thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - - d = &bpf_dtab[minor(dev)]; #else if (p->p_prison) return (EPERM); @@ -480,7 +556,11 @@ d->bd_bufsize = bpf_bufsize; d->bd_sig = SIGIO; d->bd_seesent = 1; + +#ifdef __APPLE__ thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); +#endif + return (0); } @@ -498,16 +578,33 @@ { register struct bpf_d *d; register int s; - - thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); +#ifdef __APPLE__ + struct bpf_d **bpf_dtab_schk; +#endif #ifndef __APPLE__ funsetown(d->bd_sigio); #endif s = splimp(); #ifdef __APPLE__ - d = &bpf_dtab[minor(dev)]; +again: + d = bpf_dtab[minor(dev)]; + bpf_dtab_schk = bpf_dtab; +#endif + thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + +#ifdef __APPLE__ + /* + * If someone grows bpf_dtab[] while we were waiting for the + * funnel, then we will be pointing off into freed memory; + * check to see if this is the case. 
+ */ + if (bpf_dtab_schk != bpf_dtab) { + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + goto again; + } #endif + if (d->bd_bif) bpf_detachd(d); splx(s); @@ -585,8 +682,9 @@ int error; int s; + d = bpf_dtab[minor(dev)]; + thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - d = &bpf_dtab[minor(dev)]; /* * Restrict application to use a buffer the same size as @@ -707,6 +805,9 @@ #endif } +/* keep in sync with bpf_movein above: */ +#define MAX_DATALINK_HDR_LEN (sizeof(struct firewire_header)) + int bpfwrite(dev, uio, ioflag) dev_t dev; @@ -717,11 +818,12 @@ struct ifnet *ifp; struct mbuf *m; int error, s; - static struct sockaddr dst; + char dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN]; int datlen; + d = bpf_dtab[minor(dev)]; + thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - d = &bpf_dtab[minor(dev)]; if (d->bd_bif == 0) { thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); @@ -734,8 +836,9 @@ thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); return (0); } - - error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, &m, &dst, &datlen); + ((struct sockaddr *)dst_buf)->sa_len = sizeof(dst_buf); + error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, &m, + (struct sockaddr *)dst_buf, &datlen); if (error) { thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); return (error); @@ -746,13 +849,14 @@ return (EMSGSIZE); } - if (d->bd_hdrcmplt) - dst.sa_family = pseudo_AF_HDRCMPLT; + if (d->bd_hdrcmplt) { + ((struct sockaddr *)dst_buf)->sa_family = pseudo_AF_HDRCMPLT; + } s = splnet(); - error = dlil_output(ifp->if_data.default_proto, m, - (caddr_t) 0, &dst, 0); + error = dlil_output(ifptodlt(ifp, PF_INET), m, + (caddr_t) 0, (struct sockaddr *)dst_buf, 0); splx(s); thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); @@ -813,9 +917,9 @@ register struct bpf_d *d; int s, error = 0; + d = bpf_dtab[minor(dev)]; thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); - d = &bpf_dtab[minor(dev)]; switch (cmd) { @@ -1204,12 +1308,12 @@ register int s; int revents = 0; + d = 
bpf_dtab[minor(dev)]; + thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); /* * An imitation of the FIONREAD ioctl code. */ - d = &bpf_dtab[minor(dev)]; - if (d->bd_bif == NULL) { thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); return (ENXIO); @@ -1284,7 +1388,7 @@ if (m == 0) panic("bpf_mcopy"); count = min(m->m_len, len); - bcopy(mtod(m, void *), dst, count); + bcopy(mtod((struct mbuf *)m, void *), dst, count); m = m->m_next; dst += count; len -= count; @@ -1475,16 +1579,7 @@ */ bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen; -#ifdef __APPLE__ - /* - * Mark all the descriptors free if this hasn't been done. - */ - if (!bpf_dtab_init) { - for (i = 0; i < nbpfilter; ++i) - D_MARKFREE(&bpf_dtab[i]); - bpf_dtab_init = 1; - } -#else +#ifndef __APPLE__ if (bootverbose) printf("bpf: %s%d attached\n", ifp->if_name, ifp->if_unit); #endif @@ -1547,12 +1642,6 @@ splx(s); } -static void *bpf_devfs_token[NBPFILTER]; - -static int bpf_devsw_installed; - -void bpf_init __P((void *unused)); - void bpf_init(unused) void *unused; @@ -1569,11 +1658,12 @@ nbpfilter = 0; return; } - for (i = 0 ; i < nbpfilter; i++) { - bpf_devfs_token[i] = devfs_make_node(makedev(maj, i), - DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0600, - "bpf%x", i); + if (bpf_dtab_grow(NBPFILTER) == 0) { + printf("bpf_init: failed to allocate bpf_dtab\n"); + return; } + for (i = 0 ; i < NBPFILTER; i++) + bpf_make_dev_t(maj); } #else cdevsw_add(&bpf_cdevsw); diff -urN xnu-344.49/bsd/net/bpf.h xnu-517/bsd/net/bpf.h --- xnu-344.49/bsd/net/bpf.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/bpf.h Sat Oct 25 00:25:25 2003 @@ -179,6 +179,7 @@ #define DLT_FDDI 10 /* FDDI */ #define DLT_ATM_RFC1483 11 /* LLC/SNAP encapsulated atm */ #define DLT_RAW 12 /* raw IP */ +#define DLT_APPLE_IP_OVER_IEEE1394 138 /* * These are values from BSD/OS's "bpf.h". 
diff -urN xnu-344.49/bsd/net/dlil.c xnu-517/bsd/net/dlil.c --- xnu-344.49/bsd/net/dlil.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/dlil.c Sat Oct 25 00:25:55 2003 @@ -132,6 +132,16 @@ }; +struct proto_family_str { + TAILQ_ENTRY(proto_family_str) proto_fam_next; + u_long proto_family; + u_long if_family; + + int (*attach_proto)(struct ifnet *ifp, u_long *dl_tag); + int (*detach_proto)(struct ifnet *ifp, u_long dl_tag); +}; + + struct dlil_stats_str dlil_stats; @@ -147,6 +157,9 @@ static TAILQ_HEAD(, if_family_str) if_family_head; +static +TAILQ_HEAD(, proto_family_str) proto_family_head; + static ifnet_inited = 0; static u_long dl_tag_nb = 0; static u_long dlil_filters_nb = 0; @@ -154,7 +167,6 @@ int dlil_initialized = 0; decl_simple_lock_data(, dlil_input_lock) int dlil_input_thread_wakeup = 0; -int dlil_expand_mcl; static struct mbuf *dlil_input_mbuf_head = NULL; static struct mbuf *dlil_input_mbuf_tail = NULL; #if NLOOP > 1 @@ -162,11 +174,13 @@ #endif static struct mbuf *dlil_input_loop_head = NULL; static struct mbuf *dlil_input_loop_tail = NULL; +extern struct ifmultihead ifma_lostlist; static void dlil_input_thread(void); extern void run_netisr(void); extern void bpfdetach(struct ifnet*); +int dlil_expand_mcl; /* * Internal functions. @@ -185,6 +199,20 @@ return mod; } +static +struct proto_family_str *find_proto_module(u_long proto_family, u_long if_family) +{ + struct proto_family_str *mod = NULL; + + TAILQ_FOREACH(mod, &proto_family_head, proto_fam_next) { + if ((mod->proto_family == (proto_family & 0xffff)) + && (mod->if_family == (if_family & 0xffff))) + break; + } + + return mod; +} + /* * Public functions. 
@@ -296,6 +324,7 @@ TAILQ_INIT(&dlil_ifnet_head); TAILQ_INIT(&if_family_head); + TAILQ_INIT(&proto_family_head); // create the dl tag array MALLOC(dl_tag_array, void *, sizeof(struct dl_tag_str) * MAX_DL_TAGS, M_NKE, M_WAITOK); @@ -497,13 +526,11 @@ return retval; } - void dlil_input_thread_continue(void) { while (1) { struct mbuf *m, *m_loop; - int expand_mcl; usimple_lock(&dlil_input_lock); m = dlil_input_mbuf_head; @@ -514,16 +541,6 @@ dlil_input_loop_tail = NULL; usimple_unlock(&dlil_input_lock); - MBUF_LOCK(); - expand_mcl = dlil_expand_mcl; - dlil_expand_mcl = 0; - MBUF_UNLOCK(); - if (expand_mcl) { - caddr_t p; - MCLALLOC(p, M_WAIT); - if (p) MCLFREE(p); - } - /* * NOTE warning %%% attention !!!! * We should think about putting some thread starvation safeguards if @@ -565,17 +582,10 @@ void dlil_input_thread(void) { - register thread_t self = current_thread(); - extern void stack_privilege(thread_t thread); + register thread_t self = current_act(); - /* - * Make sure that this thread - * always has a kernel stack, and - * bind it to the master cpu. 
- */ - stack_privilege(self); - ml_thread_policy(current_thread(), MACHINE_GROUP, - (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); + ml_thread_policy(self, MACHINE_GROUP, + (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); /* The dlil thread is always funneled */ thread_funnel_set(network_flock, TRUE); @@ -1443,59 +1453,69 @@ int dlil_if_detach(struct ifnet *ifp) { - struct if_proto *proto; - struct dlil_filterq_entry *if_filter; - struct if_family_str *if_family; - struct dlil_filterq_head *fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; - int s; - struct kev_msg ev_msg; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(network_flock, TRUE); - s = splnet(); - - if_family = find_family_module(ifp->if_family); - - if (!if_family) { - kprintf("Attempt to detach interface without family module - %s\n", - ifp->if_name); - splx(s); - thread_funnel_set(network_flock, funnel_state); - return ENODEV; - } - - while (if_filter = TAILQ_FIRST(fhead)) - dlil_detach_filter(if_filter->filter_id); - - ifp->refcnt--; - - if (ifp->refcnt == 0) { - /* Let BPF know the interface is detaching. 
*/ - bpfdetach(ifp); + struct if_proto *proto; + struct dlil_filterq_entry *if_filter; + struct if_family_str *if_family; + struct dlil_filterq_head *fhead = (struct dlil_filterq_head *) &ifp->if_flt_head; + struct kev_msg ev_msg; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(network_flock, TRUE); + + if_family = find_family_module(ifp->if_family); + + if (!if_family) { + kprintf("Attempt to detach interface without family module - %s\n", + ifp->if_name); + thread_funnel_set(network_flock, funnel_state); + return ENODEV; + } + + while (if_filter = TAILQ_FIRST(fhead)) + dlil_detach_filter(if_filter->filter_id); + + ifp->refcnt--; + + if (ifp->refcnt > 0) { + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, 0, 0); + thread_funnel_set(network_flock, funnel_state); + return DLIL_WAIT_FOR_FREE; + } + + while (ifp->if_multiaddrs.lh_first) { + struct ifmultiaddr *ifma = ifp->if_multiaddrs.lh_first; + + /* + * When the interface is gone, we will no + * longer be listening on these multicasts. + * Various bits of the stack may be referencing + * these multicasts, so we can't just free them. + * We place them on a list so they may be cleaned + * up later as the other bits of the stack release + * them. + */ + LIST_REMOVE(ifma, ifma_link); + ifma->ifma_ifp = NULL; + LIST_INSERT_HEAD(&ifma_lostlist, ifma, ifma_link); + } + + /* Let BPF know the interface is detaching. 
*/ + bpfdetach(ifp); TAILQ_REMOVE(&ifnet, ifp, if_link); (*if_family->del_if)(ifp); - + if (--if_family->refcnt == 0) { - if (if_family->shutdown) - (*if_family->shutdown)(); - - TAILQ_REMOVE(&if_family_head, if_family, if_fam_next); - FREE(if_family, M_IFADDR); + if (if_family->shutdown) + (*if_family->shutdown)(); + + TAILQ_REMOVE(&if_family_head, if_family, if_fam_next); + FREE(if_family, M_IFADDR); } - - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, 0, 0); - splx(s); + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, 0, 0); thread_funnel_set(network_flock, funnel_state); return 0; - } - else - { - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, 0, 0); - splx(s); - thread_funnel_set(network_flock, funnel_state); - return DLIL_WAIT_FOR_FREE; - } } @@ -1605,6 +1625,126 @@ +int +dlil_reg_proto_module(u_long protocol_family, u_long interface_family, + struct dlil_protomod_reg_str *protomod_reg) +{ + struct proto_family_str *proto_family; + int s; + boolean_t funnel_state; + + + funnel_state = thread_funnel_set(network_flock, TRUE); + s = splnet(); + if (find_proto_module(protocol_family, interface_family)) { + splx(s); + thread_funnel_set(network_flock, funnel_state); + return EEXIST; + } + + if (protomod_reg->reserved[0] != 0 || protomod_reg->reserved[1] != 0 + || protomod_reg->reserved[2] != 0 || protomod_reg->reserved[3] !=0) { + splx(s); + thread_funnel_set(network_flock, funnel_state); + return EINVAL; + } + + if (protomod_reg->attach_proto == NULL) { + splx(s); + thread_funnel_set(network_flock, funnel_state); + return EINVAL; + } + + proto_family = (struct proto_family_str *) _MALLOC(sizeof(struct proto_family_str), M_IFADDR, M_WAITOK); + if (!proto_family) { + splx(s); + thread_funnel_set(network_flock, funnel_state); + return ENOMEM; + } + + bzero(proto_family, sizeof(struct proto_family_str)); + proto_family->proto_family = protocol_family; + proto_family->if_family = interface_family & 0xffff; + proto_family->attach_proto = 
protomod_reg->attach_proto; + proto_family->detach_proto = protomod_reg->detach_proto; + + TAILQ_INSERT_TAIL(&proto_family_head, proto_family, proto_fam_next); + splx(s); + thread_funnel_set(network_flock, funnel_state); + return 0; +} + +int dlil_dereg_proto_module(u_long protocol_family, u_long interface_family) +{ + struct proto_family_str *proto_family; + int s, ret = 0; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(network_flock, TRUE); + s = splnet(); + proto_family = find_proto_module(protocol_family, interface_family); + if (proto_family == 0) { + splx(s); + thread_funnel_set(network_flock, funnel_state); + return ENOENT; + } + + TAILQ_REMOVE(&proto_family_head, proto_family, proto_fam_next); + FREE(proto_family, M_IFADDR); + + splx(s); + thread_funnel_set(network_flock, funnel_state); + return ret; +} + +int dlil_plumb_protocol(u_long protocol_family, struct ifnet *ifp, u_long *dl_tag) +{ + struct proto_family_str *proto_family; + int s, ret = 0; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(network_flock, TRUE); + s = splnet(); + proto_family = find_proto_module(protocol_family, ifp->if_family); + if (proto_family == 0) { + splx(s); + thread_funnel_set(network_flock, funnel_state); + return ENOENT; + } + + ret = (*proto_family->attach_proto)(ifp, dl_tag); + + splx(s); + thread_funnel_set(network_flock, funnel_state); + return ret; +} + + +int dlil_unplumb_protocol(u_long protocol_family, struct ifnet *ifp) +{ + struct proto_family_str *proto_family; + int s, ret = 0; + u_long tag; + boolean_t funnel_state; + + funnel_state = thread_funnel_set(network_flock, TRUE); + s = splnet(); + + ret = dlil_find_dltag(ifp->if_family, ifp->if_unit, protocol_family, &tag); + + if (ret == 0) { + proto_family = find_proto_module(protocol_family, ifp->if_family); + if (proto_family && proto_family->detach_proto) + ret = (*proto_family->detach_proto)(ifp, tag); + else + ret = dlil_detach_protocol(tag); + } + + splx(s); + 
thread_funnel_set(network_flock, funnel_state); + return ret; +} + /* diff -urN xnu-344.49/bsd/net/dlil.h xnu-517/bsd/net/dlil.h --- xnu-344.49/bsd/net/dlil.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/dlil.h Sat Oct 25 00:25:55 2003 @@ -344,6 +344,161 @@ int dlil_reg_if_modules(u_long interface_family, struct dlil_ifmod_reg_str *ifmod_reg); +struct dlil_protomod_reg_str { + /* + * attach the protocol to the interface and return the dl_tag + */ + int (*attach_proto)(struct ifnet *ifp, u_long *dl_tag); + + /* + * detach the protocol from the interface. + * this is optionnal. If it is NULL, DLIL will use 0 default detach function. + */ + int (*detach_proto)(struct ifnet *ifp, u_long dl_tag); + + /* + * reserved for future use. MUST be NULL. + */ + u_long reserved[4]; +}; + +/* + +Function : dlil_reg_proto_module + + A DLIL protocol module is a piece of code that know how to handle a certain type + of protocol (PF_INET, PF_INET6, ...) for a certain family of interface (APPLE_IF_FAM_ETHERNET, + APPLE_IF_FAM_PPP, ...). + + dlil_reg_proto_module() allows the registration of such a protocol/interface handler before any + interface is attached. + Typically, the attach and detach function of the protocol handler will call + dlil_{attach/detach}_protocol with the parameter specific to the protocol. + + The goal of this modules is to insulate the actual protocol (IP, IPv6) from the DLIL details. + +Parameters : + 'protocol_family' is PF_INET, PF_INET6, ... + 'interface_family' is APPLE_IF_FAM_ETHERNET, APPLE_IF_FAM_PPP, ... + 'protomod_reg' is the protocol registration structure. + 'attach_proto' funtion is mandatory. + 'detach_proto' funtion is optional (DLIL will manage it). + +Return code : + +0 : + + No error. + +ENOMEM: + + No memory can be allocated for internal data structure. + +EEXIST: + + The protocol family has already been registered for this interface family. + +EINVAL: + + The dlil_protomod_reg_str structure contains incorrect values. 
+ +*/ + +int dlil_reg_proto_module(u_long protocol_family, u_long interface_family, + struct dlil_protomod_reg_str *protomod_reg); + +/* + +Function : dlil_dereg_proto_module + + dlil_dereg_proto_module() will unregister the protocol module previously + registered with dlil_reg_proto_module(). + + There is no restriction on when to call it. + Interfaces or protocols can be attached; this will not prevent the deregistration of the module. + +Parameters : + 'protocol_family' is PF_INET, PF_INET6, ... + 'interface_family' is APPLE_IF_FAM_ETHERNET, APPLE_IF_FAM_PPP, ... + +Return code : + +0 : + + No error. + +ENOENT: + + No module was registered. + +*/ + +int dlil_dereg_proto_module(u_long protocol_family, u_long interface_family); + +/* + +Function : dlil_plumb_protocol + + dlil_plumb_protocol() will plumb a protocol to an actual interface. + This will find a registered protocol module and call its attach function. + The module will typically call dlil_attach_protocol with the appropriate parameters, + and will return the dl_tag of the attachment. + It is up to the caller to handle the dl_tag. + Some protocols (IPv4) will stick it in their internal structure for future use. + Some other protocols (IPv6) can ignore the dl_tag. + +Parameters : + 'protocol_family' is PF_INET, PF_INET6, ... + 'ifp' is the interface to plumb the protocol to. + 'dl_tag' is the tag returned from the successful attachment. + +Return code : + +0 : + + No error. + +ENOENT: + + No module was registered. + +other: + + Error returned by the attach_proto function + +*/ +int dlil_plumb_protocol(u_long protocol_family, struct ifnet *ifp, u_long *dl_tag); + +/* + +Function : dlil_unplumb_protocol + + dlil_unplumb_protocol() will unplumb a protocol from an interface. + This will find a registered protocol module and call its detach function. + The module will typically call dlil_detach_protocol with the appropriate parameters. + If no module is found, this function will call dlil_detach_protocol directly.
+ +Parameters : + 'protocol_family' is PF_INET, PF_INET6, ... + 'ifp' is the interface to unplumb the protocol from. + +Return code : + +0 : + + No error. + +ENOENT: + + No module was registered. + +other: + + Error returned by the detach_proto function (or by dlil_detach_protocol when the module has none) + +*/ +int dlil_unplumb_protocol(u_long protocol_family, struct ifnet *ifp); + int dlil_inject_if_input(struct mbuf *m, char *frame_header, u_long from_id); diff -urN xnu-344.49/bsd/net/ether_if_module.c xnu-517/bsd/net/ether_if_module.c --- xnu-344.49/bsd/net/ether_if_module.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/ether_if_module.c Sat Oct 25 00:25:55 2003 @@ -139,6 +139,10 @@ static struct ether_desc_blk_str ether_desc_blk[MAX_INTERFACES]; +/* from if_ethersubr.c */ +int ether_resolvemulti __P((struct ifnet *, struct sockaddr **, + struct sockaddr *)); + /* * Release all descriptor entries owned by this dl_tag (there may be several). * Setting the type to 0 releases the entry. Eventually we should compact-out @@ -500,6 +504,7 @@ ifp->if_framer = ether_frameout; ifp->if_demux = ether_demux; ifp->if_event = 0; + ifp->if_resolvemulti = ether_resolvemulti; for (i=0; i < MAX_INTERFACES; i++) if (ether_desc_blk[i].n_count == 0) @@ -605,10 +610,15 @@ } +extern int ether_attach_inet(struct ifnet *ifp, u_long *dl_tag); +extern int ether_detach_inet(struct ifnet *ifp, u_long dl_tag); +extern int ether_attach_inet6(struct ifnet *ifp, u_long *dl_tag); +extern int ether_detach_inet6(struct ifnet *ifp, u_long dl_tag); int ether_family_init() { - int i; + int i, error=0; struct dlil_ifmod_reg_str ifmod_reg; + struct dlil_protomod_reg_str enet_protoreg; /* ethernet family is built-in, called from bsd_init */ thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); @@ -630,7 +640,23 @@ for (i=0; i < MAX_INTERFACES; i++) ether_desc_blk[i].n_count = 0; + /* Register protocol registration functions */ + + bzero(&enet_protoreg, sizeof(enet_protoreg)); + enet_protoreg.attach_proto = ether_attach_inet; + enet_protoreg.detach_proto = 
ether_detach_inet; + + if ((error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_ETHERNET, &enet_protoreg)) != 0) + kprintf("dlil_reg_proto_module failed for AF_INET error=%d\n", error); + + + enet_protoreg.attach_proto = ether_attach_inet6; + enet_protoreg.detach_proto = ether_detach_inet6; + + if ((error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_ETHERNET, &enet_protoreg)) != 0) + kprintf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", error); + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return 0; + return (error); } diff -urN xnu-344.49/bsd/net/ether_inet6_pr_module.c xnu-517/bsd/net/ether_inet6_pr_module.c --- xnu-344.49/bsd/net/ether_inet6_pr_module.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/ether_inet6_pr_module.c Sat Oct 25 00:25:55 2003 @@ -257,7 +257,7 @@ if (!nd6_storelladdr(&ac->ac_if, rt, m, dst_netaddr, (u_char *)edst)) { /* this must be impossible, so we bark */ printf("nd6_storelladdr failed\n"); - return(0); + return(EADDRNOTAVAIL); /* dlil_output will free the mbuf */ } *(u_short *)type = htons(ETHERTYPE_IPV6); break; @@ -266,6 +266,7 @@ printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, dst_netaddr->sa_family); + /* dlil_output will free the mbuf */ return EAFNOSUPPORT; } @@ -372,19 +373,18 @@ -u_long ether_attach_inet6(struct ifnet *ifp) +int ether_attach_inet6(struct ifnet *ifp, u_long *dl_tag) { struct dlil_proto_reg_str reg; struct dlil_demux_desc desc; - u_long ip_dl_tag=0; u_short en_6native=ETHERTYPE_IPV6; int stat; int i; - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &ip_dl_tag); + stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, dl_tag); if (stat == 0) - return ip_dl_tag; + return stat; TAILQ_INIT(&reg.demux_desc_head); desc.type = DLIL_DESC_RAW; @@ -403,23 +403,21 @@ reg.default_proto = 0; reg.protocol_family = PF_INET6; - stat = dlil_attach_protocol(&reg, &ip_dl_tag); + stat = dlil_attach_protocol(&reg, dl_tag); if (stat) { printf("WARNING: ether_attach_inet6 can't attach ip
to interface\n"); - return stat; } - return ip_dl_tag; + return stat; } -int ether_detach_inet6(struct ifnet *ifp) +int ether_detach_inet6(struct ifnet *ifp, u_long dl_tag) { - u_long ip_dl_tag = 0; int stat; - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &ip_dl_tag); + stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &dl_tag); if (stat == 0) { - stat = dlil_detach_protocol(ip_dl_tag); + stat = dlil_detach_protocol(dl_tag); if (stat) { printf("WARNING: ether_detach_inet6 can't detach ip6 from interface\n"); } diff -urN xnu-344.49/bsd/net/ether_inet_pr_module.c xnu-517/bsd/net/ether_inet_pr_module.c --- xnu-344.49/bsd/net/ether_inet_pr_module.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/ether_inet_pr_module.c Sat Oct 25 00:25:55 2003 @@ -365,19 +365,6 @@ if (ifp->if_init) ifp->if_init(ifp->if_softc); /* before arpwhohas */ - // - // See if another station has *our* IP address. - // i.e.: There is an address conflict! If a - // conflict exists, a message is sent to the - // console. 
- // - if (IA_SIN(ifa)->sin_addr.s_addr != 0) - { - /* don't bother for 0.0.0.0 */ - ac->ac_ipaddr = IA_SIN(ifa)->sin_addr; - arpwhohas(ac, &IA_SIN(ifa)->sin_addr); - } - arp_ifinit(IFP2AC(ifp), ifa); /* @@ -425,22 +412,21 @@ -u_long -ether_attach_inet(struct ifnet *ifp) +int +ether_attach_inet(struct ifnet *ifp, u_long *dl_tag) { struct dlil_proto_reg_str reg; struct dlil_demux_desc desc; struct dlil_demux_desc desc2; - u_long ip_dl_tag=0; u_short en_native=ETHERTYPE_IP; u_short arp_native=ETHERTYPE_ARP; int stat; int i; - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET, &ip_dl_tag); + stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET, dl_tag); if (stat == 0) - return ip_dl_tag; + return (stat); TAILQ_INIT(®.demux_desc_head); desc.type = DLIL_DESC_RAW; @@ -463,22 +449,21 @@ desc2.native_type = (char *) &arp_native; TAILQ_INSERT_TAIL(®.demux_desc_head, &desc2, next); - stat = dlil_attach_protocol(®, &ip_dl_tag); + stat = dlil_attach_protocol(®, dl_tag); if (stat) { printf("WARNING: ether_attach_inet can't attach ip to interface\n"); return stat; } - return ip_dl_tag; + return (0); } -int ether_detach_inet(struct ifnet *ifp) +int ether_detach_inet(struct ifnet *ifp, u_long dl_tag) { - u_long ip_dl_tag = 0; int stat; - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET, &ip_dl_tag); + stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET, &dl_tag); if (stat == 0) { - stat = dlil_detach_protocol(ip_dl_tag); + stat = dlil_detach_protocol(dl_tag); if (stat) { printf("WARNING: ether_detach_inet can't detach ip from interface\n"); } diff -urN xnu-344.49/bsd/net/ethernet.h xnu-517/bsd/net/ethernet.h --- xnu-344.49/bsd/net/ethernet.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/ethernet.h Sat Oct 25 00:25:55 2003 @@ -121,7 +121,7 @@ int ether_hostton __P((char *, struct ether_addr *)); int ether_line __P((char *, struct ether_addr *, char *)); -char *ether_ntoa __P((struct ether_addr *)); +char *ether_ntoa __P((const struct 
ether_addr *)); int ether_ntohost __P((char *, struct ether_addr *)); __END_DECLS #endif /* !KERNEL */ diff -urN xnu-344.49/bsd/net/firewire.h xnu-517/bsd/net/firewire.h --- xnu-344.49/bsd/net/firewire.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/net/firewire.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Fundamental constants relating to FireWire network device. + */ + +#ifndef _NET_FIREWIRE_H_ +#define _NET_FIREWIRE_H_ + +#include + +/* + * The number of bytes in a FireWire EUI-64. + */ +#define FIREWIRE_EUI64_LEN 8 + +/* + * The number of bytes in the type field. + */ +#define FIREWIRE_TYPE_LEN 2 + +/* + * The length of the header provided by the FireWire network device. + */ +#define FIREWIRE_HDR_LEN (FIREWIRE_EUI64_LEN*2+FIREWIRE_TYPE_LEN) + +/* + * The minimum packet length. + */ +#define FIREWIRE_MIN_LEN 64 + +/* + * The maximum packet length. 
+ */ +#define FIREWIRE_MAX_LEN 4096 + +/* + * A macro to validate a length with + */ +#define FIREWIRE_IS_VALID_LEN(foo) \ + ((foo) >= FIREWIRE_MIN_LEN && (foo) <= FIREWIRE_MAX_LEN) + +/* + * Structure of header provided by the FireWire network device. + * + * The device uses a simplified header with just the non-changing + * EUI-64 addresses and ethernet type specified; + */ +struct firewire_header { + u_char firewire_dhost[FIREWIRE_EUI64_LEN]; + u_char firewire_shost[FIREWIRE_EUI64_LEN]; + u_short firewire_type; /* ethertype */ +}; + +/* + * Format of FireWire EUI-64. + */ +struct firewire_eui64 { + u_char octet[FIREWIRE_EUI64_LEN]; +}; + +/* + * Format of FireWire hardware address. + */ +struct firewire_address { + u_char eui64[FIREWIRE_EUI64_LEN]; + u_char maxRec; + u_char spd; + u_int16_t unicastFifoHi; + u_int32_t unicastFifoLo; +}; + +#define FIREWIRE_ADDR_LEN 16 /* sizeof(struct firewire_address) */ + + +#define FIREWIRE_MTU (FIREWIRE_MAX_LEN - FIREWIRE_HDR_LEN) +#define FIREWIRE_MIN (FIREWIRE_MIN_LEN - FIREWIRE_HDR_LEN) + +#endif /* !_NET_FIREWIRE_H_ */ diff -urN xnu-344.49/bsd/net/if.c xnu-517/bsd/net/if.c --- xnu-344.49/bsd/net/if.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if.c Sat Oct 25 00:25:55 2003 @@ -107,6 +107,7 @@ int ifqmaxlen = IFQ_MAXLEN; struct ifnethead ifnet; /* depend on static init XXX */ +struct ifmultihead ifma_lostlist = LIST_HEAD_INITIALIZER(ifma_lostlist); #if INET6 /* @@ -114,7 +115,6 @@ * should be more generalized? 
*/ extern void nd6_setmtu __P((struct ifnet *)); -extern int ip6_auto_on; #endif /* @@ -154,7 +154,9 @@ } TAILQ_INSERT_TAIL(&ifnet, ifp, if_link); - ifp->if_index = ++if_index; + /* if the interface is recycled, keep the index */ + if (!((ifp->if_eflags & IFEF_REUSE) && ifp->if_index)) + ifp->if_index = ++if_index; /* * XXX - * The old code would work if the interface passed a pre-existing @@ -226,6 +228,28 @@ } } +__private_extern__ int +ifa_foraddr(addr) + unsigned int addr; +{ + register struct ifnet *ifp; + register struct ifaddr *ifa; + register unsigned int addr2; + + + for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) + for (ifa = ifp->if_addrhead.tqh_first; ifa; + ifa = ifa->ifa_link.tqe_next) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + addr2 = IA_SIN(ifa)->sin_addr.s_addr; + + if (addr == addr2) + return (1); + } + return (0); +} + /* * Locate an interface based on a complete address. */ @@ -498,10 +522,6 @@ pfctlinput(PRC_IFUP, ifa->ifa_addr); rt_ifmsg(ifp); -#if INET6 - if (ip6_auto_on) /* Only if IPv6 is on on configured on on all ifs */ - in6_if_up(ifp); -#endif } /* @@ -1220,41 +1240,65 @@ return 0; } -/* - * Remove a reference to a multicast address on this interface. Yell - * if the request does not match an existing membership. 
- */ int -if_delmulti(ifp, sa) - struct ifnet *ifp; - struct sockaddr *sa; +if_delmultiaddr(struct ifmultiaddr *ifma) { - struct ifmultiaddr *ifma; - int s; - - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) - if (equal(sa, ifma->ifma_addr)) - break; - if (ifma == 0) - return ENOENT; - + struct sockaddr *sa; + struct ifnet *ifp; + + /* Verify ifma is valid */ + { + struct ifmultiaddr *match = NULL; + for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) { + for (match = ifp->if_multiaddrs.lh_first; match; match = match->ifma_link.le_next) { + if (match->ifma_ifp != ifp) { + printf("if_delmultiaddr: ifma (%x) on ifp i(%s) is stale\n", + match, if_name(ifp)); + return (0) ; /* swallow error ? */ + } + if (match == ifma) + break; + } + if (match == ifma) + break; + } + if (match != ifma) { + for (match = ifma_lostlist.lh_first; match; match = match->ifma_link.le_next) { + if (match->ifma_ifp != NULL) { + printf("if_delmultiaddr: item on lost list (%x) contains non-null ifp=%s\n", + match, if_name(match->ifma_ifp)); + return (0) ; /* swallow error ? */ + } + if (match == ifma) + break; + } + } + + if (match != ifma) { + printf("if_delmultiaddr: ifma 0x%X is invalid\n", ifma); + return 0; + } + } + if (ifma->ifma_refcount > 1) { ifma->ifma_refcount--; return 0; } - rt_newmaddrmsg(RTM_DELMADDR, ifma); sa = ifma->ifma_lladdr; - s = splimp(); + + if (sa) /* send a routing msg for network addresses only */ + rt_newmaddrmsg(RTM_DELMADDR, ifma); + + ifp = ifma->ifma_ifp; + LIST_REMOVE(ifma, ifma_link); /* * Make sure the interface driver is notified * in the case of a link layer mcast group being left. */ - if (ifma->ifma_addr->sa_family == AF_LINK && sa == 0) + if (ifp && ifma->ifma_addr->sa_family == AF_LINK && sa == 0) dlil_ioctl(0, ifp, SIOCDELMULTI, 0); - splx(s); FREE(ifma->ifma_addr, M_IFMADDR); FREE(ifma, M_IFMADDR); if (sa == 0) @@ -1271,27 +1315,41 @@ * in the record for the link-layer address. 
(So we don't complain * in that case.) */ - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) + if (ifp) + ifma = ifp->if_multiaddrs.lh_first; + else + ifma = ifma_lostlist.lh_first; + for (; ifma; ifma = ifma->ifma_link.le_next) if (equal(sa, ifma->ifma_addr)) break; - if (ifma == 0) - return 0; - - if (ifma->ifma_refcount > 1) { - ifma->ifma_refcount--; + + FREE(sa, M_IFMADDR); + if (ifma == 0) { return 0; } - s = splimp(); - LIST_REMOVE(ifma, ifma_link); - dlil_ioctl(0, ifp, SIOCDELMULTI, (caddr_t) 0); - splx(s); - FREE(ifma->ifma_addr, M_IFMADDR); - FREE(sa, M_IFMADDR); - FREE(ifma, M_IFMADDR); + return if_delmultiaddr(ifma); +} - return 0; +/* + * Remove a reference to a multicast address on this interface. Yell + * if the request does not match an existing membership. + */ +int +if_delmulti(ifp, sa) + struct ifnet *ifp; + struct sockaddr *sa; +{ + struct ifmultiaddr *ifma; + + for (ifma = ifp->if_multiaddrs.lh_first; ifma; + ifma = ifma->ifma_link.le_next) + if (equal(sa, ifma->ifma_addr)) + break; + if (ifma == 0) + return ENOENT; + + return if_delmultiaddr(ifma); } diff -urN xnu-344.49/bsd/net/if.h xnu-517/bsd/net/if.h --- xnu-344.49/bsd/net/if.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if.h Sat Oct 25 00:25:55 2003 @@ -120,6 +120,7 @@ /* extended flags definitions: (all bits are reserved for internal/future use) */ #define IFEF_AUTOCONFIGURING 0x1 #define IFEF_DVR_REENTRY_OK 0x20 /* When set, driver may be reentered from its own thread */ +#define IFEF_ACCEPT_RTADVD 0x40 /* set to accept IPv6 router advertisement on the interface */ #define IFEF_INUSE 0x40000000 /* DLIL ifnet recycler, ifnet in use */ #define IFEF_REUSE 0x20000000 /* DLIL ifnet recycler, ifnet is not new */ #endif /* KERNEL_PRIVATE */ diff -urN xnu-344.49/bsd/net/if_arp.h xnu-517/bsd/net/if_arp.h --- xnu-344.49/bsd/net/if_arp.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if_arp.h Sat Oct 25 00:25:55 2003 @@ -78,6 +78,8 @@ #define ARPHRD_ETHER 1 /* 
ethernet hardware format */ #define ARPHRD_IEEE802 6 /* token-ring hardware format */ #define ARPHRD_FRELAY 15 /* frame relay hardware format */ +#define ARPHRD_IEEE1394 24 /* IEEE1394 hardware address */ +#define ARPHRD_IEEE1394_EUI64 27 /* IEEE1394 EUI-64 */ u_short ar_pro; /* format of protocol address */ u_char ar_hln; /* length of hardware address */ u_char ar_pln; /* length of protocol address */ diff -urN xnu-344.49/bsd/net/if_atm.h xnu-517/bsd/net/if_atm.h --- xnu-344.49/bsd/net/if_atm.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if_atm.h Sat Oct 25 00:25:55 2003 @@ -67,8 +67,6 @@ #define RTALLOC1(A,B) rtalloc1((A),(B),0UL) #endif -#warning if_atm.h is not used by the darwin kernel - /* * pseudo header for packet transmission diff -urN xnu-344.49/bsd/net/if_ethersubr.c xnu-517/bsd/net/if_ethersubr.c --- xnu-344.49/bsd/net/if_ethersubr.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if_ethersubr.c Sat Oct 25 00:25:55 2003 @@ -102,8 +102,6 @@ #include #endif /* NVLAN > 0 */ -static int ether_resolvemulti __P((struct ifnet *, struct sockaddr **, - struct sockaddr *)); extern u_char etherbroadcastaddr[]; #define senderr(e) do { error = (e); goto bad;} while (0) #define IFP2AC(IFP) ((struct arpcom *)IFP) @@ -132,7 +130,6 @@ ifp->if_addrlen = 6; ifp->if_hdrlen = 14; ifp->if_mtu = ETHERMTU; - ifp->if_resolvemulti = ether_resolvemulti; if (ifp->if_baudrate == 0) ifp->if_baudrate = 10000000; diff -urN xnu-344.49/bsd/net/if_faith.c xnu-517/bsd/net/if_faith.c --- xnu-344.49/bsd/net/if_faith.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if_faith.c Sat Oct 25 00:25:55 2003 @@ -178,29 +178,10 @@ return 0; } -void faith_reg_if_mods() -{ - struct dlil_ifmod_reg_str faith_ifmod; - - bzero(&faith_ifmod, sizeof(faith_ifmod)); - faith_ifmod.add_if = faith_add_if; - faith_ifmod.del_if = faith_del_if; - faith_ifmod.add_proto = faith_add_proto; - faith_ifmod.del_proto = faith_del_proto; - faith_ifmod.ifmod_ioctl = 0; - faith_ifmod.shutdown = faith_shutdown; - - - if 
(dlil_reg_if_modules(APPLE_IF_FAM_FAITH, &faith_ifmod)) - panic("Couldn't register faith modules\n"); - -} - -u_long faith_attach_inet(struct ifnet *ifp) +int faith_attach_inet(struct ifnet *ifp, u_long *dl_tag) { struct dlil_proto_reg_str reg; struct dlil_demux_desc desc; - u_long dl_tag=0; short native=0; int stat; int i; @@ -212,7 +193,8 @@ kprintf("faith_array for %s%d found dl_tag=%d\n", ifp->if_name, ifp->if_unit, faith_array[i]->dl_tag); #endif - return faith_array[i]->dl_tag; + *dl_tag = faith_array[i]->dl_tag; + return 0; } } @@ -234,14 +216,44 @@ reg.default_proto = 0; reg.protocol_family = PF_INET; - stat = dlil_attach_protocol(®, &dl_tag); + stat = dlil_attach_protocol(®, dl_tag); if (stat) { panic("faith_attach_inet can't attach interface\n"); } - return dl_tag; + return stat; } +void faith_reg_if_mods() +{ + struct dlil_ifmod_reg_str faith_ifmod; + struct dlil_protomod_reg_str faith_protoreg; + int error; + + bzero(&faith_ifmod, sizeof(faith_ifmod)); + faith_ifmod.add_if = faith_add_if; + faith_ifmod.del_if = faith_del_if; + faith_ifmod.add_proto = faith_add_proto; + faith_ifmod.del_proto = faith_del_proto; + faith_ifmod.ifmod_ioctl = 0; + faith_ifmod.shutdown = faith_shutdown; + + + if (dlil_reg_if_modules(APPLE_IF_FAM_FAITH, &faith_ifmod)) + panic("Couldn't register faith modules\n"); + + /* Register protocol registration functions */ + + bzero(&faith_protoreg, sizeof(faith_protoreg)); + faith_protoreg.attach_proto = faith_attach_inet; + faith_protoreg.detach_proto = 0; + + if ( error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_FAITH, &faith_protoreg) != 0) + kprintf("dlil_reg_proto_module failed for AF_INET error=%d\n", error); + + +} + void faithattach(void) { diff -urN xnu-344.49/bsd/net/if_gif.c xnu-517/bsd/net/if_gif.c --- xnu-344.49/bsd/net/if_gif.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if_gif.c Sat Oct 25 00:25:55 2003 @@ -296,17 +296,55 @@ return (stat); } +int gif_attach_inet(struct ifnet *ifp, u_long *dl_tag) { + *dl_tag = 
gif_attach_proto_family(ifp, AF_INET); + return 0; +} + +int gif_detach_inet(struct ifnet *ifp, u_long dl_tag) { + gif_detach_proto_family(ifp, AF_INET); + return 0; +} + +int gif_attach_inet6(struct ifnet *ifp, u_long *dl_tag) { + *dl_tag = gif_attach_proto_family(ifp, AF_INET6); + return 0; +} + +int gif_detach_inet6(struct ifnet *ifp, u_long dl_tag) { + gif_detach_proto_family(ifp, AF_INET6); + return 0; +} #endif /* Function to setup the first gif interface */ void gifattach(void) { + struct dlil_protomod_reg_str gif_protoreg; + int error; + /* Init the list of interfaces */ TAILQ_INIT(&gifs); gif_reg_if_mods(); /* DLIL modules */ + /* Register protocol registration functions */ + + bzero(&gif_protoreg, sizeof(gif_protoreg)); + gif_protoreg.attach_proto = gif_attach_inet; + gif_protoreg.detach_proto = gif_detach_inet; + + if ( error = dlil_reg_proto_module(AF_INET, APPLE_IF_FAM_GIF, &gif_protoreg) != 0) + printf("dlil_reg_proto_module failed for AF_INET error=%d\n", error); + + gif_protoreg.attach_proto = gif_attach_inet6; + gif_protoreg.detach_proto = gif_detach_inet6; + + if ( error = dlil_reg_proto_module(AF_INET6, APPLE_IF_FAM_GIF, &gif_protoreg) != 0) + printf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", error); + + /* Create first device */ gif_create_dev(); } @@ -463,6 +501,7 @@ log(LOG_NOTICE, "gif_output: recursively called too many times(%d)\n", called); + m_freem(m); /* free it here not in dlil_output*/ error = EIO; /* is there better errno? 
*/ goto end; } @@ -471,6 +510,7 @@ m->m_flags &= ~(M_BCAST|M_MCAST); if (!(ifp->if_flags & IFF_UP) || sc->gif_psrc == NULL || sc->gif_pdst == NULL) { + m_freem(m); /* free it here not in dlil_output */ error = ENETDOWN; goto end; } @@ -518,8 +558,11 @@ end: called = 0; /* reset recursion counter */ - if (error) + if (error) { + /* the mbuf was freed either by in_gif_output or in here */ + *m0 = NULL; /* avoid getting dlil_output freeing it */ ifp->if_oerrors++; + } if (error == 0) error = EJUSTRETURN; /* if no error, packet got sent already */ return error; diff -urN xnu-344.49/bsd/net/if_llc.h xnu-517/bsd/net/if_llc.h --- xnu-344.49/bsd/net/if_llc.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if_llc.h Sat Oct 25 00:25:55 2003 @@ -76,7 +76,7 @@ struct { u_char control; u_char format_id; - u_char class; + u_char class_id; u_char window_x2; } type_u; struct { @@ -111,7 +111,7 @@ #define llc_control llc_un.type_u.control #define llc_control_ext llc_un.type_raw.control_ext #define llc_fid llc_un.type_u.format_id -#define llc_class llc_un.type_u.class +#define llc_class llc_un.type_u.class_id #define llc_window llc_un.type_u.window_x2 #define llc_frmrinfo llc_un.type_frmr.frmrinfo #define llc_frmr_pdu0 llc_un.type_frmr.frmrinfo.rej_pdu0 diff -urN xnu-344.49/bsd/net/if_loop.c xnu-517/bsd/net/if_loop.c --- xnu-344.49/bsd/net/if_loop.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if_loop.c Sat Oct 25 00:25:55 2003 @@ -353,7 +353,7 @@ ifq = &atalkintrq; isr = NETISR_APPLETALK; break; -#endif NETAT +#endif /* NETAT */ default: return (EAFNOSUPPORT); } @@ -498,37 +498,20 @@ return 0; } - -void lo_reg_if_mods() -{ - struct dlil_ifmod_reg_str lo_ifmod; - - bzero(&lo_ifmod, sizeof(lo_ifmod)); - lo_ifmod.add_if = lo_add_if; - lo_ifmod.del_if = lo_del_if; - lo_ifmod.add_proto = lo_add_proto; - lo_ifmod.del_proto = lo_del_proto; - lo_ifmod.ifmod_ioctl = 0; - lo_ifmod.shutdown = lo_shutdown; - - if (dlil_reg_if_modules(APPLE_IF_FAM_LOOPBACK, &lo_ifmod)) - panic("Couldn't register 
lo modules\n"); -} - - -u_long lo_attach_inet(struct ifnet *ifp) +int lo_attach_inet(struct ifnet *ifp, u_long *dl_tag) { struct dlil_proto_reg_str reg; struct dlil_demux_desc desc; - u_long dl_tag=0; short native=0; - int stat; + int stat =0 ; int i; for (i=0; i < lo_count; i++) { if ((lo_array[i]) && (lo_array[i]->ifp == ifp)) { - if (lo_array[i]->protocol_family == PF_INET) - return lo_array[i]->dl_tag; + if (lo_array[i]->protocol_family == PF_INET) { + *dl_tag = lo_array[i]->dl_tag; + return (0); + } } } @@ -549,27 +532,28 @@ reg.default_proto = 0; reg.protocol_family = PF_INET; - stat = dlil_attach_protocol(®, &dl_tag); - if (stat) { - panic("lo_attach_inet can't attach interface\n"); - } + stat = dlil_attach_protocol(®, dl_tag); + + if (stat) + printf("lo_attach_inet: dlil_attach_protocol returned=%d\n", stat); - return dl_tag; + return stat; } -u_long lo_attach_inet6(struct ifnet *ifp) +int lo_attach_inet6(struct ifnet *ifp, u_long *dl_tag) { struct dlil_proto_reg_str reg; struct dlil_demux_desc desc; - u_long dl_tag=0; short native=0; int stat; int i; for (i=0; i < lo_count; i++) { if ((lo_array[i]) && (lo_array[i]->ifp == ifp)) { - if (lo_array[i]->protocol_family == PF_INET6) - return lo_array[i]->dl_tag; + if (lo_array[i]->protocol_family == PF_INET6) { + *dl_tag = lo_array[i]->dl_tag; + return (0); + } } } @@ -590,14 +574,47 @@ reg.default_proto = 0; reg.protocol_family = PF_INET6; - stat = dlil_attach_protocol(®, &dl_tag); - if (stat) { - panic("lo_attach_inet6 can't attach interface\n"); - } + stat = dlil_attach_protocol(®, dl_tag); + + if (stat) + printf("lo_attach_inet6: dlil_attach_protocol returned=%d\n", stat); - return dl_tag; + return stat; } +void lo_reg_if_mods() +{ + struct dlil_ifmod_reg_str lo_ifmod; + struct dlil_protomod_reg_str lo_protoreg; + int error; + + bzero(&lo_ifmod, sizeof(lo_ifmod)); + lo_ifmod.add_if = lo_add_if; + lo_ifmod.del_if = lo_del_if; + lo_ifmod.add_proto = lo_add_proto; + lo_ifmod.del_proto = lo_del_proto; + 
lo_ifmod.ifmod_ioctl = 0; + lo_ifmod.shutdown = lo_shutdown; + + if (dlil_reg_if_modules(APPLE_IF_FAM_LOOPBACK, &lo_ifmod)) + panic("Couldn't register lo modules\n"); + + /* Register protocol registration functions */ + + bzero(&lo_protoreg, sizeof(lo_protoreg)); + lo_protoreg.attach_proto = lo_attach_inet; + lo_protoreg.detach_proto = NULL; /* no detach function for loopback */ + + if ( error = dlil_reg_proto_module(PF_INET, APPLE_IF_FAM_LOOPBACK, &lo_protoreg) != 0) + printf("dlil_reg_proto_module failed for AF_INET error=%d\n", error); + + lo_protoreg.attach_proto = lo_attach_inet6; + lo_protoreg.detach_proto = NULL; + + if ( error = dlil_reg_proto_module(PF_INET6, APPLE_IF_FAM_LOOPBACK, &lo_protoreg) != 0) + printf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", error); + +} int lo_set_bpf_tap(struct ifnet *ifp, int mode, int (*bpf_callback)(struct ifnet *, struct mbuf *)) { diff -urN xnu-344.49/bsd/net/if_stf.c xnu-517/bsd/net/if_stf.c --- xnu-344.49/bsd/net/if_stf.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/if_stf.c Sat Oct 25 00:25:55 2003 @@ -145,7 +145,7 @@ extern struct domain inetdomain; struct protosw in_stf_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, - in_stf_input, rip_output, 0, rip_ctloutput, + in_stf_input, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, 0, @@ -209,33 +209,17 @@ return 0; } -void stf_reg_if_mods() -{ - struct dlil_ifmod_reg_str stf_ifmod; - - bzero(&stf_ifmod, sizeof(stf_ifmod)); - stf_ifmod.add_if = stf_add_if; - stf_ifmod.del_if = stf_del_if; - stf_ifmod.add_proto = stf_add_proto; - stf_ifmod.del_proto = stf_del_proto; - stf_ifmod.ifmod_ioctl = 0; - stf_ifmod.shutdown = stf_shutdown; - - - if (dlil_reg_if_modules(APPLE_IF_FAM_STF, &stf_ifmod)) - panic("Couldn't register stf modules\n"); - -} - -u_long stf_attach_inet6(struct ifnet *ifp) +int stf_attach_inet6(struct ifnet *ifp, u_long *dl_tag) { struct dlil_proto_reg_str reg; struct dlil_demux_desc desc; short native=0; int stat, i; - if (stf_dl_tag != 0) - 
return stf_dl_tag; + if (stf_dl_tag != 0) { + *dl_tag = stf_dl_tag; + return 0; + } TAILQ_INIT(®.demux_desc_head); desc.type = DLIL_DESC_RAW; @@ -255,21 +239,18 @@ reg.protocol_family = PF_INET6; stat = dlil_attach_protocol(®, &stf_dl_tag); - if (stat) { - panic("stf_attach_inet6 can't attach interface\n"); - } + *dl_tag = stf_dl_tag; - return stf_dl_tag; + return stat; } -u_long stf_detach_inet6(struct ifnet *ifp) +int stf_detach_inet6(struct ifnet *ifp, u_long dl_tag) { - u_long ip_dl_tag = 0; int stat; - stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, AF_INET6, &ip_dl_tag); + stat = dlil_find_dltag(ifp->if_family, ifp->if_unit, AF_INET6, &dl_tag); if (stat == 0) { - stat = dlil_detach_protocol(ip_dl_tag); + stat = dlil_detach_protocol(dl_tag); if (stat) { printf("WARNING: stf_detach can't detach IP AF_INET6 from interface\n"); } @@ -277,6 +258,33 @@ return (stat); } +void stf_reg_if_mods() +{ + struct dlil_ifmod_reg_str stf_ifmod; + struct dlil_protomod_reg_str stf_protoreg; + int error; + + bzero(&stf_ifmod, sizeof(stf_ifmod)); + stf_ifmod.add_if = stf_add_if; + stf_ifmod.del_if = stf_del_if; + stf_ifmod.add_proto = stf_add_proto; + stf_ifmod.del_proto = stf_del_proto; + stf_ifmod.ifmod_ioctl = 0; + stf_ifmod.shutdown = stf_shutdown; + + + if (dlil_reg_if_modules(APPLE_IF_FAM_STF, &stf_ifmod)) + panic("Couldn't register stf modules\n"); + + /* Register protocol registration functions */ + + bzero(&stf_protoreg, sizeof(stf_protoreg)); + stf_protoreg.attach_proto = stf_attach_inet6; + stf_protoreg.detach_proto = stf_detach_inet6; + + if ( error = dlil_reg_proto_module(AF_INET6, APPLE_IF_FAM_STF, &stf_protoreg) != 0) + kprintf("dlil_reg_proto_module failed for AF_INET6 error=%d\n", error); +} void stfattach(void) @@ -753,6 +761,8 @@ ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; splx(s); + + return; } /* ARGSUSED */ diff -urN xnu-344.49/bsd/net/if_var.h xnu-517/bsd/net/if_var.h --- xnu-344.49/bsd/net/if_var.h Thu Sep 18 21:01:01 2003 +++ 
xnu-517/bsd/net/if_var.h Sat Oct 25 00:25:55 2003 @@ -74,7 +74,8 @@ #define APPLE_IF_FAM_MDECAP 9 #define APPLE_IF_FAM_GIF 10 #define APPLE_IF_FAM_FAITH 11 -#define APPLE_IF_FAM_STF 12 +#define APPLE_IF_FAM_STF 12 +#define APPLE_IF_FAM_FIREWIRE 13 #endif /* @@ -490,6 +491,7 @@ struct ifmultiaddr **)); int if_allmulti __P((struct ifnet *, int)); void if_attach __P((struct ifnet *)); +int if_delmultiaddr __P((struct ifmultiaddr *ifma)); int if_delmulti __P((struct ifnet *, struct sockaddr *)); void if_down __P((struct ifnet *)); void if_route __P((struct ifnet *, int flag, int fam)); diff -urN xnu-344.49/bsd/net/ndrv.c xnu-517/bsd/net/ndrv.c --- xnu-344.49/bsd/net/ndrv.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/ndrv.c Sat Oct 25 00:25:55 2003 @@ -241,6 +241,10 @@ #if NDRV_DEBUG kprintf("NDRV attach: %x, %x, %x\n", so, proto, np); #endif + + if ((error = soreserve(so, ndrv_sendspace, ndrv_recvspace))) + return(error); + MALLOC(np, struct ndrv_cb *, sizeof(*np), M_PCB, M_WAITOK); if (np == NULL) return (ENOMEM); @@ -249,8 +253,6 @@ #if NDRV_DEBUG kprintf("NDRV attach: %x, %x, %x\n", so, proto, np); #endif - if ((error = soreserve(so, ndrv_sendspace, ndrv_recvspace))) - return(error); TAILQ_INIT(&np->nd_dlist); np->nd_signature = NDRV_SIGNATURE; np->nd_socket = so; @@ -600,7 +602,7 @@ struct ndrv_cb* cur_np = NULL; struct socket *so = np->nd_socket; struct ndrv_multicast* next; - int error; + int error = 0; #if NDRV_DEBUG kprintf("NDRV detach: %x, %x\n", so, np); diff -urN xnu-344.49/bsd/net/netisr.h xnu-517/bsd/net/netisr.h --- xnu-344.49/bsd/net/netisr.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/netisr.h Sat Oct 25 00:25:55 2003 @@ -91,4 +91,4 @@ #endif /* defined(KERNEL) && !defined(LOCORE) */ #define schednetisr(anisr) { netisr |= 1<<(anisr); setsoftnet(); } -#endif __APPLE_API_PRIVATE +#endif /* __APPLE_API_PRIVATE */ diff -urN xnu-344.49/bsd/net/pfkeyv2.h xnu-517/bsd/net/pfkeyv2.h --- xnu-344.49/bsd/net/pfkeyv2.h Thu Sep 18 21:01:01 2003 +++ 
xnu-517/bsd/net/pfkeyv2.h Sat Oct 25 00:25:55 2003 @@ -128,6 +128,15 @@ u_int32_t sadb_sa_flags; }; +#ifdef __APPLE_API_PRIVATE +struct sadb_sa_2 { + struct sadb_sa sa; + u_int16_t sadb_sa_natt_port; + u_int16_t sadb_reserved0; + u_int32_t sadb_reserved1; +}; +#endif + struct sadb_lifetime { u_int16_t sadb_lifetime_len; u_int16_t sadb_lifetime_exttype; @@ -237,7 +246,7 @@ u_int8_t sadb_x_sa2_mode; u_int8_t sadb_x_sa2_reserved1; u_int16_t sadb_x_sa2_reserved2; - u_int32_t sadb_x_sa2_reserved3; + u_int32_t sadb_x_sa2_sequence; u_int32_t sadb_x_sa2_reqid; }; @@ -367,6 +376,11 @@ /* `flags' in sadb_sa structure holds followings */ #define SADB_X_EXT_NONE 0x0000 /* i.e. new format. */ #define SADB_X_EXT_OLD 0x0001 /* old format. */ +#ifdef __APPLE_API_PRIVATE +#define SADB_X_EXT_NATT 0x0002 /* Use UDP encapsulation to traverse NAT */ +#define SADB_X_EXT_NATT_KEEPALIVE 0x0004 /* Local node is behind NAT, send keepalives */ + /* Should only be set for outbound SAs */ +#endif #define SADB_X_EXT_IV4B 0x0010 /* IV length of 4 bytes in use */ #define SADB_X_EXT_DERIV 0x0020 /* DES derived */ diff -urN xnu-344.49/bsd/net/route.c xnu-517/bsd/net/route.c --- xnu-344.49/bsd/net/route.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/route.c Sat Oct 25 00:25:55 2003 @@ -72,6 +72,8 @@ #include #include +#include + #define SA(p) ((struct sockaddr *)(p)) struct route_cb route_cb; @@ -84,6 +86,10 @@ struct sockaddr *, struct sockaddr *)); static void rtable_init __P((void **)); +__private_extern__ u_long route_generation = 0; +extern int use_routegenid; + + static void rtable_init(table) void **table; @@ -130,6 +136,8 @@ splx(s); } ro->ro_rt = rtalloc1(&ro->ro_dst, 1, ignore); + if (ro->ro_rt) + ro->ro_rt->generation_id = route_generation; } /* @@ -220,11 +228,12 @@ { /* * find the tree for that address family + * Note: in the case of igmp packets, there might not be an rnh */ register struct radix_node_head *rnh = rt_tables[rt_key(rt)->sa_family]; - if (rt == 0 || rnh == 0) + if (rt == 
0) panic("rtfree"); /* @@ -232,7 +241,7 @@ * and there is a close function defined, call the close function */ rt->rt_refcnt--; - if(rnh->rnh_close && rt->rt_refcnt == 0) { + if(rnh && rnh->rnh_close && rt->rt_refcnt == 0) { rnh->rnh_close((struct radix_node *)rt, rnh); } @@ -717,9 +726,7 @@ ifaref(ifa); rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; -#ifdef __APPLE__ - rt->rt_dlt = ifa->ifa_dlt; /* dl_tag */ -#endif + /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr((caddr_t)ndst, (caddr_t)netmask, @@ -956,7 +963,7 @@ int dlen = ROUNDUP(dst->sa_len), glen = ROUNDUP(gate->sa_len); register struct rtentry *rt = rt0; struct radix_node_head *rnh = rt_tables[dst->sa_family]; - + extern void kdp_set_gateway_mac (void *gatewaymac); /* * A host route with the destination equal to the gateway * will interfere with keeping LLINFO in the routing @@ -1035,6 +1042,12 @@ rt->rt_gwroute = 0; return EDQUOT; /* failure */ } + /* Tell the kernel debugger about the new default gateway */ + if ((AF_INET == rt->rt_gateway->sa_family) && + rt->rt_gwroute && rt->rt_gwroute->rt_gateway && + (AF_LINK == rt->rt_gwroute->rt_gateway->sa_family)) { + kdp_set_gateway_mac(((struct sockaddr_dl *)rt0->rt_gwroute->rt_gateway)->sdl_data); + } } /* @@ -1166,6 +1179,8 @@ * notify any listenning routing agents of the change */ rt_newaddrmsg(cmd, ifa, error, nrt); + if (use_routegenid) + route_generation++; if (rt->rt_refcnt <= 0) { rt->rt_refcnt++; /* need a 1->0 transition to free */ rtfree(rt); @@ -1206,9 +1221,6 @@ * we are adding. 
*/ rt->rt_ifp = ifa->ifa_ifp; -#ifdef __APPLE__ - rt->rt_dlt = ifa->ifa_dlt; /* dl_tag */ -#endif rt->rt_rmx.rmx_mtu = ifa->ifa_ifp->if_mtu; /*XXX*/ /* * Now ask the protocol to check if it needs @@ -1221,6 +1233,8 @@ * notify any listenning routing agents of the change */ rt_newaddrmsg(cmd, ifa, error, nrt); + if (use_routegenid) + route_generation++; } return (error); } diff -urN xnu-344.49/bsd/net/route.h xnu-517/bsd/net/route.h --- xnu-344.49/bsd/net/route.h Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/route.h Sat Oct 25 00:25:55 2003 @@ -145,7 +145,7 @@ struct sockaddr *, struct rtentry *)); /* output routine for this (rt,if) */ struct rtentry *rt_parent; /* cloning parent of this route */ - void *rt_filler2; /* more filler */ + u_long generation_id; /* route generation id */ }; #endif /* __APPLE_API_UNSTABLE */ @@ -240,6 +240,9 @@ #define RTM_IFINFO 0xe /* iface going up/down etc. */ #define RTM_NEWMADDR 0xf /* mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* mcast group membership being deleted */ +#ifdef KERNEL_PRIVATE +#define RTM_GET_SILENT 0x11 +#endif /* * Bitmask values for rtm_inits and rmx_locks. 
diff -urN xnu-344.49/bsd/net/rtsock.c xnu-517/bsd/net/rtsock.c --- xnu-344.49/bsd/net/rtsock.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/rtsock.c Sat Oct 25 00:25:55 2003 @@ -149,11 +149,12 @@ */ s = splnet(); so->so_pcb = (caddr_t)rp; - error = raw_usrreqs.pru_attach(so, proto, p); + error = raw_attach(so, proto); /* don't use raw_usrreqs.pru_attach, it checks for SS_PRIV */ rp = sotorawcb(so); if (error) { splx(s); FREE(rp, M_PCB); + so->so_pcb = 0; return error; } switch(rp->rcb_proto.sp_protocol) { @@ -311,6 +312,7 @@ struct ifnet *ifp = 0; struct ifaddr *ifa = 0; struct proc *curproc = current_proc(); + int sendonlytoself = 0; #define senderr(e) { error = e; goto flush;} if (m == 0 || ((m->m_len < sizeof(long)) && @@ -334,6 +336,26 @@ dst = 0; senderr(EPROTONOSUPPORT); } + + /* + * Silent version of RTM_GET for Reachabiltiy APIs. We may change + * all RTM_GETs to be silent in the future, so this is private for now. + */ + if (rtm->rtm_type == RTM_GET_SILENT) { + if ((so->so_options & SO_USELOOPBACK) == 0) + senderr(EINVAL); + sendonlytoself = 1; + rtm->rtm_type = RTM_GET; + } + + /* + * Perform permission checking, only privileged sockets + * may perform operations other than RTM_GET + */ + if (rtm->rtm_type != RTM_GET && (so->so_state & SS_PRIV) == 0) { + dst = 0; + senderr(EPERM); + } rtm->rtm_pid = curproc->p_pid; info.rti_addrs = rtm->rtm_addrs; if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) { @@ -566,15 +588,24 @@ m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); Free(rtm); } - if (rp) - rp->rcb_proto.sp_family = 0; /* Avoid us */ - if (dst) - route_proto.sp_protocol = dst->sa_family; - if (m) - raw_input(m, &route_proto, &route_src, &route_dst); - if (rp) - rp->rcb_proto.sp_family = PF_ROUTE; - } + if (sendonlytoself && m) { + if (sbappendaddr(&so->so_rcv, &route_src, m, (struct mbuf*)0) == 0) { + m_freem(m); + error = ENOBUFS; + } else { + sorwakeup(so); + } + } else { + if (rp) + rp->rcb_proto.sp_family = 0; /* Avoid us */ + if (dst) + 
route_proto.sp_protocol = dst->sa_family; + if (m) + raw_input(m, &route_proto, &route_src, &route_dst); + if (rp) + rp->rcb_proto.sp_family = PF_ROUTE; + } + } return (error); } diff -urN xnu-344.49/bsd/net/zlib.c xnu-517/bsd/net/zlib.c --- xnu-344.49/bsd/net/zlib.c Thu Sep 18 21:01:01 2003 +++ xnu-517/bsd/net/zlib.c Sat Oct 25 00:25:55 2003 @@ -52,7 +52,7 @@ subject to change. Applications should only use zlib.h. */ -/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ +/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ #ifndef _Z_UTIL_H #define _Z_UTIL_H @@ -298,7 +298,7 @@ subject to change. Applications should only use zlib.h. */ -/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ +/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ #ifndef _DEFLATE_H #define _DEFLATE_H @@ -658,7 +658,7 @@ * */ -/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ +/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ /* #include "deflate.h" */ @@ -2000,7 +2000,7 @@ * Addison-Wesley, 1983. ISBN 0-201-06672-6. */ -/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ +/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ /* #define GEN_TREES_H */ @@ -2058,31 +2058,31 @@ #if defined(GEN_TREES_H) || !defined(STDC) /* non ANSI compilers may not accept trees.h */ -local ct_data static_ltree[L_CODES+2]; +local ct_data *static_ltree = Z_NULL; /* The static literal tree. Since the bit lengths are imposed, there is no * need for the L_CODES extra codes used during heap construction. However * The codes 286 and 287 are needed to build a canonical tree (see _tr_init * below). */ -local ct_data static_dtree[D_CODES]; +local ct_data *static_dtree = Z_NULL; /* The static distance tree. (Actually a trivial tree since all codes use * 5 bits.) 
*/ -uch _dist_code[DIST_CODE_LEN]; +uch *_dist_code = Z_NULL; /* Distance codes. The first 256 values correspond to the distances * 3 .. 258, the last 256 values correspond to the top 8 bits of * the 15 bit distances. */ -uch _length_code[MAX_MATCH-MIN_MATCH+1]; +uch *_length_code = Z_NULL; /* length code for each normalized match length (0 == MIN_MATCH) */ -local int base_length[LENGTH_CODES]; +local int *base_length = Z_NULL; /* First normalized length for each code (0 = MIN_MATCH) */ -local int base_dist[D_CODES]; +local int *base_dist = Z_NULL; /* First normalized distance for each code (0 = distance of 1) */ #else @@ -2227,10 +2227,10 @@ }; local static_tree_desc static_l_desc = -{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS}; +{NULL, extra_lbits, LITERALS+1, L_CODES, MAX_BITS}; local static_tree_desc static_d_desc = -{static_dtree, extra_dbits, 0, D_CODES, MAX_BITS}; +{NULL, extra_dbits, 0, D_CODES, MAX_BITS}; local static_tree_desc static_bl_desc = {(const ct_data *)0, extra_blbits, 0, BL_CODES, MAX_BL_BITS}; @@ -2239,7 +2239,7 @@ * Local (static) routines in this file. */ -local void tr_static_init OF((void)); +local int tr_static_init OF((z_streamp z)); local void init_block OF((deflate_state *s)); local void pqdownheap OF((deflate_state *s, ct_data *tree, int k)); local void gen_bitlen OF((deflate_state *s, tree_desc *desc)); @@ -2335,10 +2335,22 @@ #endif /* the arguments must not have side effects */ +typedef struct { + ct_data static_ltree[L_CODES+2]; + ct_data static_dtree[D_CODES]; + uch _dist_code[DIST_CODE_LEN]; + uch _length_code[MAX_MATCH-MIN_MATCH+1]; + int base_length[LENGTH_CODES]; + int base_dist[D_CODES]; +} __used_to_be_static; + +static __used_to_be_static *static_storage = Z_NULL; + /* =========================================================================== * Initialize the various 'constant' tables. 
*/ -local void tr_static_init() +local int tr_static_init( + z_streamp z) { #if defined(GEN_TREES_H) || !defined(STDC) static int static_init_done = 0; @@ -2351,7 +2363,21 @@ /* number of codes at each bit length for an optimal tree */ if (static_init_done) return; - + + /* allocate storage for static structures */ + if (static_storage == Z_NULL) { + static_storage = (__used_to_be_static*)ZALLOC(z, 1, sizeof(__used_to_be_static)); + if (static_storage == Z_NULL) + return Z_MEM_ERROR; + } + + static_ltree = static_storage->static_ltree; + static_dtree = static_storage->static_dtree; + _dist_code = static_storage->_dist_code; + _length_code = static_storage->_length_code; + base_length = static_storage->base_length; + base_dist = static_storage->base_dist; + /* For some embedded targets, global variables are not initialized: */ static_l_desc.static_tree = static_ltree; static_l_desc.extra_bits = extra_lbits; @@ -2485,7 +2511,7 @@ void _tr_init(s) deflate_state *s; { - tr_static_init(); + tr_static_init(s->strm); s->l_desc.dyn_tree = s->dyn_ltree; s->l_desc.stat_desc = &static_l_desc; @@ -4731,7 +4757,7 @@ #ifdef BUILDFIXED local int fixed_built = 0; #define FIXEDH 544 /* number of hufts used by fixed tables */ -local inflate_huft fixed_mem[FIXEDH]; +local inflate_huft *fixed_mem = NULL; local uInt fixed_bl; local uInt fixed_bd; local inflate_huft *fixed_tl; @@ -4917,6 +4943,13 @@ ZFREE(z, c); return Z_MEM_ERROR; } + + if ((fixed_mem = (inflate_huft*)ZALLOC(z, FIXEDH, sizeof(inflate_huft))) == Z_NULL) + { + ZFREE(z, c); + ZFREE(z, v); + return Z_MEM_ERROR; + } /* literal table */ for (k = 0; k < 144; k++) @@ -5511,7 +5544,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ +/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ /* #include "zutil.h" */ @@ -5741,7 +5774,7 @@ * For conditions of distribution and use, see copyright notice 
in zlib.h */ -/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ +/* @(#) $Id: xnu-344.49-to-xnu-517.diff,v 1.1 2003/12/31 01:21:51 fkr Exp $ */ /* #include "zlib.h" */ diff -urN xnu-344.49/bsd/netat/adsp_Close.c xnu-517/bsd/netat/adsp_Close.c --- xnu-344.49/bsd/netat/adsp_Close.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/adsp_Close.c Sat Oct 25 00:25:55 2003 @@ -176,7 +176,7 @@ /* * Unlink CCB from list */ - qRemove(AT_ADSP_STREAMS, sp); /* remove sp from active streams queue */ + qRemove((CCB *)AT_ADSP_STREAMS, sp); /* remove sp from active streams queue */ if (pb) { pb->ioResult = 0; diff -urN xnu-344.49/bsd/netat/adsp_RxData.c xnu-517/bsd/netat/adsp_RxData.c --- xnu-344.49/bsd/netat/adsp_RxData.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/adsp_RxData.c Sat Oct 25 00:25:55 2003 @@ -338,7 +338,7 @@ { sp->rData = 1; /* Not empty any more */ - if ((sp->rpb)->ioc == mp) { + if ((sp->rpb)->ioc == (caddr_t)mp) { dPrintf(D_M_ADSP, D_L_TRACE, ("RXData: (pb->ioc == mp) no stored data\n")); KERNEL_DEBUG(DBG_ADSP_RCV, 4, sp, sp->rpb, 0, 0); diff -urN xnu-344.49/bsd/netat/adsp_Timer.c xnu-517/bsd/netat/adsp_Timer.c --- xnu-344.49/bsd/netat/adsp_Timer.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/adsp_Timer.c Sat Oct 25 00:25:55 2003 @@ -55,6 +55,8 @@ #include #include +void TimerTick(); + /* * TrashSession * @@ -178,7 +180,7 @@ CheckSend(sp); } -void TimerTick_funnel() +void TimerTick_funnel(void *arg) { thread_funnel_set(network_flock, TRUE); TimerTick(); diff -urN xnu-344.49/bsd/netat/asp_proto.c xnu-517/bsd/netat/asp_proto.c --- xnu-344.49/bsd/netat/asp_proto.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/asp_proto.c Sat Oct 25 00:25:55 2003 @@ -84,11 +84,10 @@ void asp_ack_reply(); void asp_nak_reply(); void asp_clock(); -void asp_clock_funnel(); +void asp_clock_funnel(void *); int asp_open(); int asp_close(); int asp_wput(); -void atp_retry_req(); StaticProc asp_scb_t *asp_find_scb(); StaticProc asp_scb_t *asp_scb_alloc(); @@ 
-101,7 +100,7 @@ StaticProc void asp_untimout(); StaticProc void asp_hangup(); StaticProc void asp_send_tickle(); -StaticProc void asp_send_tickle_funnel(); +StaticProc void asp_send_tickle_funnel(void *); StaticProc void asp_accept(); StaticProc int asp_send_req(); @@ -374,7 +373,7 @@ dPrintf(D_M_ASP, D_L_TRACE, (" %s: %s\n", str, mbuf_totals())); } -#endif AT_MBUF_TRACE +#endif /* AT_MBUF_TRACE */ /* * the write routine @@ -662,7 +661,7 @@ { struct atp_state *atp = (struct atp_state *)gref->info; if (atp->dflag) - atp = atp->atp_msgq; + atp = (struct atp_state *)atp->atp_msgq; if (gbuf_cont(mioc) == 0) { asp_iocnak(gref, mioc, EINVAL); @@ -782,10 +781,10 @@ */ StaticProc void asp_send_tickle_funnel(scb) - asp_scb_t *scb; + void *scb; { thread_funnel_set(network_flock, TRUE); - asp_send_tickle(scb); + asp_send_tickle((asp_scb_t *)scb); thread_funnel_set(network_flock, FALSE); } @@ -1915,8 +1914,8 @@ int ASPputmsg(gref_t *gref, strbuf_t *ctlptr, strbuf_t *datptr, gbuf_t *mreq, int flags, int *errp) { - int s, i, err, len; - gbuf_t *mioc, *mdata, *mx; + int s, i, err, len, offset, remain, size, copy_len; + gbuf_t *mioc, *mdata, *mx, *m0; ioc_t *iocbp; strbuf_t ctlbuf; strbuf_t datbuf; @@ -1930,6 +1929,7 @@ asp_word_t *awp; union asp_primitives *primitives; unsigned short tid; + caddr_t dataptr; if ((scb = (asp_scb_t *)gref->info) == 0) { dPrintf(D_M_ASP, D_L_ERROR, @@ -1991,46 +1991,77 @@ ("ASPputmsg: %s\n", aspCmdStr(Primitive))); /* - * allocate buffer and copy in the data content + * copy in the data content into multiple mbuf clusters if + * required. ATP now expects reply data to be placed in + * standard clusters, not the large external clusters that + * were used previously. */ - len = (Primitive == ASPFUNC_CmdReply) ? 0 : aspCMDsize; + + /* set offset for use by some commands */ + offset = (Primitive == ASPFUNC_CmdReply) ? 
0 : aspCMDsize; + size = 0; + if (mreq != NULL) { + /* The data from the in-kernel call for use by AFP is passed + * in as one large external cluster. This needs to be copied + * to a chain of standard clusters. + */ + remain = gbuf_len(mreq); + dataptr = mtod(mreq, caddr_t); + } else { + /* copyin from user space */ + remain = datbuf.len; + dataptr = (caddr_t)datbuf.buf; + } - if (!(mdata = gbuf_alloc_wait(datbuf.len+len, TRUE))) { + /* allocate first buffer */ + if (!(mdata = gbuf_alloc_wait((remain + offset > MCLBYTES ? MCLBYTES : remain + offset), TRUE))) { /* error return should not be possible */ err = ENOBUFS; gbuf_freem(mioc); goto l_err; } - gbuf_wset(mdata, (datbuf.len+len)); + gbuf_wset(mdata, 0); /* init length to zero */ gbuf_cont(mioc) = mdata; - - if (mreq != NULL) { - /* being called from kernel space */ - gbuf_t *tmp = mreq; - unsigned long offset = 0; - - /* copy afp cmd data from the passed in mbufs to mdata. I cant - chain mreq to mdata since the rest of this code assumes - just one big mbuf with space in front for the BDS */ - offset = len; - while (tmp != NULL) { - bcopy (gbuf_rptr(tmp), (gbuf_rptr(mdata) + offset), gbuf_len(tmp)); - offset += gbuf_len(tmp); - tmp = gbuf_cont(tmp); /* on to next mbuf in chain */ - } - - /* all data copied out of mreq so free it */ - gbuf_freem(mreq); - } else { - /* being called from user space */ - if ((err = copyin((caddr_t)datbuf.buf, - (caddr_t)(gbuf_rptr(mdata)+len), datbuf.len)) != 0) { - gbuf_freem(mioc); - goto l_err; - } - } - switch (Primitive) { + while (remain) { + if (remain + offset > MCLBYTES) + copy_len = MCLBYTES - offset; + else + copy_len = remain; + remain -= copy_len; + if (mreq != NULL) + bcopy (dataptr, (gbuf_rptr(mdata) + offset), copy_len); + else if ((err = copyin(dataptr, (caddr_t)(gbuf_rptr(mdata) + offset), copy_len)) != 0) { + gbuf_freem(mioc); + goto l_err; + } + gbuf_wset(mdata, (copy_len + offset)); + size += copy_len + offset; + dataptr += copy_len; + offset = 0; + if (remain) 
{ + /* allocate the next mbuf */ + if ((gbuf_cont(mdata) = m_get((M_WAIT), MSG_DATA)) == 0) { + err = ENOBUFS; + gbuf_freem(mioc); + goto l_err; + } + mdata = gbuf_cont(mdata); + MCLGET(mdata, M_WAIT); + if (!(mdata->m_flags & M_EXT)) { + err = ENOBUFS; + gbuf_freem(mioc); + goto l_err; + } + } + } + mdata = gbuf_cont(mioc); /* code further on down expects this to b e set */ + mdata->m_pkthdr.len = size; /* set packet hdr len */ + + if (mreq != 0) + gbuf_freem(mreq); + + switch (Primitive) { case ASPFUNC_Command: case ASPFUNC_Write: @@ -2147,16 +2178,20 @@ atp->xo = 1; atp->xo_relt = 1; } + /* setup the atpBDS struct - only the length field is used, + * except for the first one which contains the bds count in + * bdsDataSz. + */ atpBDS = (struct atpBDS *)gbuf_wptr(mioc); msize = mdata ? gbuf_msgsize(mdata) : 0; - for (nbds=0; (nbds < ATP_TRESP_MAX) && (msize > 0); nbds++) { + for (nbds=0; (nbds < ATP_TRESP_MAX) && (msize > 0); nbds++) { len = msize < ATP_DATA_SIZE ? msize : ATP_DATA_SIZE; msize -= ATP_DATA_SIZE; *(long *)atpBDS[nbds].bdsUserData = 0; UAL_ASSIGN(atpBDS[nbds].bdsBuffAddr, 1); UAS_ASSIGN(atpBDS[nbds].bdsBuffSz, len); } - UAS_ASSIGN(atpBDS[0].bdsDataSz, nbds); + UAS_ASSIGN(atpBDS[0].bdsDataSz, nbds); *(long *)atpBDS[0].bdsUserData = (long)result; *(long *)atp->user_bytes = (long)result; gbuf_winc(mioc,atpBDSsize); diff -urN xnu-344.49/bsd/netat/at_aarp.h xnu-517/bsd/netat/at_aarp.h --- xnu-344.49/bsd/netat/at_aarp.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/at_aarp.h Sat Oct 25 00:25:55 2003 @@ -109,7 +109,7 @@ gbuf_t *m; /* ptr to msg blk to be sent out */ at_ifaddr_t *elapp; int error; - void *tmo; + int tmo; } aarp_amt_t; #define AMT_BSIZ 4 /* bucket size */ diff -urN xnu-344.49/bsd/netat/at_snmp.h xnu-517/bsd/netat/at_snmp.h --- xnu-344.49/bsd/netat/at_snmp.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/at_snmp.h Sat Oct 25 00:25:55 2003 @@ -215,4 +215,4 @@ #define SNMP_TYPE(var,type) ((var & SNMP_OBJ_TYPE_MASK) == type) -#endif 
_NETAT_AT_SNMP_H_ +#endif /* _NETAT_AT_SNMP_H_ */ diff -urN xnu-344.49/bsd/netat/at_var.h xnu-517/bsd/netat/at_var.h --- xnu-344.49/bsd/netat/at_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/at_var.h Sat Oct 25 00:25:55 2003 @@ -305,4 +305,7 @@ } node_data; }; +void atalk_post_msg(struct ifnet *ifp, u_long event_code, struct at_addr *address, at_nvestr_t *zone); +void aarp_sched_probe(void *); + #endif /* __APPLE_API_PRIVATE */ diff -urN xnu-344.49/bsd/netat/atp.h xnu-517/bsd/netat/atp.h --- xnu-344.49/bsd/netat/atp.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/atp.h Sat Oct 25 00:25:55 2003 @@ -434,7 +434,7 @@ void atp_send_rsp(gref_t *, gbuf_t *, int); void atp_wput(gref_t *, gbuf_t *); void atp_rput(gref_t *, gbuf_t *); -void atp_retry_req(gbuf_t *); +void atp_retry_req(void *); void atp_stop(gbuf_t *, int); void atp_cancel_req(gref_t *, unsigned short); int atp_open(gref_t *, int); diff -urN xnu-344.49/bsd/netat/atp_read.c xnu-517/bsd/netat/atp_read.c --- xnu-344.49/bsd/netat/atp_read.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/atp_read.c Sat Oct 25 00:25:55 2003 @@ -53,7 +53,7 @@ static void atp_trans_complete(); void atp_x_done(); -void atp_x_done_funnel(); +void atp_x_done_funnel(void *); extern void atp_req_timeout(); /* @@ -61,9 +61,9 @@ * Version 1.7 of atp_read.c on 89/02/09 17:53:16 */ -void atp_treq_event(gref) -register gref_t *gref; +void atp_treq_event(void *arg) { + register gref_t *gref = (gref_t *)arg; register gbuf_t *m; register struct atp_state *atp; boolean_t funnel_state; @@ -459,10 +459,10 @@ void atp_x_done_funnel(trp) -register struct atp_trans *trp; +void *trp; { thread_funnel_set(network_flock, TRUE); - atp_x_done(trp); + atp_x_done((struct atp_trans *)trp); (void) thread_funnel_set(network_flock, FALSE); } diff -urN xnu-344.49/bsd/netat/atp_write.c xnu-517/bsd/netat/atp_write.c --- xnu-344.49/bsd/netat/atp_write.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/atp_write.c Sat Oct 25 00:25:55 2003 @@ -70,7 +70,7 @@ static 
void atp_pack_bdsp(struct atp_trans *, struct atpBDS *); static int atp_unpack_bdsp(struct atp_state *, gbuf_t *, struct atp_rcb *, int, int); -void atp_retry_req(), atp_trp_clock(), asp_clock(), asp_clock_funnel(), atp_trp_clock_funnel();; +void atp_trp_clock(), asp_clock(), asp_clock_funnel(), atp_trp_clock_funnel();; extern struct atp_rcb_qhead atp_need_rel; extern int atp_inited; @@ -455,11 +455,11 @@ register struct atp_rcb *rcbp; { register gbuf_t *m; register int i, len; - int s_gen, s, cnt; + int s_gen, s, cnt, err, offset, space; unsigned char *m0_rptr = NULL, *m0_wptr = NULL; register at_atp_t *athp; register struct atpBDS *bdsp; - register gbuf_t *m2, *m1, *m0, *m3; + register gbuf_t *m2, *m1, *m0, *mhdr; caddr_t lastPage; gbuf_t *mprev, *mlist = 0; at_socket src_socket = (at_socket)atp->atp_socket_no; @@ -497,109 +497,67 @@ m = rcbp->rc_xmt; m0 = gbuf_cont(m); - if (m0) { - m0_rptr = gbuf_rptr(m0); - m0_wptr = gbuf_wptr(m0); - } if (gbuf_len(m) > TOTAL_ATP_HDR_SIZE) bdsp = (struct atpBDS *)(AT_ATP_HDR(m)->data); else bdsp = 0; - + offset = 0; + if (m0) + space = gbuf_msgsize(m0); for (i = 0; i < cnt; i++) { - if (rcbp->rc_snd[i] == 0) { - if ((len = UAS_VALUE(bdsp->bdsBuffSz))) - gbuf_rinc(m0,len); - - } else { - m2 = rc_xmt[i]; - gbuf_rinc(m2,AT_WR_OFFSET); - gbuf_wset(m2,TOTAL_ATP_HDR_SIZE); - *(struct ddp_atp *)(gbuf_rptr(m2))= *(struct ddp_atp *)(gbuf_rptr(m)); - athp = AT_ATP_HDR(m2); - ATP_CLEAR_CONTROL(athp); - athp->cmd = ATP_CMD_TRESP; - athp->bitmap = i; - if (i == (cnt - 1)) - athp->eom = 1; /* for the last fragment */ - if (bdsp) - UAL_UAL(athp->user_bytes, bdsp->bdsUserData); - - if (bdsp) - if (len = UAS_VALUE(bdsp->bdsBuffSz)) { /* copy in data */ - if (m0 && gbuf_len(m0)) { - if ((m1 = gbuf_dupb(m0)) == NULL) { - for (i = 0; i < cnt; i++) - if (rc_xmt[i]) - gbuf_freem(rc_xmt[i]); - gbuf_rptr(m0) = m0_rptr; - gbuf_wset(m0,(m0_wptr-m0_rptr)); - goto nothing_to_send; - } - gbuf_wset(m1,len); - gbuf_rinc(m0,len); - if ((len = gbuf_len(m0)) < 
0) { - gbuf_rdec(m0,len); - gbuf_wdec(m1,len); - if (!append_copy((struct mbuf *)m1, - (struct mbuf *)gbuf_cont(m0), FALSE)) { - for (i = 0; i < cnt; i++) - if (rc_xmt[i]) - gbuf_freem(rc_xmt[i]); - gbuf_rptr(m0) = m0_rptr; - gbuf_wset(m0,(m0_wptr-m0_rptr)); - goto nothing_to_send; + if (rcbp->rc_snd[i] == 0) { + if ((len = UAS_VALUE(bdsp->bdsBuffSz))) { + offset += len; + space -= len; + } + } else { + mhdr = rc_xmt[i]; + /* setup header fields */ + gbuf_rinc(mhdr,AT_WR_OFFSET); + gbuf_wset(mhdr,TOTAL_ATP_HDR_SIZE); + *(struct ddp_atp *)(gbuf_rptr(mhdr))= *(struct ddp_atp *)(gbuf_rptr(m)); + athp = AT_ATP_HDR(mhdr); + ATP_CLEAR_CONTROL(athp); + athp->cmd = ATP_CMD_TRESP; + athp->bitmap = i; + if (i == (cnt - 1)) + athp->eom = 1; /* for the last fragment */ + if (bdsp) { + UAL_UAL(athp->user_bytes, bdsp->bdsUserData); + if ((len = UAS_VALUE(bdsp->bdsBuffSz)) && m0 != 0 && space > 0) { + if ((m1 = m_copym(m0, offset, len, M_DONTWAIT)) == 0) { + for (i = 0; i < cnt; i++) + if (rc_xmt[i]) + gbuf_freem(rc_xmt[i]); + goto nothing_to_send; + } + offset += len; + space -= len; + gbuf_cont(mhdr) = m1; } - } else - gbuf_cont(m1) = 0; - gbuf_cont(m2) = m1; + } - /* temp fix for page boundary problem - bug# 2703163 */ - lastPage = (caddr_t)((int)(gbuf_wptr(m1) - 1) & ~PAGE_MASK); /* 4k page of last byte */ - if (lastPage != (caddr_t)((int)(gbuf_rptr(m1)) & ~PAGE_MASK)) { /* 1st byte and last on same page ? 
*/ - if ((m3 = gbuf_dupb(m1)) == NULL) { - for (i = 0; i < cnt; i++) - if (rc_xmt[i]) - gbuf_freem(rc_xmt[i]); - (gbuf_rptr(m0)) = m0_rptr; - gbuf_wset(m0, (m0_wptr - m0_rptr)); - goto nothing_to_send; - } - (gbuf_rptr(m3)) = lastPage; /* new mbuf starts at beginning of page */ - gbuf_wset(m3, (gbuf_wptr(m1) - lastPage)); /* len = remaining data crossing over page boundary */ - gbuf_wset(m1, (lastPage - (gbuf_rptr(m1)))); /* adjust len of m1 */ - (gbuf_cont(m1)) = m3; - (gbuf_cont(m3)) = 0; - } - } - } - - AT_DDP_HDR(m2)->src_socket = src_socket; - dPrintf(D_M_ATP_LOW, D_L_OUTPUT, - ("atp_send_replies: %d, socket=%d, size=%d\n", - i, atp->atp_socket_no, gbuf_msgsize(gbuf_cont(m2)))); - - if (mlist) - gbuf_next(mprev) = m2; - else - mlist = m2; - mprev = m2; - - rcbp->rc_snd[i] = 0; - rcbp->rc_not_sent_bitmap &= ~atp_mask[i]; - if (rcbp->rc_not_sent_bitmap == 0) - break; - } - /* - * on to the next frag - */ - bdsp++; - } - if (m0) { - gbuf_rptr(m0) = m0_rptr; - gbuf_wset(m0,(m0_wptr-m0_rptr)); + AT_DDP_HDR(mhdr)->src_socket = src_socket; + dPrintf(D_M_ATP_LOW, D_L_OUTPUT, + ("atp_send_replies: %d, socket=%d, size=%d\n", + i, atp->atp_socket_no, gbuf_msgsize(gbuf_cont(m2)))); + + if (mlist) + gbuf_next(mprev) = mhdr; + else + mlist = mhdr; + mprev = mhdr; + + rcbp->rc_snd[i] = 0; + rcbp->rc_not_sent_bitmap &= ~atp_mask[i]; + if (rcbp->rc_not_sent_bitmap == 0) + break; + } + /* + * on to the next frag + */ + bdsp++; } - if (mlist) { ATENABLE(s, atp->atp_lock); DDP_OUTPUT(mlist); @@ -706,6 +664,11 @@ } /* atp_pack_bdsp */ +/* create an mbuf chain with mbuf packet headers for each ATP response packet + * to be sent. m contains the DDP hdr, ATP hdr, and and array of atpBDS structs. + * chained to m is an mbuf that contians the actual data pointed to by the atpBDS + * structs. 
+ */ static int atp_unpack_bdsp(atp, m, rcbp, cnt, wait) struct atp_state *atp; @@ -714,17 +677,19 @@ register int cnt, wait; { register struct atpBDS *bdsp; - register gbuf_t *m2, *m1, *m0, *m3; - caddr_t lastPage; - register at_atp_t *athp; - register int i, len, s_gen; - at_socket src_socket; - struct ddp_atp { + register gbuf_t *m2, *m1, *m0, *mhdr; + caddr_t lastPage; + at_atp_t *athp; + int i, len, s_gen; + at_socket src_socket; + + struct ddp_atp { char ddp_atp_hdr[TOTAL_ATP_HDR_SIZE]; }; - gbuf_t *mprev, *mlist = 0; - gbuf_t *rc_xmt[ATP_TRESP_MAX]; - unsigned char *m0_rptr, *m0_wptr; + gbuf_t *mprev, *mlist = 0; + gbuf_t *rc_xmt[ATP_TRESP_MAX]; + unsigned char *m0_rptr, *m0_wptr; + int err, offset, space; /* * get the user data structure pointer @@ -790,101 +755,70 @@ goto l_send; } + /* create an array of mbuf packet headers for the packets to be sent + * to contain the atp and ddp headers with room at the front for the + * datalink header. + */ for (i = 0; i < cnt; i++) { /* all hdrs, packet data and dst addr storage */ if ((rc_xmt[i] = - gbuf_alloc_wait(AT_WR_OFFSET+TOTAL_ATP_HDR_SIZE, - wait)) == NULL) { - for (cnt = 0; cnt < i; cnt++) - if (rc_xmt[cnt]) - gbuf_freeb(rc_xmt[cnt]); - return 0; + gbuf_alloc_wait(AT_WR_OFFSET+TOTAL_ATP_HDR_SIZE, wait)) == NULL) { + for (cnt = 0; cnt < i; cnt++) + if (rc_xmt[cnt]) + gbuf_freeb(rc_xmt[cnt]); + return 0; } } - if (m0) { - m0_rptr = gbuf_rptr(m0); - m0_wptr = gbuf_wptr(m0); - } - for (i = 0; i < cnt; i++) { - m2 = rc_xmt[i]; - gbuf_rinc(m2,AT_WR_OFFSET); - gbuf_wset(m2,TOTAL_ATP_HDR_SIZE); - *(struct ddp_atp *)(gbuf_rptr(m2))= *(struct ddp_atp *)(gbuf_rptr(m)); - athp = AT_ATP_HDR(m2); + /* run through the atpBDS structs and create an mbuf for the data + * portion of each packet to be sent. these get chained to the mbufs + * containing the ATP and DDP headers. this code assumes that no ATP + * packet is contained in more than 2 mbufs (e.i crosses mbuf boundary + * no more than one time). 
+ */ + offset = 0; + if (m0) + space = gbuf_msgsize(m0); + for (i = 0; i < cnt; i++) { /* for each hdr mbuf */ + mhdr = rc_xmt[i]; + /* setup header fields */ + gbuf_rinc(mhdr,AT_WR_OFFSET); + gbuf_wset(mhdr,TOTAL_ATP_HDR_SIZE); + *(struct ddp_atp *)(gbuf_rptr(mhdr))= *(struct ddp_atp *)(gbuf_rptr(m)); + athp = AT_ATP_HDR(mhdr); ATP_CLEAR_CONTROL(athp); athp->cmd = ATP_CMD_TRESP; athp->bitmap = i; if (i == (cnt - 1)) athp->eom = 1; /* for the last fragment */ UAL_UAL(athp->user_bytes, bdsp->bdsUserData); - - if ((len = UAS_VALUE(bdsp->bdsBuffSz))) { /* copy in data */ - if (m0 && gbuf_len(m0)) { - if ((m1 = gbuf_dupb_wait(m0, wait)) == NULL) { + + if ((len = UAS_VALUE(bdsp->bdsBuffSz)) != 0 && m0 != 0 && space > 0) { + if ((m1 = m_copym(m0, offset, len, wait)) == 0) { for (i = 0; i < cnt; i++) if (rc_xmt[i]) gbuf_freem(rc_xmt[i]); - gbuf_rptr(m0) = m0_rptr; - gbuf_wset(m0,(m0_wptr-m0_rptr)); return 0; } - gbuf_wset(m1,len); /* *** m1 is first len bytes of m0? *** */ - gbuf_rinc(m0,len); - if ((len = gbuf_len(m0)) < 0) { - gbuf_rdec(m0,len); - gbuf_wdec(m1,len); - if (!append_copy((struct mbuf *)m1, - (struct mbuf *)gbuf_cont(m0), wait)) { - for (i = 0; i < cnt; i++) - if (rc_xmt[i]) - gbuf_freem(rc_xmt[i]); - gbuf_rptr(m0) = m0_rptr; - gbuf_wset(m0,(m0_wptr-m0_rptr)); - return 0; - } - } else - gbuf_cont(m1) = 0; - gbuf_cont(m2) = m1; - - /* temp fix for page boundary problem - bug# 2703163 */ - lastPage = (caddr_t)((int)(gbuf_wptr(m1) - 1) & ~PAGE_MASK); /* 4k page of last byte */ - if (lastPage != (caddr_t)((int)(gbuf_rptr(m1)) & ~PAGE_MASK)) { /* 1st byte and last on same page ? 
*/ - if ((m3 = gbuf_dupb_wait(m1, wait)) == NULL) { - for (i = 0; i < cnt; i++) - if (rc_xmt[i]) - gbuf_freem(rc_xmt[i]); - (gbuf_rptr(m0)) = m0_rptr; - gbuf_wset(m0, (m0_wptr - m0_rptr)); - return 0; - } - (gbuf_rptr(m3)) = lastPage; /* new mbuf starts at beginning of page */ - gbuf_wset(m3, (gbuf_wptr(m1) - lastPage)); /* len = remaining data crossing over page boundary */ - gbuf_wset(m1, (lastPage - (gbuf_rptr(m1)))); /* adjust len of m1 */ - (gbuf_cont(m1)) = m3; - (gbuf_cont(m3)) = 0; - } - } + gbuf_cont(mhdr) = m1; + space -= len; + offset += len; } - - AT_DDP_HDR(m2)->src_socket = src_socket; + + AT_DDP_HDR(mhdr)->src_socket = src_socket; dPrintf(D_M_ATP_LOW,D_L_INFO, ("atp_unpack_bdsp %d, socket=%d, size=%d, cnt=%d\n", - i,atp->atp_socket_no,gbuf_msgsize(gbuf_cont(m2)),cnt)); + i,atp->atp_socket_no,gbuf_msgsize(gbuf_cont(mhdr)),cnt)); if (mlist) - gbuf_next(mprev) = m2; + gbuf_next(mprev) = mhdr; else - mlist = m2; - mprev = m2; + mlist = mhdr; + mprev = mhdr; /* * on to the next frag */ bdsp++; } - if (m0) { - gbuf_rptr(m0) = m0_rptr; - gbuf_wset(m0,(m0_wptr-m0_rptr)); - } /* * send the message */ @@ -901,6 +835,7 @@ DDP_OUTPUT(mlist); return 0; + } /* atp_unpack_bdsp */ #define ATP_SOCKET_LAST (DDP_SOCKET_LAST-6) @@ -1325,9 +1260,10 @@ } } /* atp_send_req */ -void atp_retry_req(m) - gbuf_t *m; +void atp_retry_req(arg) + void *arg; { + gbuf_t *m = (gbuf_t *)arg; gref_t *gref; boolean_t funnel_state; @@ -1671,7 +1607,7 @@ /* * copy out the recv data */ - atp_pack_bdsp(trp, bds); + atp_pack_bdsp(trp, (struct atpBDS *)bds); /* * copyout the result info @@ -1683,6 +1619,14 @@ return (int)tid; } /* _ATPsndreq */ + +/* entry point for ATP send response. respbuf contains a DDP hdr, + * ATP hdr, and atpBDS array. The bdsDataSz field of the first atpBDS + * struct contains the number of atpBDS structs in the array. 
resplen + * contains the len of the data in respbuf and datalen contains the + * len of the data buffer holding the response packets which the atpBDS + * struct entries point to. + */ int _ATPsndrsp(fd, respbuff, resplen, datalen, err, proc) int fd; @@ -1692,15 +1636,18 @@ int *err; void *proc; { - gref_t *gref; - int s, rc; - long bufaddr; - gbuf_t *m, *mdata; - register short len; - register int size; - register struct atp_state *atp; - register struct atpBDS *bdsp; - register char *buf; + gref_t *gref; + int s, rc; + long bufaddr; + gbuf_t *m, *mdata; + short space; + int size; + struct atp_state *atp; + struct atpBDS *bdsp; + u_int16_t *bufsz; + char *buf; + int bds_cnt, count, len; + caddr_t dataptr; if ((*err = atalk_getref(0, fd, &gref, proc)) != 0) return -1; @@ -1728,33 +1675,68 @@ gbuf_wset(m,resplen); ((at_ddp_t *)gbuf_rptr(m))->src_node = 0; bdsp = (struct atpBDS *)(gbuf_rptr(m) + TOTAL_ATP_HDR_SIZE); - if ((resplen == TOTAL_ATP_HDR_SIZE) || ((len = UAS_VALUE(bdsp->bdsDataSz)) == 1)) - len = 0; - else - len = 16 * sizeof(gbuf_t); /* - * allocate buffer and copy in the response data - */ - if ((mdata = gbuf_alloc_wait(datalen+len, TRUE)) == 0) { - gbuf_freem(m); + * allocate buffers and copy in the response data. + * note that only the size field of the atpBDS field + * is used internally in the kernel. + */ + bds_cnt = get_bds_entries(m); /* count of # entries */ + /* check correctness of parameters */ + if (bds_cnt > ATP_TRESP_MAX) { + gbuf_freem(m); + *err = EINVAL; + return -1; + } + + for (size = 0, count = 0; count < bds_cnt; count++) { + size += UAS_VALUE(bdsp[count].bdsBuffSz); + } + if (size > datalen) { + gbuf_freem(m); + *err = EINVAL; + return -1; + } + + /* get the first mbuf */ + if ((mdata = gbuf_alloc_wait((space = (size > MCLBYTES ? 
MCLBYTES : size)), TRUE)) == 0) { + gbuf_freem(m); *err = ENOMEM; return -1; } gbuf_cont(m) = mdata; - for (size=0; bdsp < (struct atpBDS *)gbuf_wptr(m); bdsp++) { - if ((bufaddr = UAL_VALUE(bdsp->bdsBuffAddr)) != 0) { - len = UAS_VALUE(bdsp->bdsBuffSz); - buf = (char *)bufaddr; - if ((*err = copyin((caddr_t)buf, - (caddr_t)&gbuf_rptr(mdata)[size], len)) != 0) { + dataptr = mtod(mdata, caddr_t); + for (count = 0; count < bds_cnt; bdsp++, count++) { + if ((bufaddr = UAL_VALUE(bdsp->bdsBuffAddr)) != 0 && + (len = UAS_VALUE(bdsp->bdsBuffSz)) != 0) { + if (len > space) { /* enough room ? */ + gbuf_wset(mdata, dataptr - mtod(mdata, caddr_t)); /* set len of last mbuf */ + /* allocate the next mbuf */ + if ((gbuf_cont(mdata) = m_get((M_WAIT), MSG_DATA)) == 0) { + gbuf_freem(m); + *err = ENOMEM; + return -1; + } + mdata = gbuf_cont(mdata); + MCLGET(mdata, M_WAIT); + if (!(mdata->m_flags & M_EXT)) { + m_freem(m); + return(NULL); + } + dataptr = mtod(mdata, caddr_t); + space = MCLBYTES; + } + /* do the copyin */ + if ((*err = copyin((caddr_t)bufaddr, dataptr, len)) != 0) { gbuf_freem(m); return -1; } - size += len; + dataptr += len; + space -= len; } } - gbuf_wset(mdata,size); + gbuf_wset(mdata, dataptr - mtod(mdata, caddr_t)); /* set len of last mbuf */ + gbuf_cont(m)->m_pkthdr.len = size; /* set packet hdr len */ atp_send_rsp(gref, m, TRUE); return 0; @@ -1861,7 +1843,7 @@ if ((*err = copyin((caddr_t)bdsp, (caddr_t)bds, sizeof(bds))) != 0) return -1; - atp_pack_bdsp(trp, bds); + atp_pack_bdsp(trp, (struct atpBDS *)bds); tid = (int)trp->tr_tid; atp_free(trp); copyout((caddr_t)bds, (caddr_t)bdsp, sizeof(bds)); diff -urN xnu-344.49/bsd/netat/aurp_aurpd.c xnu-517/bsd/netat/aurp_aurpd.c --- xnu-344.49/bsd/netat/aurp_aurpd.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/aurp_aurpd.c Sat Oct 25 00:25:55 2003 @@ -262,7 +262,7 @@ ("AURPgetmsg: soreceive returned %d, aurp_global.event==0x%x\n", *err, events)); /* soreceive() sets *mp to zero! 
at start */ if (p_mbuf) - ip_to_atalk(from, p_mbuf); + ip_to_atalk((struct sockaddr_in *)from, p_mbuf); if (*err || (p_mbuf == NULL)) { /* * An error occurred in soreceive(), diff -urN xnu-344.49/bsd/netat/aurp_ri.c xnu-517/bsd/netat/aurp_ri.c --- xnu-344.49/bsd/netat/aurp_ri.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/aurp_ri.c Sat Oct 25 00:25:55 2003 @@ -53,6 +53,9 @@ #include #include + +static void AURPsndRIRsp(aurp_state_t *); + /* */ void AURPsndRIAck(state, m, flags) aurp_state_t *state; diff -urN xnu-344.49/bsd/netat/ddp_aarp.c xnu-517/bsd/netat/ddp_aarp.c --- xnu-344.49/bsd/netat/ddp_aarp.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/ddp_aarp.c Sat Oct 25 00:25:55 2003 @@ -81,7 +81,6 @@ int aarp_init1(), aarp_init2(); int aarp_send_data(); -int aarp_sched_probe(); StaticProc int aarp_req_cmd_in(); StaticProc int aarp_resp_cmd_in(); @@ -93,7 +92,7 @@ StaticProc int aarp_glean_info(); StaticProc int aarp_delete_amt_info(); StaticProc void aarp_build_pkt(); -StaticProc int aarp_sched_req(); +StaticProc void aarp_sched_req(void *); StaticProc int aarp_get_rand_node(); StaticProc int aarp_get_next_node(); StaticProc int aarp_get_rand_net(); @@ -767,13 +766,14 @@ * ****************************************************************************/ -int aarp_sched_probe() +void aarp_sched_probe(void *arg) { boolean_t funnel_state; funnel_state = thread_funnel_set(network_flock, TRUE); - if (probe_cb.no_of_retries != AARP_MAX_PROBE_RETRIES) { + if (probe_cb.elapp->aa_ifp != 0 && + probe_cb.no_of_retries != AARP_MAX_PROBE_RETRIES) { if (aarp_send_probe() == -1) AARPwakeup(&probe_cb); } else { @@ -782,7 +782,6 @@ } (void) thread_funnel_set(network_flock, FALSE); - return(0); } @@ -810,11 +809,12 @@ * ****************************************************************************/ -StaticProc int aarp_sched_req(amt_ptr) - register aarp_amt_t *amt_ptr; +StaticProc void aarp_sched_req(arg) + void *arg; { int s, i; boolean_t funnel_state; + aarp_amt_t *amt_ptr = 
(aarp_amt_t *)arg; funnel_state = thread_funnel_set(network_flock, TRUE); @@ -824,7 +824,8 @@ * into one of the amt arrays. */ for (i = 0; i < IF_TOTAL_MAX; i++) { - if (aarp_table[i] == NULL || amt_ptr < aarp_table[i] || amt_ptr >= (aarp_table[i] + 1)) + if (aarp_table[i] == NULL || (void *)amt_ptr < (void *)aarp_table[i] || + (void *)amt_ptr >= (void *)(aarp_table[i] + 1)) continue; /* no match - try next entry */ /* @@ -834,13 +835,13 @@ if (amt_ptr->tmo == 0) { ATENABLE(s, arpinp_lock); (void) thread_funnel_set(network_flock, FALSE); - return(0); + return; } if (amt_ptr->no_of_retries < AARP_MAX_REQ_RETRIES) { ATENABLE(s, arpinp_lock); if (aarp_send_req(amt_ptr) == 0) { (void) thread_funnel_set(network_flock, FALSE); - return(0); + return; } ATDISABLE(s, arpinp_lock); } @@ -850,7 +851,7 @@ } (void) thread_funnel_set(network_flock, FALSE); - return(0); + return; } diff -urN xnu-344.49/bsd/netat/ddp_brt.c xnu-517/bsd/netat/ddp_brt.c --- xnu-344.49/bsd/netat/ddp_brt.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/ddp_brt.c Sat Oct 25 00:25:55 2003 @@ -71,6 +71,8 @@ ddp_brt_t at_ddp_brt[BRTSIZE]; int ddp_brt_sweep_timer; +void ddp_brt_sweep(); + void ddp_glean(mp, ifID, src_addr) register gbuf_t *mp; register at_ifaddr_t *ifID; diff -urN xnu-344.49/bsd/netat/ddp_lap.c xnu-517/bsd/netat/ddp_lap.c --- xnu-344.49/bsd/netat/ddp_lap.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/ddp_lap.c Sat Oct 25 00:25:55 2003 @@ -887,10 +887,22 @@ /* Get DDP started */ if ((errno = ddp_add_if(elapp))) return(errno); - + + // check if we still have an interface - can be lost when + // ddp_add_if calls malloc + // need to make check here after ddp_add_if completes because + // lap_online will call ddp_rem_if if we fail here + if (elapp->aa_ifp == 0) + return ENOENT; + /* set up multicast address for cable-wide broadcasts */ (void)at_reg_mcast(elapp, (caddr_t)&elapp->cable_multicast_addr); + // need to check again if interface is present + // can be lost in at_reg_mcast + if 
(elapp->aa_ifp == 0) + return ENOENT; + elapp->startup_inprogress = TRUE; if (! (elapp->startup_error = re_aarp(elapp))) (void)tsleep(&elapp->startup_inprogress, PSOCK | PCATCH, @@ -1083,8 +1095,6 @@ vm_offset_t temp_rcb_data, temp_state_data; int i, s, active_skts = 0; /* count of active pids for non-socketized AppleTalk protocols */ - extern int aarp_sched_probe(); - /* Network is shutting down... send error messages up on each open * socket. @@ -1235,29 +1245,6 @@ } ddp_start(); - /* free buffers for large arrays used by atp. - * to prevent a race condition if the funnel is dropped - * while calling kmem_free, the fields are grabbed and - * zeroed first. - */ - if (atp_rcb_data != NULL) { - temp_rcb_data = (vm_offset_t)atp_rcb_data; - atp_rcb_data = NULL; - atp_rcb_free_list = NULL; - } else - temp_rcb_data = NULL; - if (atp_state_data != NULL) { - temp_state_data = (vm_offset_t)atp_state_data; - atp_state_data = NULL; - atp_free_list = NULL; - } else - temp_state_data = NULL; - - if (temp_rcb_data) - kmem_free(kernel_map, temp_rcb_data, sizeof(struct atp_rcb) * NATP_RCB); - if (temp_state_data) - kmem_free(kernel_map, temp_state_data, sizeof(struct atp_state) * NATP_STATE); - splx(s); return(0); } /* ddp_shutdown */ @@ -1364,7 +1351,7 @@ ATDISABLE(s, arpinp_lock); elapp = probe_cb->elapp; - if ( (elapp != NULL) && elapp->startup_inprogress ) { + if ( (elapp != NULL) && elapp->startup_inprogress && elapp->aa_ifp != 0) { ATENABLE(s, arpinp_lock); /* was AARPContinue */ diff -urN xnu-344.49/bsd/netat/ddp_r_rtmp.c xnu-517/bsd/netat/ddp_r_rtmp.c --- xnu-344.49/bsd/netat/ddp_r_rtmp.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/ddp_r_rtmp.c Sat Oct 25 00:25:55 2003 @@ -86,7 +86,7 @@ void rtmp_timeout(); void rtmp_send_port(); void rtmp_send_port_funnel(); -void rtmp_dropper(); +void rtmp_dropper(void *); void rtmp_shutdown(); static void rtmp_update(); static void rtmp_request(); @@ -1223,7 +1223,7 @@ * the actual packet dropping is done in ddp_input */ -void 
rtmp_dropper() +void rtmp_dropper(void *arg) { boolean_t funnel_state; diff -urN xnu-344.49/bsd/netat/ddp_r_zip.c xnu-517/bsd/netat/ddp_r_zip.c --- xnu-344.49/bsd/netat/ddp_r_zip.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/ddp_r_zip.c Sat Oct 25 00:25:55 2003 @@ -92,8 +92,8 @@ static int netinfo_reply_pending; static void zip_netinfo_reply(at_x_zip_t *, at_ifaddr_t *); static void zip_getnetinfo(at_ifaddr_t *); -static void zip_getnetinfo_funnel(at_ifaddr_t *); -static void send_phony_reply(gbuf_t *); +static void zip_getnetinfo_funnel(void *); +static void send_phony_reply(void *); /* * zip_send_getnetinfo_reply: we received a GetNetInfo packet, we need to reply @@ -992,9 +992,10 @@ } /* funnel version of zip_getnetinfo */ -static void zip_getnetinfo_funnel(ifID) - register at_ifaddr_t *ifID; +static void zip_getnetinfo_funnel(arg) + void *arg; { + at_ifaddr_t *ifID = (at_ifaddr_t *)arg; thread_funnel_set(network_flock, TRUE); zip_getnetinfo(ifID); thread_funnel_set(network_flock, FALSE); @@ -1261,9 +1262,10 @@ } static void -send_phony_reply(rm) - gbuf_t *rm; +send_phony_reply(arg) + void *arg; { + gbuf_t *rm = (gbuf_t *)arg; boolean_t funnel_state; funnel_state = thread_funnel_set(network_flock, TRUE); diff -urN xnu-344.49/bsd/netat/ddp_usrreq.c xnu-517/bsd/netat/ddp_usrreq.c --- xnu-344.49/bsd/netat/ddp_usrreq.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/ddp_usrreq.c Sat Oct 25 00:25:55 2003 @@ -87,12 +87,15 @@ at_ddp_t *ddp = NULL; struct atpcb *pcb = (struct atpcb *)((so)->so_pcb); + error = soreserve(so, ddp_sendspace, ddp_recvspace); + if (error != 0) + return error; + s = splnet(); error = at_pcballoc(so, &ddp_head); splx(s); if (error) return error; - error = soreserve(so, ddp_sendspace, ddp_recvspace); pcb = (struct atpcb *)((so)->so_pcb); pcb->pid = current_proc()->p_pid; pcb->ddptype = (u_char) proto; /* set in socreate() */ diff -urN xnu-344.49/bsd/netat/drv_dep.c xnu-517/bsd/netat/drv_dep.c --- xnu-344.49/bsd/netat/drv_dep.c Thu Sep 18 
21:01:31 2003 +++ xnu-517/bsd/netat/drv_dep.c Sat Oct 25 00:25:55 2003 @@ -296,7 +296,7 @@ m_freem(m); continue; /* was EAFNOSUPPORT */ } -#endif COMMENT +#endif /* COMMENT */ llc_header = (llc_header_t *)(enet_header+1); diff -urN xnu-344.49/bsd/netat/sys_dep.c xnu-517/bsd/netat/sys_dep.c --- xnu-344.49/bsd/netat/sys_dep.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/sys_dep.c Sat Oct 25 00:25:55 2003 @@ -63,11 +63,12 @@ extern at_state_t at_state; /* global state of AT network */ extern at_ifaddr_t *ifID_home; /* default interface */ +struct ATsocket_args { + int proto; +}; int ATsocket(proc, uap, retval) void *proc; - struct { - int proto; - } *uap; + struct ATsocket_args *uap; int *retval; { int err; @@ -87,14 +88,15 @@ return err; } +struct ATgetmsg_args { + int fd; + void *ctlptr; + void *datptr; + int *flags; +}; int ATgetmsg(proc, uap, retval) void *proc; - struct { - int fd; - void *ctlptr; - void *datptr; - int *flags; - } *uap; + struct ATgetmsg_args *uap; int *retval; { int err; @@ -116,14 +118,15 @@ return err; } -int ATputmsg(proc, uap, retval) - void *proc; - struct { +struct ATputmsg_args { int fd; void *ctlptr; void *datptr; int flags; - } *uap; +}; +int ATputmsg(proc, uap, retval) + void *proc; + struct ATputmsg_args *uap; int *retval; { int err; @@ -145,14 +148,15 @@ return err; } +struct ATPsndreq_args { + int fd; + unsigned char *buf; + int len; + int nowait; +}; int ATPsndreq(proc, uap, retval) void *proc; - struct { - int fd; - unsigned char *buf; - int len; - int nowait; - } *uap; + struct ATPsndreq_args *uap; int *retval; { int err; @@ -174,14 +178,15 @@ return err; } -int ATPsndrsp(proc, uap, retval) - void *proc; - struct { +struct ATPsndrsp_args { int fd; unsigned char *respbuff; int resplen; int datalen; - } *uap; +}; +int ATPsndrsp(proc, uap, retval) + void *proc; + struct ATPsndrsp_args *uap; int *retval; { int err; @@ -203,13 +208,14 @@ return err; } -int ATPgetreq(proc, uap, retval) - void *proc; - struct { +struct ATPgetreq_args 
{ int fd; unsigned char *buf; int buflen; - } *uap; +}; +int ATPgetreq(proc, uap, retval) + void *proc; + struct ATPgetreq_args *uap; int *retval; { int err; @@ -231,12 +237,13 @@ return err; } -int ATPgetrsp(proc, uap, retval) - void *proc; - struct { +struct ATPgetrsp_args { int fd; unsigned char *bdsp; - } *uap; +}; +int ATPgetrsp(proc, uap, retval) + void *proc; + struct ATPgetrsp_args *uap; int *retval; { int err = 0; @@ -277,9 +284,9 @@ int *retfd; struct proc *proc; { - extern int _ATread(), _ATwrite(),_ATioctl(), _ATselect(), _ATclose(); + extern int _ATread(), _ATwrite(),_ATioctl(), _ATselect(), _ATclose(), _ATkqfilter(); static struct fileops fileops = - {_ATread, _ATwrite, _ATioctl, _ATselect, _ATclose}; + {_ATread, _ATwrite, _ATioctl, _ATselect, _ATclose, _ATkqfilter}; int err, fd; struct file *fp; @@ -324,7 +331,8 @@ return EBADF; } } - if ((*grefp = (gref_t *)fp->f_data) == 0) { + *grefp = (gref_t *)fp->f_data; + if (*grefp == 0 || *grefp == (gref_t *)(-1)) { thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); return EBADF; } diff -urN xnu-344.49/bsd/netat/sys_glue.c xnu-517/bsd/netat/sys_glue.c --- xnu-344.49/bsd/netat/sys_glue.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/sys_glue.c Sat Oct 25 00:25:55 2003 @@ -82,7 +82,7 @@ &dbgBits, dbgBits, "AppleTalk Debug Flags"); volatile int RouterMix = RT_MIX_DEFAULT; /* default for nbr of ppsec */ SYSCTL_INT(_net_appletalk, OID_AUTO, routermix, CTLFLAG_WR, - &RouterMix, 0, "Appletalk RouterMix"); + (int *)&RouterMix, 0, "Appletalk RouterMix"); at_ddp_stats_t at_ddp_stats; /* DDP statistics */ SYSCTL_STRUCT(_net_appletalk, OID_AUTO, ddpstats, CTLFLAG_RD, &at_ddp_stats, at_ddp_stats, "AppleTalk DDP Stats"); @@ -635,6 +635,14 @@ return rc; } +int _ATkqfilter(fp, kn, p) + struct file *fp; + struct knote *kn; + struct proc *p; +{ + return (EOPNOTSUPP); +} + void atalk_putnext(gref, m) gref_t *gref; gbuf_t *m; @@ -925,9 +933,9 @@ struct mbuf *m_clattach(extbuf, extfree, extsize, extarg, wait) caddr_t extbuf; 
- int (*extfree)(); - int extsize; - int extarg; + void (*extfree)(caddr_t , u_int, caddr_t); + u_int extsize; + caddr_t extarg; int wait; { struct mbuf *m; @@ -985,8 +993,9 @@ */ void m_lgbuf_free(buf, size, arg) - void *buf; - int size, arg; /* not needed, but they're in m_free() */ + caddr_t buf; + u_int size; + caddr_t arg; /* not needed, but they're in m_free() */ { /* FREE(buf, M_MCLUST); - can't free here - called from m_free while under lock */ @@ -1030,7 +1039,7 @@ if (NULL == (m = m_clattach(buf, m_lgbuf_free, size, 0, (wait)? M_WAIT: M_DONTWAIT))) { - m_lgbuf_free(buf); + m_lgbuf_free(buf, 0, 0); return(NULL); } } else { diff -urN xnu-344.49/bsd/netat/sysglue.h xnu-517/bsd/netat/sysglue.h --- xnu-344.49/bsd/netat/sysglue.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netat/sysglue.h Sat Oct 25 00:25:55 2003 @@ -103,7 +103,6 @@ * in MacOSX. Need to find a better Error code ###LD */ #define ENOTREADY ESHUTDOWN -#define ENOMSG EOPNOTSUPP #define EPROTO EPROTOTYPE /* T_MPSAFE is used only in atp_open. 
I suspect it's a diff -urN xnu-344.49/bsd/netinet/dhcp_options.c xnu-517/bsd/netinet/dhcp_options.c --- xnu-344.49/bsd/netinet/dhcp_options.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/dhcp_options.c Sat Oct 25 00:25:55 2003 @@ -102,7 +102,7 @@ else if (list->size == list->count) { #ifdef DEBUG printf("doubling %d to %d\n", list->size, list->size * 2); -#endif DEBUG +#endif /* DEBUG */ list->array = my_realloc(list->array, sizeof(*list->array) * list->size, sizeof(*list->array) * list->size * 2); @@ -540,4 +540,4 @@ } exit(0); } -#endif TEST_DHCP_OPTIONS +#endif /* TEST_DHCP_OPTIONS */ diff -urN xnu-344.49/bsd/netinet/icmp6.h xnu-517/bsd/netinet/icmp6.h --- xnu-344.49/bsd/netinet/icmp6.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/icmp6.h Sat Oct 25 00:25:55 2003 @@ -658,7 +658,7 @@ void icmp6_init __P((void)); void icmp6_paramerror __P((struct mbuf *, int)); void icmp6_error __P((struct mbuf *, int, int, int)); -int icmp6_input __P((struct mbuf **, int *, int)); +int icmp6_input __P((struct mbuf **, int *)); void icmp6_fasttimo __P((void)); void icmp6_reflect __P((struct mbuf *, size_t)); void icmp6_prepare __P((struct mbuf *)); diff -urN xnu-344.49/bsd/netinet/icmp_var.h xnu-517/bsd/netinet/icmp_var.h --- xnu-344.49/bsd/netinet/icmp_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/icmp_var.h Sat Oct 25 00:25:55 2003 @@ -90,13 +90,15 @@ #define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ #define ICMPCTL_STATS 2 /* statistics (read-only) */ #define ICMPCTL_ICMPLIM 3 -#define ICMPCTL_MAXID 4 +#define ICMPCTL_TIMESTAMP 4 /* allow replies to time stamp requests */ +#define ICMPCTL_MAXID 5 #define ICMPCTL_NAMES { \ { 0, 0 }, \ { "maskrepl", CTLTYPE_INT }, \ { "stats", CTLTYPE_STRUCT }, \ { "icmplim", CTLTYPE_INT }, \ + { "icmptimestamp", CTLTYPE_INT }, \ } #endif /* __APPLE_API_UNSTABLE */ diff -urN xnu-344.49/bsd/netinet/if_ether.c xnu-517/bsd/netinet/if_ether.c --- xnu-344.49/bsd/netinet/if_ether.c Thu Sep 18 21:01:31 2003 +++ 
xnu-517/bsd/netinet/if_ether.c Sat Oct 25 00:25:55 2003 @@ -405,7 +405,7 @@ (void)memcpy(ea->arp_tpa, tip, sizeof(ea->arp_tpa)); sa.sa_family = AF_UNSPEC; sa.sa_len = sizeof(sa); - dlil_output(((struct ifnet *)ac)->if_data.default_proto, m, 0, &sa, 0); + dlil_output(ifptodlt(((struct ifnet *)ac), PF_INET), m, 0, &sa, 0); } /* diff -urN xnu-344.49/bsd/netinet/igmp.c xnu-517/bsd/netinet/igmp.c --- xnu-344.49/bsd/netinet/igmp.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/igmp.c Sat Oct 25 00:25:55 2003 @@ -144,31 +144,35 @@ find_rti(ifp) struct ifnet *ifp; { - register struct router_info *rti = Head; - + register struct router_info *rti = Head; + + #if IGMP_DEBUG printf("[igmp.c, _find_rti] --> entering \n"); #endif - while (rti) { - if (rti->rti_ifp == ifp) { + while (rti) { + if (rti->rti_ifp == ifp) { #if IGMP_DEBUG printf("[igmp.c, _find_rti] --> found old entry \n"); #endif - return rti; - } - rti = rti->rti_next; - } - + return rti; + } + rti = rti->rti_next; + } + MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT); - rti->rti_ifp = ifp; - rti->rti_type = IGMP_V2_ROUTER; - rti->rti_time = 0; - rti->rti_next = Head; - Head = rti; + if (rti != NULL) + { + rti->rti_ifp = ifp; + rti->rti_type = IGMP_V2_ROUTER; + rti->rti_time = 0; + rti->rti_next = Head; + Head = rti; + } #if IGMP_DEBUG - printf("[igmp.c, _find_rti] --> created an entry \n"); + if (rti) printf("[igmp.c, _find_rti] --> created an entry \n"); #endif - return rti; + return rti; } void @@ -227,6 +231,10 @@ if (timer == 0) timer = 1; rti = find_rti(ifp); + if (rti == NULL) { + m_freem(m); + return; + } /* * In the IGMPv2 specification, there are 3 states and a flag. 
@@ -364,7 +372,7 @@ rip_input(m, iphlen); } -void +int igmp_joingroup(inm) struct in_multi *inm; { @@ -376,12 +384,14 @@ inm->inm_state = IGMP_OTHERMEMBER; } else { inm->inm_rti = find_rti(inm->inm_ifp); + if (inm->inm_rti == NULL) return ENOMEM; igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); inm->inm_state = IGMP_IREPORTEDLAST; igmp_timers_are_running = 1; } + return 0; splx(s); } diff -urN xnu-344.49/bsd/netinet/igmp_var.h xnu-517/bsd/netinet/igmp_var.h --- xnu-344.49/bsd/netinet/igmp_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/igmp_var.h Sat Oct 25 00:25:55 2003 @@ -114,7 +114,7 @@ void igmp_init __P((void)); void igmp_input __P((struct mbuf *, int)); -void igmp_joingroup __P((struct in_multi *)); +int igmp_joingroup __P((struct in_multi *)); void igmp_leavegroup __P((struct in_multi *)); void igmp_fasttimo __P((void)); void igmp_slowtimo __P((void)); diff -urN xnu-344.49/bsd/netinet/in.c xnu-517/bsd/netinet/in.c --- xnu-344.49/bsd/netinet/in.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/in.c Sat Oct 25 00:25:55 2003 @@ -105,11 +105,6 @@ struct in_multihead in_multihead; /* XXX BSS initialization */ extern void arp_rtrequest(); -extern int ether_detach_inet(struct ifnet *ifp); - -#if INET6 -extern int ip6_auto_on; -#endif /* * Return 1 if an internet address is for a ``local'' host @@ -340,24 +335,15 @@ * Temorary code for protocol attachment XXX */ - if (ifp->if_type == IFT_ETHER) - dl_tag = ether_attach_inet(ifp); - - if (ifp->if_type == IFT_LOOP) - dl_tag = lo_attach_inet(ifp); -#if NFAITH - /* Is this right? */ - if (ifp && ifp->if_type == IFT_FAITH) - dl_tag = faith_attach_inet(ifp); -#endif -#if NGIF - /* Is this right? 
*/ - if (ifp && ifp->if_type == IFT_GIF) - dl_tag = gif_attach_proto_family(ifp, PF_INET); -#endif + /* Generic protocol plumbing */ + + if (error = dlil_plumb_protocol(PF_INET, ifp, &dl_tag)) { + kprintf("in.c: warning can't plumb proto if=%s%n type %d error=%d\n", + ifp->if_name, ifp->if_unit, ifp->if_type, error); + error = 0; /*discard error, can be cold with unsupported interfaces */ + } /* End of temp code */ - ifa->ifa_dlt = dl_tag; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; @@ -379,8 +365,6 @@ return error; if (ifp == 0) return (EADDRNOTAVAIL); - if (strcmp(ifp->if_name, "en")) - return ENODEV; break; case SIOCSIFBRDADDR: @@ -523,11 +507,9 @@ (struct sockaddr_in *) &ifr->ifr_addr, 1)); case SIOCPROTOATTACH: - ether_attach_inet(ifp); -#if INET6 - if (ip6_auto_on) /* FreeBSD compat mode: Acquire linklocal addresses for IPv6 for if */ - in6_if_up(ifp); -#endif + error = dlil_plumb_protocol(PF_INET, ifp, &dl_tag); + if (error) + return(error); break; case SIOCPROTODETACH: @@ -535,17 +517,10 @@ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) return EBUSY; - error = ether_detach_inet(ifp); + + error = dlil_unplumb_protocol(PF_INET, ifp); if (error) return(error); -#if INET6 - if (ip6_auto_on) { /* if we linked ipv6 addresses to v4, remove them now */ - in6_purgeif(ifp); - error = ether_detach_inet6(ifp); - if (error) - return(error); - } -#endif break; @@ -1062,7 +1037,6 @@ register u_long i = ntohl(sin->sin_addr.s_addr); struct sockaddr_in oldaddr; int s = splimp(), flags = RTF_UP, error; - u_long dl_tag; oldaddr = ia->ia_addr; ia->ia_addr = *sin; @@ -1243,7 +1217,13 @@ /* * Let IGMP know that we have joined a new IP multicast group. 
*/ - igmp_joingroup(inm); + error = igmp_joingroup(inm); + if (error) { + if_delmultiaddr(ifma); + LIST_REMOVE(inm, inm_link); + _FREE(inm, M_IPMADDR); + inm = NULL; + } splx(s); return (inm); } @@ -1260,7 +1240,7 @@ /* We intentionally do this a bit differently than BSD */ - if (ifma->ifma_refcount == 1) { + if (ifma && ifma->ifma_refcount == 1) { /* * No remaining claims to this record; let IGMP know that * we are leaving the multicast group. @@ -1271,6 +1251,7 @@ FREE(inm, M_IPMADDR); } /* XXX - should be separate API for when we have an ifma? */ - if_delmulti(ifma->ifma_ifp, ifma->ifma_addr); + if (ifma) + if_delmultiaddr(ifma); splx(s); } diff -urN xnu-344.49/bsd/netinet/in.h xnu-517/bsd/netinet/in.h --- xnu-344.49/bsd/netinet/in.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/in.h Sat Oct 25 00:25:55 2003 @@ -363,6 +363,7 @@ #ifdef __APPLE__ #define IP_STRIPHDR 23 /* bool: drop receive of raw IP header */ #endif +#define IP_RECVTTL 24 /* bool; receive reception TTL w/dgram */ #define IP_FW_ADD 40 /* add a firewall rule to chain */ diff -urN xnu-344.49/bsd/netinet/in_bootp.c xnu-517/bsd/netinet/in_bootp.c --- xnu-344.49/bsd/netinet/in_bootp.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/in_bootp.c Sat Oct 25 00:25:55 2003 @@ -71,9 +71,9 @@ #ifdef BOOTP_DEBUG #define dprintf(x) printf x; -#else BOOTP_DEBUG +#else /* !BOOTP_DEBUG */ #define dprintf(x) -#endif BOOTP_DEBUG +#endif /* BOOTP_DEBUG */ /* ip address formatting macros */ #define IP_FORMAT "%d.%d.%d.%d" @@ -228,7 +228,7 @@ " slen %d addr ", dl_p->sdl_len, dl_p->sdl_index, dl_p->sdl_family, dl_p->sdl_type, dl_p->sdl_nlen, dl_p->sdl_alen, dl_p->sdl_slen); -#endif 0 +#endif for (i = 0; i < dl_p->sdl_alen; i++) printf("%s%x", i ? 
":" : "", (link_address(dl_p))[i]); @@ -272,7 +272,7 @@ sin.sin_addr.s_addr = INADDR_BROADCAST; m = ip_pkt_to_mbuf((caddr_t)pkt, sizeof(*pkt)); - return (dlil_output(ifp->if_data.default_proto, m, 0, (struct sockaddr *)&sin, 0)); + return (dlil_output(ifptodlt(ifp, PF_INET), m, 0, (struct sockaddr *)&sin, 0)); } /* @@ -451,7 +451,7 @@ #ifdef BOOTP_DEBUG print_reply_short(reply, n); -#endif BOOTP_DEBUG +#endif /* BOOTP_DEBUG */ (void)dhcpol_parse_packet(&options, (struct dhcp *)reply, n, NULL); rating = rate_packet(reply, n, &options); diff -urN xnu-344.49/bsd/netinet/in_pcb.c xnu-517/bsd/netinet/in_pcb.c --- xnu-344.49/bsd/netinet/in_pcb.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/in_pcb.c Sat Oct 25 00:25:55 2003 @@ -105,6 +105,8 @@ extern int ipsec_bypass; #endif +extern u_long route_generation; + #define DBG_FNC_PCB_LOOKUP NETDBG_CODE(DBG_NETTCP, (6 << 8)) #define DBG_FNC_PCB_HLOOKUP NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1)) @@ -451,12 +453,16 @@ /* * If route is known or can be allocated now, * our src addr is taken from the i/f, else punt. + * Note that we should check the address family of the cached + * destination, in case of sharing the cache with IPv6. 
*/ ro = &inp->inp_route; if (ro->ro_rt && - (satosin(&ro->ro_dst)->sin_addr.s_addr != + (ro->ro_dst.sa_family != AF_INET || + satosin(&ro->ro_dst)->sin_addr.s_addr != sin->sin_addr.s_addr || - inp->inp_socket->so_options & SO_DONTROUTE)) { + inp->inp_socket->so_options & SO_DONTROUTE || + ro->ro_rt->generation_id != route_generation)) { rtfree(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } @@ -464,6 +470,7 @@ (ro->ro_rt == (struct rtentry *)0 || ro->ro_rt->rt_ifp == (struct ifnet *)0)) { /* No route yet, so try to acquire one */ + bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = @@ -557,6 +564,7 @@ return (error); } inp->inp_laddr = ifaddr->sin_addr; + inp->inp_flags |= INP_INADDR_ANY; } inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; @@ -614,8 +622,10 @@ rt->rt_gateway, rt_mask(rt), rt->rt_flags, (struct rtentry **)0); } - else + else { rtfree(rt); + inp->inp_route.ro_rt = 0; + } } ip_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; @@ -1115,7 +1125,7 @@ LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); - if (LIST_FIRST(&phd->phd_pcblist) == NULL) { + if (phd != NULL && (LIST_FIRST(&phd->phd_pcblist) == NULL)) { LIST_REMOVE(phd, phd_hash); FREE(phd, M_PCB); } diff -urN xnu-344.49/bsd/netinet/in_pcb.h xnu-517/bsd/netinet/in_pcb.h --- xnu-344.49/bsd/netinet/in_pcb.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/in_pcb.h Sat Oct 25 00:25:55 2003 @@ -103,8 +103,8 @@ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* hash list */ - struct in_addr inp_faddr; /* foreign host table entry */ - struct in_addr inp_laddr; /* local host table entry */ + struct in_addr reserved1; /* APPLE reserved: inp_faddr defined in protcol indep. 
part */ + struct in_addr reserved2; /* APPLE reserved */ u_short inp_fport; /* foreign port */ u_short inp_lport; /* local port */ LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ @@ -273,7 +273,10 @@ #ifdef __APPLE__ #define INP_STRIPHDR 0x200 /* Strip headers in raw_ip, for OT support */ #endif -#define INP_FAITH 0x400 /* accept FAITH'ed connections */ +#define INP_FAITH 0x400 /* accept FAITH'ed connections */ +#define INP_INADDR_ANY 0x800 /* local address wasn't specified */ + +#define INP_RECVTTL 0x1000 #define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ @@ -290,7 +293,7 @@ INP_RECVIF|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ - IN6P_AUTOFLOWLABEL) + IN6P_AUTOFLOWLABEL|INP_RECVTTL) #define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\ IN6P_AUTOFLOWLABEL) diff -urN xnu-344.49/bsd/netinet/in_rmx.c xnu-517/bsd/netinet/in_rmx.c --- xnu-344.49/bsd/netinet/in_rmx.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/in_rmx.c Sat Oct 25 00:25:55 2003 @@ -223,6 +223,9 @@ &check_routeselfref , 0, ""); #endif +__private_extern__ int use_routegenid = 1; +SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW, + &use_routegenid , 0, ""); /* * On last reference drop, mark the route as belong to us so that it can be diff -urN xnu-344.49/bsd/netinet/in_var.h xnu-517/bsd/netinet/in_var.h --- xnu-344.49/bsd/netinet/in_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/in_var.h Sat Oct 25 00:25:55 2003 @@ -108,7 +108,7 @@ * Event data, internet style. 
*/ struct kev_in_data { - struct net_event_data link_data; + struct net_event_data link_data; struct in_addr ia_addr; u_long ia_net; /* network number of interface */ u_long ia_netmask; /* mask of net part */ diff -urN xnu-344.49/bsd/netinet/ip_divert.c xnu-517/bsd/netinet/ip_divert.c --- xnu-344.49/bsd/netinet/ip_divert.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/ip_divert.c Sat Oct 25 00:25:55 2003 @@ -554,7 +554,9 @@ return error; } +#ifndef __APPLE__ #warning Fix SYSCTL net_inet_divert +#endif #if 0 SYSCTL_DECL(_net_inet_divert); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, diff -urN xnu-344.49/bsd/netinet/ip_flow.c xnu-517/bsd/netinet/ip_flow.c --- xnu-344.49/bsd/netinet/ip_flow.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/ip_flow.c Sat Oct 25 00:25:55 2003 @@ -197,7 +197,7 @@ dst = &ipf->ipf_ro.ro_dst; #ifdef __APPLE__ /* Not sure the rt_dlt is valid here !! XXX */ - if ((error = dlil_output((u_long)rt->rt_dlt, m, (caddr_t) rt, dst, 0)) != 0) { + if ((error = dlil_output(ifptodlt(rt->rt_ifp, PF_INET), m, (caddr_t) rt, dst, 0)) != 0) { #else if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, dst, rt)) != 0) { diff -urN xnu-344.49/bsd/netinet/ip_icmp.c xnu-517/bsd/netinet/ip_icmp.c --- xnu-344.49/bsd/netinet/ip_icmp.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/ip_icmp.c Sat Oct 25 00:25:55 2003 @@ -102,6 +102,10 @@ SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, &icmpmaskrepl, 0, ""); +static int icmptimestamp = 0; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW, + &icmptimestamp, 0, ""); + static int drop_redirect = 0; SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, &drop_redirect, 0, ""); @@ -117,7 +121,7 @@ * variable content is -1 and read-only. 
*/ -static int icmplim = 100; +static int icmplim = 250; SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, &icmplim, 0, ""); #else @@ -483,6 +487,10 @@ goto reflect; case ICMP_TSTAMP: + + if (icmptimestamp == 0) + break; + if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { icmpstat.icps_bmcasttstamp++; @@ -1011,6 +1019,7 @@ case IP_FAITH: #endif case IP_STRIPHDR: + case IP_RECVTTL: error = rip_ctloutput(so, sopt); break; diff -urN xnu-344.49/bsd/netinet/ip_input.c xnu-517/bsd/netinet/ip_input.c --- xnu-344.49/bsd/netinet/ip_input.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/ip_input.c Sat Oct 25 00:25:55 2003 @@ -286,6 +286,9 @@ extern u_short ip_id; #endif +extern u_long route_generation; +extern int apple_hwcksum_rx; + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -363,7 +366,7 @@ u_int16_t divert_cookie; /* firewall cookie */ struct in_addr pkt_dst; #if IPDIVERT - u_int32_t divert_info = 0; /* packet divert/tee info */ + u_int16_t divert_info = 0; /* packet divert/tee info */ #endif struct ip_fw_chain *rule = NULL; @@ -450,11 +453,9 @@ goto bad; } } - if (m->m_pkthdr.rcvif->if_hwassist == 0) - m->m_pkthdr.csum_flags = 0; - - if ((m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) && ip->ip_p != IPPROTO_TCP) - m->m_pkthdr.csum_flags = 0; + if ((m->m_pkthdr.rcvif->if_hwassist == 0) || (apple_hwcksum_rx == 0) || + ((m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) && ip->ip_p != IPPROTO_TCP)) + m->m_pkthdr.csum_flags = 0; /* invalidate HW generated checksum flags */ if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); @@ -658,7 +659,7 @@ * ether_output() with the loopback into the stack for * SIMPLEX interfaces handled by ether_output(). 
*/ - if (ia->ia_ifp == m->m_pkthdr.rcvif && + if ((!checkif || ia->ia_ifp == m->m_pkthdr.rcvif) && ia->ia_ifp && ia->ia_ifp->if_flags & IFF_BROADCAST) { if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == pkt_dst.s_addr) @@ -685,7 +686,7 @@ } /* - * The process-level routing demon needs to receive + * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ @@ -836,6 +837,9 @@ goto bad; } m->m_flags |= M_FRAG; + } else { + /* Clear the flag in case packet comes from loopback */ + m->m_flags &= ~M_FRAG; } ip->ip_off <<= 3; @@ -1567,7 +1571,8 @@ sin = (struct sockaddr_in *) &ipforward_rt.ro_dst; - if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) { + if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr || + ipforward_rt.ro_rt->generation_id != route_generation) { if (ipforward_rt.ro_rt) { rtfree(ipforward_rt.ro_rt); ipforward_rt.ro_rt = 0; @@ -1769,7 +1774,8 @@ sin = (struct sockaddr_in *)&ipforward_rt.ro_dst; if ((rt = ipforward_rt.ro_rt) == 0 || - ip->ip_dst.s_addr != sin->sin_addr.s_addr) { + ip->ip_dst.s_addr != sin->sin_addr.s_addr || + ipforward_rt.ro_rt->generation_id != route_generation) { if (ipforward_rt.ro_rt) { rtfree(ipforward_rt.ro_rt); ipforward_rt.ro_rt = 0; @@ -2032,6 +2038,10 @@ IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; + } + if (inp->inp_flags & INP_RECVTTL) { + *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(ip->ip_ttl), IP_RECVTTL, IPPROTO_IP); + if (*mp) mp = &(*mp)->m_next; } } diff -urN xnu-344.49/bsd/netinet/ip_output.c xnu-517/bsd/netinet/ip_output.c --- xnu-344.49/bsd/netinet/ip_output.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/ip_output.c Sat Oct 25 00:25:55 2003 @@ -87,6 +87,7 @@ #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1) #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3) #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1) +#define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 
1) #if vax @@ -134,6 +135,9 @@ static int ip_setmoptions __P((struct sockopt *, struct ip_moptions **)); +int ip_createmoptions(struct ip_moptions **imop); +int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); +int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); int ip_optcopy __P((struct ip *, struct ip *)); extern int (*fr_checkp) __P((struct ip *, int, struct ifnet *, int, struct mbuf **)); #ifdef __APPLE__ @@ -144,6 +148,7 @@ void in_delayed_cksum(struct mbuf *m); extern int apple_hwcksum_tx; +extern u_long route_generation; extern struct protosw inetsw[]; @@ -169,12 +174,11 @@ struct ip_moptions *imo; { struct ip *ip, *mhip; - struct ifnet *ifp; - u_long dl_tag; + struct ifnet *ifp = NULL; struct mbuf *m = m0; int hlen = sizeof (struct ip); int len, off, error = 0; - struct sockaddr_in *dst; + struct sockaddr_in *dst = NULL; struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; #if IPSEC @@ -216,10 +220,10 @@ imo = NULL ; dst = ((struct dn_pkt *)m)->dn_dst ; ifp = ((struct dn_pkt *)m)->ifp ; - flags = ((struct dn_pkt *)m)->flags ; + flags = ((struct dn_pkt *)m)->flags; m0 = m = m->m_next ; #if IPSEC - if (ipsec_bypass == 0) { + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { so = ipsec_getsocket(m); (void)ipsec_setsocket(m, NULL); } @@ -233,7 +237,7 @@ rule = NULL ; #endif #if IPSEC - if (ipsec_bypass == 0) { + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { so = ipsec_getsocket(m); (void)ipsec_setsocket(m, NULL); } @@ -271,17 +275,24 @@ ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); dst = (struct sockaddr_in *)&ro->ro_dst; + /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. + * The address family should also be checked in case of sharing the + * cache with IPv6. 
*/ + if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != ip->ip_dst.s_addr || + ro->ro_rt->generation_id != route_generation) ) { rtfree(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } if (ro->ro_rt == 0) { + bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; @@ -300,7 +311,6 @@ goto bad; } ifp = ia->ia_ifp; - dl_tag = ia->ia_ifa.ifa_dlt; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else { @@ -322,7 +332,6 @@ } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; - dl_tag = ro->ro_rt->rt_dlt; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; @@ -345,16 +354,16 @@ * See if the caller provided any multicast options */ if (imo != NULL) { - ip->ip_ttl = imo->imo_multicast_ttl; + if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; - dl_tag = ifp->if_data.default_proto; } - if (imo->imo_multicast_vif != -1) + if (imo->imo_multicast_vif != -1 && + ((flags & IP_RAWOUTPUT) == 0 || ip->ip_src.s_addr == INADDR_ANY)) ip->ip_src.s_addr = - ip_mcast_src(imo->imo_multicast_vif); + ip_mcast_src(imo->imo_multicast_vif); } else - ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. 
*/ @@ -375,8 +384,13 @@ TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) if (ia1->ia_ifp == ifp) { ip->ip_src = IA_SIN(ia1)->sin_addr; + break; } + if (ip->ip_src.s_addr == INADDR_ANY) { + error = ENETUNREACH; + goto bad; + } } IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); @@ -499,9 +513,11 @@ #if IPSEC /* temporary for testing only: bypass ipsec alltogether */ - if (ipsec_bypass != 0) + if (ipsec_bypass != 0 || (flags & IP_NOIPSEC) != 0) goto skip_ipsec; + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + /* get SP for this packet */ if (so == NULL) sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); @@ -510,6 +526,7 @@ if (sp == NULL) { ipsecstat.out_inval++; + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); goto bad; } @@ -522,17 +539,20 @@ * This packet is just discarded. */ ipsecstat.out_polvio++; + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0); goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0); goto skip_ipsec; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0); goto bad; } break; @@ -568,7 +588,8 @@ error = ipsec4_output(&state, sp, flags); - m = state.m; + m0 = m = state.m; + if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore @@ -580,6 +601,7 @@ } } else ro = state.ro; + dst = (struct sockaddr_in *)state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output. 
*/ @@ -599,33 +621,48 @@ error = 0; break; } + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 4,0,0,0,0); goto bad; } } /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); + #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif + /* Check that there wasn't a route change and src is still valid */ + + if (ro->ro_rt->generation_id != route_generation) { + if (ifa_foraddr(ip->ip_src.s_addr) == NULL && ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { + error = EADDRNOTAVAIL; + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 5,0,0,0,0); + goto bad; + } + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + if (ro->ro_rt == NULL) { if ((flags & IP_ROUTETOIF) == 0) { printf("ip_output: " "can't update route after IPsec processing\n"); - error = EHOSTUNREACH; /*XXX*/ + error = EHOSTUNREACH; /*XXX*/ + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 6,0,0,0,0); goto bad; } } else { ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; - dl_tag = ia->ia_ifa.ifa_dlt; } /* make it flipped, again. */ NTOHS(ip->ip_len); NTOHS(ip->ip_off); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff); skip_ipsec: #endif /*IPSEC*/ @@ -641,7 +678,7 @@ if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) goto done; - ip = mtod(m = m1, struct ip *); + ip = mtod(m0 = m = m1, struct ip *); } /* @@ -666,6 +703,7 @@ * unsupported rules), but better play safe and drop * packets in case of doubt. 
*/ + m0 = m; if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) { if (m) m_freem(m); @@ -718,7 +756,7 @@ /* If 'tee', continue with original packet */ if (clone != NULL) { - m = clone; + m0 = m = clone; ip = mtod(m, struct ip *); goto pass; } @@ -778,7 +816,7 @@ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m0->m_pkthdr.csum_data = 0xffff; + m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; @@ -806,7 +844,6 @@ ia = ifatoia(ro_fwd->ro_rt->rt_ifa); ifp = ro_fwd->ro_rt->rt_ifp; - dl_tag = ro_fwd->ro_rt->rt_dlt; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; @@ -895,11 +932,11 @@ #if IPSEC /* clean ipsec history once it goes out of the node */ - if (ipsec_bypass == 0) + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) ipsec_delaux(m); #endif #if __APPLE__ - error = dlil_output(dl_tag, m, (void *) ro->ro_rt, + error = dlil_output(ifptodlt(ifp, PF_INET), m, (void *) ro->ro_rt, (struct sockaddr *)dst, 0); #else error = (*ifp->if_output)(ifp, m, @@ -1032,7 +1069,7 @@ m->m_nextpkt = 0; #if IPSEC /* clean ipsec history once it goes out of the node */ - if (ipsec_bypass == 0) + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) ipsec_delaux(m); #endif if (error == 0) { @@ -1045,7 +1082,7 @@ #endif #if __APPLE__ - error = dlil_output(dl_tag, m, (void *) ro->ro_rt, + error = dlil_output(ifptodlt(ifp, PF_INET), m, (void *) ro->ro_rt, (struct sockaddr *)dst, 0); #else error = (*ifp->if_output)(ifp, m, @@ -1060,7 +1097,7 @@ } done: #if IPSEC - if (ipsec_bypass == 0) { + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { if (ro == &iproute && ro->ro_rt) { rtfree(ro->ro_rt); ro->ro_rt = NULL; @@ -1255,6 +1292,7 @@ case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: + case IP_RECVTTL: #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: #endif @@ -1293,6 +1331,10 @@ OPTSET(INP_RECVIF); break; + 
case IP_RECVTTL: + OPTSET(INP_RECVTTL); + break; + #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: OPTSET(INP_FAITH); @@ -1391,6 +1433,7 @@ case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: + case IP_RECVTTL: case IP_PORTRANGE: #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: @@ -1423,6 +1466,10 @@ optval = OPTBIT(INP_RECVIF); break; + case IP_RECVTTL: + optval = OPTBIT(INP_RECVTTL); + break; + case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; @@ -1632,8 +1679,6 @@ struct ip_mreq mreq; struct ifnet *ifp = NULL; struct ip_moptions *imo = *imop; - struct route ro; - struct sockaddr_in *dst; int ifindex; int s; @@ -1642,18 +1687,10 @@ * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ - imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, - M_WAITOK); - - if (imo == NULL) - return (ENOBUFS); - *imop = imo; - imo->imo_multicast_ifp = NULL; - imo->imo_multicast_addr.s_addr = INADDR_ANY; - imo->imo_multicast_vif = -1; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - imo->imo_num_memberships = 0; + error = ip_createmoptions(imop); + if (error != 0) + return error; + imo = *imop; } switch (sopt->sopt_name) { @@ -1766,78 +1803,8 @@ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; - - if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { - error = EINVAL; - break; - } - s = splimp(); - /* - * If no interface address was provided, use the interface of - * the route to the given multicast address. 
- */ - if (mreq.imr_interface.s_addr == INADDR_ANY) { - bzero((caddr_t)&ro, sizeof(ro)); - dst = (struct sockaddr_in *)&ro.ro_dst; - dst->sin_len = sizeof(*dst); - dst->sin_family = AF_INET; - dst->sin_addr = mreq.imr_multiaddr; - rtalloc(&ro); - if (ro.ro_rt != NULL) { - ifp = ro.ro_rt->rt_ifp; - rtfree(ro.ro_rt); - } - else { - /* If there's no default route, try using loopback */ - mreq.imr_interface.s_addr = INADDR_LOOPBACK; - } - } - if (ifp == NULL) { - ifp = ip_multicast_if(&mreq.imr_interface, NULL); - } - - /* - * See if we found an interface, and confirm that it - * supports multicast. - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - splx(s); - break; - } - /* - * See if the membership already exists or if all the - * membership slots are full. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if (imo->imo_membership[i]->inm_ifp == ifp && - imo->imo_membership[i]->inm_addr.s_addr - == mreq.imr_multiaddr.s_addr) - break; - } - if (i < imo->imo_num_memberships) { - error = EADDRINUSE; - splx(s); - break; - } - if (i == IP_MAX_MEMBERSHIPS) { - error = ETOOMANYREFS; - splx(s); - break; - } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - if ((imo->imo_membership[i] = - in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { - error = ENOBUFS; - splx(s); - break; - } - ++imo->imo_num_memberships; - splx(s); + error = ip_addmembership(imo, &mreq); break; case IP_DROP_MEMBERSHIP: @@ -1848,54 +1815,8 @@ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; - - if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { - error = EINVAL; - break; - } - - s = splimp(); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. 
- */ - if (mreq.imr_interface.s_addr == INADDR_ANY) - ifp = NULL; - else { - ifp = ip_multicast_if(&mreq.imr_interface, NULL); - if (ifp == NULL) { - error = EADDRNOTAVAIL; - splx(s); - break; - } - } - /* - * Find the membership in the membership array. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if ((ifp == NULL || - imo->imo_membership[i]->inm_ifp == ifp) && - imo->imo_membership[i]->inm_addr.s_addr == - mreq.imr_multiaddr.s_addr) - break; - } - if (i == imo->imo_num_memberships) { - error = EADDRNOTAVAIL; - splx(s); - break; - } - /* - * Give up the multicast address record to which the - * membership points. - */ - in_delmulti(imo->imo_membership[i]); - /* - * Remove the gap in the membership array. - */ - for (++i; i < imo->imo_num_memberships; ++i) - imo->imo_membership[i-1] = imo->imo_membership[i]; - --imo->imo_num_memberships; - splx(s); + + error = ip_dropmembership(imo, &mreq); break; default: @@ -1916,6 +1837,184 @@ } return (error); +} + +/* + * Set the IP multicast options in response to user setsockopt(). + */ +__private_extern__ int +ip_createmoptions( + struct ip_moptions **imop) +{ + struct ip_moptions *imo; + imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, + M_WAITOK); + + if (imo == NULL) + return (ENOBUFS); + *imop = imo; + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_addr.s_addr = INADDR_ANY; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + imo->imo_num_memberships = 0; + + return 0; +} + +/* + * Add membership to an IPv4 multicast. 
+ */ +__private_extern__ int +ip_addmembership( + struct ip_moptions *imo, + struct ip_mreq *mreq) +{ + struct route ro; + struct sockaddr_in *dst; + struct ifnet *ifp = NULL; + int error = 0; + int s = 0; + int i; + + if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { + error = EINVAL; + return error; + } + s = splimp(); + /* + * If no interface address was provided, use the interface of + * the route to the given multicast address. + */ + if (mreq->imr_interface.s_addr == INADDR_ANY) { + bzero((caddr_t)&ro, sizeof(ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_len = sizeof(*dst); + dst->sin_family = AF_INET; + dst->sin_addr = mreq->imr_multiaddr; + rtalloc(&ro); + if (ro.ro_rt != NULL) { + ifp = ro.ro_rt->rt_ifp; + rtfree(ro.ro_rt); + } + else { + /* If there's no default route, try using loopback */ + mreq->imr_interface.s_addr = INADDR_LOOPBACK; + } + } + + if (ifp == NULL) { + ifp = ip_multicast_if(&mreq->imr_interface, NULL); + } + + /* + * See if we found an interface, and confirm that it + * supports multicast. + */ + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + splx(s); + return error; + } + /* + * See if the membership already exists or if all the + * membership slots are full. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if (imo->imo_membership[i]->inm_ifp == ifp && + imo->imo_membership[i]->inm_addr.s_addr + == mreq->imr_multiaddr.s_addr) + break; + } + if (i < imo->imo_num_memberships) { + error = EADDRINUSE; + splx(s); + return error; + } + if (i == IP_MAX_MEMBERSHIPS) { + error = ETOOMANYREFS; + splx(s); + return error; + } + /* + * Everything looks good; add a new record to the multicast + * address list for the given interface. + */ + if ((imo->imo_membership[i] = + in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { + error = ENOBUFS; + splx(s); + return error; + } + ++imo->imo_num_memberships; + splx(s); + + return error; +} + +/* + * Drop membership of an IPv4 multicast. 
+ */ +__private_extern__ int +ip_dropmembership( + struct ip_moptions *imo, + struct ip_mreq *mreq) +{ + int error = 0; + int s = 0; + struct ifnet* ifp = NULL; + int i; + + if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { + error = EINVAL; + return error; + } + + s = splimp(); + /* + * If an interface address was specified, get a pointer + * to its ifnet structure. + */ + if (mreq->imr_interface.s_addr == INADDR_ANY) + ifp = NULL; + else { + ifp = ip_multicast_if(&mreq->imr_interface, NULL); + if (ifp == NULL) { + error = EADDRNOTAVAIL; + splx(s); + return error; + } + } + /* + * Find the membership in the membership array. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if ((ifp == NULL || + imo->imo_membership[i]->inm_ifp == ifp) && + imo->imo_membership[i]->inm_addr.s_addr == + mreq->imr_multiaddr.s_addr) + break; + } + if (i == imo->imo_num_memberships) { + error = EADDRNOTAVAIL; + splx(s); + return error; + } + /* + * Give up the multicast address record to which the + * membership points. + */ + in_delmulti(imo->imo_membership[i]); + /* + * Remove the gap in the membership array. 
+ */ + for (++i; i < imo->imo_num_memberships; ++i) + imo->imo_membership[i-1] = imo->imo_membership[i]; + --imo->imo_num_memberships; + splx(s); + + return error; } /* diff -urN xnu-344.49/bsd/netinet/ip_var.h xnu-517/bsd/netinet/ip_var.h --- xnu-344.49/bsd/netinet/ip_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/ip_var.h Sat Oct 25 00:25:55 2003 @@ -172,8 +172,9 @@ /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ -#define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables */ -#define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */ +#define IP_NOIPSEC 0x4 /* No IPSec processing */ +#define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables (0x0010) */ +#define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets (0x0020) */ struct ip; struct inpcb; diff -urN xnu-344.49/bsd/netinet/raw_ip.c xnu-517/bsd/netinet/raw_ip.c --- xnu-344.49/bsd/netinet/raw_ip.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/raw_ip.c Sat Oct 25 00:25:55 2003 @@ -300,6 +300,23 @@ inp->inp_moptions)); } +int +load_ipfw() +{ + kern_return_t err; + + /* Load the kext by the identifier */ + err = kmod_load_extension("com.apple.nke.IPFirewall"); + if (err) return err; + + if (ip_fw_ctl_ptr == NULL) { + /* Wait for the kext to finish loading */ + err = tsleep(&ip_fw_ctl_ptr, PWAIT | PCATCH, "load_ipfw_kext", 5 * 60 /* 5 seconds */); + } + + return err == 0 && ip_fw_ctl_ptr == NULL ? -1 : err; +} + /* * Raw IP socket option processing. 
*/ @@ -334,9 +351,11 @@ case IP_OLD_FW_ADD: case IP_OLD_FW_GET: if (ip_fw_ctl_ptr == 0) - error = ENOPROTOOPT; - else + error = load_ipfw(); + if (ip_fw_ctl_ptr && error == 0) error = ip_fw_ctl_ptr(sopt); + else + error = ENOPROTOOPT; break; #if DUMMYNET @@ -401,9 +420,11 @@ case IP_OLD_FW_ZERO: case IP_OLD_FW_RESETLOG: if (ip_fw_ctl_ptr == 0) - error = ENOPROTOOPT; - else + error = load_ipfw(); + if (ip_fw_ctl_ptr && error == 0) error = ip_fw_ctl_ptr(sopt); + else + error = ENOPROTOOPT; break; #if DUMMYNET diff -urN xnu-344.49/bsd/netinet/tcp.h xnu-517/bsd/netinet/tcp.h --- xnu-344.49/bsd/netinet/tcp.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp.h Sat Oct 25 00:25:55 2003 @@ -158,5 +158,6 @@ #define TCP_MAXSEG 0x02 /* set maximum segment size */ #define TCP_NOPUSH 0x04 /* don't push last block of write */ #define TCP_NOOPT 0x08 /* don't use TCP options */ +#define TCP_KEEPALIVE 0x10 /* idle time used when SO_KEEPALIVE is enabled */ #endif diff -urN xnu-344.49/bsd/netinet/tcp_debug.c xnu-517/bsd/netinet/tcp_debug.c --- xnu-344.49/bsd/netinet/tcp_debug.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp_debug.c Sat Oct 25 00:25:55 2003 @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -91,7 +92,9 @@ #include #if TCPDEBUG -static int tcpconsdebug = 0; +__private_extern__ int tcpconsdebug = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcpconsdebug, CTLFLAG_RW, + &tcpconsdebug, 0, "Turn tcp debugging on or off"); #endif static struct tcp_debug tcp_debug[TCP_NDEBUG]; @@ -186,7 +189,7 @@ if (tcpconsdebug == 0) return; if (tp) - printf("%p %s:", tp, tcpstates[ostate]); + printf("%x %s:", tp, tcpstates[ostate]); else printf("???????? 
"); printf("%s ", tanames[act]); diff -urN xnu-344.49/bsd/netinet/tcp_input.c xnu-517/bsd/netinet/tcp_input.c --- xnu-344.49/bsd/netinet/tcp_input.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp_input.c Sat Oct 25 00:25:55 2003 @@ -157,7 +157,7 @@ "Listen Queue Overflow"); #if TCP_DROP_SYNFIN -static int drop_synfin = 0; +static int drop_synfin = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif @@ -365,9 +365,9 @@ */ #if INET6 int -tcp6_input(mp, offp, proto) +tcp6_input(mp, offp) struct mbuf **mp; - int *offp, proto; + int *offp; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; @@ -800,6 +800,7 @@ #if INET6 struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ + int ogencnt = so->so_gencnt; #if !IPSEC /* @@ -879,6 +880,12 @@ if (!so2) goto drop; } + /* + * Make sure listening socket did not get closed during socket allocation, + * not only this is incorrect but it is know to cause panic + */ + if (so->so_gencnt != ogencnt) + goto drop; #if IPSEC oso = so; #endif @@ -1000,7 +1007,7 @@ */ tp->t_rcvtime = 0; if (TCPS_HAVEESTABLISHED(tp->t_state)) - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); /* * Process options if not in LISTEN state, @@ -1499,7 +1506,7 @@ thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); } } else { /* @@ -1527,7 +1534,7 @@ tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); } tp->t_flags |= TF_NEEDSYN; } else @@ -1598,6 +1605,16 @@ goto drop; } break; /* continue normal processing */ + + /* Received a SYN while connection is already established. 
+ * This is a "half open connection and other anomalies" described + * in RFC793 page 34, send an ACK so the remote reset the connection + * or recovers by adjusting its sequence numberering + */ + case TCPS_ESTABLISHED: + if (thflags & TH_SYN) + goto dropafterack; + break; } /* @@ -1918,7 +1935,7 @@ tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); } /* * If segment contains data or ACK, will call tcp_reass() @@ -2992,21 +3009,16 @@ (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; -#if (MCLBYTES & (MCLBYTES - 1)) == 0 - if (mss > MCLBYTES) - mss &= ~(MCLBYTES-1); -#else - if (mss > MCLBYTES) - mss = mss / MCLBYTES * MCLBYTES; -#endif /* - * If there's a pipesize, change the socket buffer - * to that size. Make the socket buffers an integral + * If there's a pipesize (ie loopback), change the socket + * buffer to that size only if it's bigger than the current + * sockbuf size. Make the socket buffers an integral * number of mss units; if the mss is larger than * the socket buffer, decrease the mss. */ #if RTV_SPIPE - if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) + bufsize = rt->rt_rmx.rmx_sendpipe; + if (bufsize < so->so_snd.sb_hiwat) #endif bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) @@ -3020,7 +3032,8 @@ tp->t_maxseg = mss; #if RTV_RPIPE - if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) + bufsize = rt->rt_rmx.rmx_recvpipe; + if (bufsize < so->so_rcv.sb_hiwat) #endif bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { diff -urN xnu-344.49/bsd/netinet/tcp_output.c xnu-517/bsd/netinet/tcp_output.c --- xnu-344.49/bsd/netinet/tcp_output.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp_output.c Sat Oct 25 00:25:55 2003 @@ -133,6 +133,8 @@ #endif extern int slowlink_wsize; /* window correction for slow links */ +extern u_long route_generation; + /* * Tcp output routine: figure out what should be sent and send it. 
@@ -157,35 +159,15 @@ int maxburst = TCP_MAXBURST; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; -#if INET6 - int isipv6; -#endif - int last_off; + int last_off = 0; int m_off; struct mbuf *m_last = 0; struct mbuf *m_head = 0; - - - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); #if INET6 - if (isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)) { - - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | - (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), - 0,0,0); - } - else + int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ; #endif - { - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | - (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), - 0,0,0); - } + /* * Determine length of data that should be transmitted, * and flags that will be used. @@ -220,7 +202,68 @@ else tp->snd_cwnd = tp->t_maxseg * ss_fltsz; } + again: + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + +#if INET6 + if (isipv6) { + + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | + (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), + sendalot,0,0); + } + else +#endif + + { + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | + (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), + sendalot,0,0); + /* + * If the route generation id changed, we need to check that our + * local (source) IP address is still valid. If it isn't either + * return error or silently do nothing (assuming the address will + * come back before the TCP connection times out). 
+ */ + + if (tp->t_inpcb->inp_route.ro_rt != NULL && + (tp->t_inpcb->inp_route.ro_rt->generation_id != route_generation)) { + /* check that the source address is still valid */ + if (ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr) == NULL) { + if (tp->t_state >= TCPS_CLOSE_WAIT) { + tcp_close(tp); + return(EADDRNOTAVAIL); + } + + /* set Retransmit timer if it wasn't set + * reset Persist timer and shift register as the + * adversed peer window may not be valid anymore + */ + + if (!tp->t_timer[TCPT_REXMT]) { + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + + if (so->so_flags & SOF_NOADDRAVAIL) + return(EADDRNOTAVAIL); + else + return(0); /* silently ignore and keep data in socket */ + } + else { /* Clear the cached route, will be reacquired later */ + rtfree(tp->t_inpcb->inp_route.ro_rt); + tp->t_inpcb->inp_route.ro_rt = (struct rtentry *)0; + } + } + } sendalot = 0; off = tp->snd_nxt - tp->snd_una; win = min(tp->snd_wnd, tp->snd_cwnd); @@ -678,6 +721,12 @@ m->m_data += max_linkhdr; m->m_len = hdrlen; } + /* makes sure we still have data left to be sent at this point */ + if (so->so_snd.sb_mb == NULL || off == -1) { + if (m != NULL) m_freem(m); + error = 0; /* should we return an error? */ + goto out; + } m_copydata(so->so_snd.sb_mb, off, (int) len, mtod(m, caddr_t) + hdrlen); m->m_len += len; @@ -704,7 +753,13 @@ m_last = NULL; last_off = off + len; m_head = so->so_snd.sb_mb; - + + /* makes sure we still have data left to be sent at this point */ + if (m_head == NULL) { + error = 0; /* should we return an error? 
*/ + goto out; + } + /* * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that * it acted on to fullfill the current request, whether a valid 'hint' was passed in or not @@ -956,7 +1011,7 @@ struct rtentry *rt; ip->ip_len = m->m_pkthdr.len; #if INET6 - if (INP_CHECK_SOCKAF(so, AF_INET6)) + if (isipv6) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, tp->t_inpcb->in6p_route.ro_rt ? tp->t_inpcb->in6p_route.ro_rt->rt_ifp @@ -1060,9 +1115,10 @@ tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); + + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); if (sendalot) goto again; - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } diff -urN xnu-344.49/bsd/netinet/tcp_subr.c xnu-517/bsd/netinet/tcp_subr.c --- xnu-344.49/bsd/netinet/tcp_subr.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp_subr.c Sat Oct 25 00:25:55 2003 @@ -214,6 +214,7 @@ extern struct inpcbhead time_wait_slots[]; extern int cur_tw_slot; extern u_long *delack_bitmask; +extern u_long route_generation; int get_inpcb_str_size() @@ -702,6 +703,14 @@ callout_stop(tp->tt_keep); callout_stop(tp->tt_2msl); callout_stop(tp->tt_delack); +#else + /* Clear the timers before we delete the PCB. 
*/ + { + int i; + for (i = 0; i < TCPT_NTIMERS; i++) { + tp->t_timer[i] = 0; + } + } #endif KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0); @@ -740,11 +749,16 @@ goto no_valid_rt; } else -#endif /* INET6 */ - if ((rt = inp->inp_route.ro_rt) == NULL || +#endif /* INET6 */ + rt = inp->inp_route.ro_rt; + if (rt == NULL || ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr - == INADDR_ANY) + == INADDR_ANY || rt->generation_id != route_generation) { + if (tp->t_state >= TCPS_CLOSE_WAIT) + tp->t_state = TCPS_CLOSING; + goto no_valid_rt; + } if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * @@ -915,7 +929,12 @@ struct inpcb *inp; int error; { - struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + struct tcpcb *tp; + + if (inp == NULL) + return; /* pcb is gone already */ + + tp = (struct tcpcb *)inp->inp_ppcb; /* * Ignore some errors if we are hooked up. @@ -1453,13 +1472,7 @@ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) mss -= TCPOLEN_CC_APPA; -#if (MCLBYTES & (MCLBYTES - 1)) == 0 - if (mss > MCLBYTES) - mss &= ~(MCLBYTES-1); -#else - if (mss > MCLBYTES) - mss = mss / MCLBYTES * MCLBYTES; -#endif + if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; @@ -1489,7 +1502,7 @@ if (ro == NULL) return (NULL); rt = ro->ro_rt; - if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + if (rt == NULL || !(rt->rt_flags & RTF_UP) || rt->generation_id != route_generation) { /* No route yet, so try to acquire one */ if (inp->inp_faddr.s_addr != INADDR_ANY) { ro->ro_dst.sa_family = AF_INET; diff -urN xnu-344.49/bsd/netinet/tcp_timer.c xnu-517/bsd/netinet/tcp_timer.c --- xnu-344.49/bsd/netinet/tcp_timer.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp_timer.c Sat Oct 25 00:25:55 2003 @@ -93,6 +93,13 @@ #define DBG_FNC_TCP_FAST NETDBG_CODE(DBG_NETTCP, (5 << 8)) #define DBG_FNC_TCP_SLOW NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1) +/* + * NOTE - WARNING + * + * + * + * + */ static int sysctl_msec_to_ticks 
SYSCTL_HANDLER_ARGS { @@ -360,6 +367,10 @@ struct socket *so_tmp; struct tcptemp *t_template; +#if TCPDEBUG + int ostate; +#endif + #if INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0; #endif /* INET6 */ @@ -537,7 +548,7 @@ if ((always_keepalive || tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { - if (tp->t_rcvtime >= tcp_keepidle + tcp_maxidle) + if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + tcp_maxidle) goto dropit; /* * Send a packet designed to force a response @@ -561,7 +572,7 @@ } tp->t_timer[TCPT_KEEP] = tcp_keepintvl; } else - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); break; #if TCPDEBUG diff -urN xnu-344.49/bsd/netinet/tcp_timer.h xnu-517/bsd/netinet/tcp_timer.h --- xnu-344.49/bsd/netinet/tcp_timer.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp_timer.h Sat Oct 25 00:25:55 2003 @@ -155,6 +155,11 @@ } while(0) #ifdef KERNEL + +#define TCP_KEEPIDLE(tp) \ + (tp->t_keepidle && (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ? 
\ + tp->t_keepidle : tcp_keepidle) + extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ diff -urN xnu-344.49/bsd/netinet/tcp_usrreq.c xnu-517/bsd/netinet/tcp_usrreq.c --- xnu-344.49/bsd/netinet/tcp_usrreq.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp_usrreq.c Sat Oct 25 00:25:55 2003 @@ -262,7 +262,7 @@ } inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; - if (ip6_mapped_addr_on && (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) inp->inp_vflag |= INP_IPV4; else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { @@ -313,8 +313,7 @@ COMMON_START(); if (inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; - if (ip6_mapped_addr_on && - (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, (struct sockaddr *)0, p); } @@ -387,9 +386,8 @@ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; - if (!ip6_mapped_addr_on || - (inp->inp_flags & IN6P_IPV6_V6ONLY)) - return(EINVAL); + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) + return (EINVAL); in6_sin6_2_sin(&sin, sin6p); inp->inp_vflag |= INP_IPV4; @@ -993,6 +991,17 @@ error = EINVAL; break; + case TCP_KEEPALIVE: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval < 0) + error = EINVAL; + else + tp->t_keepidle = optval * PR_SLOWHZ; + break; + default: error = ENOPROTOOPT; break; @@ -1007,6 +1016,9 @@ case TCP_MAXSEG: optval = tp->t_maxseg; break; + case TCP_KEEPALIVE: + optval = tp->t_keepidle / PR_SLOWHZ; + break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; break; @@ -1037,6 +1049,11 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 
+__private_extern__ int tcp_sockthreshold = 256; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW, + &tcp_sockthreshold , 0, "TCP Socket size increased if less than threshold"); + +#define TCP_INCREASED_SPACE 65535 /* Automatically increase tcp send/rcv space to this value */ /* * Attach TCP protocol to socket, allocating * internet protocol control block, tcp control block, @@ -1054,15 +1071,28 @@ int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != NULL; #endif - if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - error = soreserve(so, tcp_sendspace, tcp_recvspace); - if (error) - return (error); - } error = in_pcballoc(so, &tcbinfo, p); if (error) return (error); + inp = sotoinpcb(so); + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + /* + * The goal is to let clients have large send/rcv default windows (TCP_INCREASED_SPACE) + * while not hogging mbuf space for servers. This is done by watching a threshold + * of tcpcbs in use and bumping the default send and rcvspace only if under that threshold. + * The theory being that busy servers have a lot more active tcpcbs and don't want the potential + * memory penalty of having much larger sockbuffs. The sysctl allows to fine tune that threshold value. */ + + if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold) + error = soreserve(so, MAX(TCP_INCREASED_SPACE, tcp_sendspace), MAX(TCP_INCREASED_SPACE,tcp_recvspace)); + else + error = soreserve(so, tcp_sendspace, tcp_recvspace); + if (error) + return (error); + } + #if INET6 if (isipv6) { inp->inp_vflag |= INP_IPV6; diff -urN xnu-344.49/bsd/netinet/tcp_var.h xnu-517/bsd/netinet/tcp_var.h --- xnu-344.49/bsd/netinet/tcp_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/tcp_var.h Sat Oct 25 00:25:55 2003 @@ -101,6 +101,7 @@ * Tcp control block, one per tcp; fields: * Organized for 16 byte cacheline efficiency. 
*/ +#if KERNEL struct tcpcb { struct tsegqe_head t_segq; int t_dupacks; /* consecutive dup acks recd */ @@ -197,8 +198,119 @@ u_long snd_cwnd_prev; /* cwnd prior to retransmit */ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ + + int t_keepidle; /* keepalive idle timer (override global if > 0) */ +}; +#else + +#define tcpcb otcpcb + +#endif + + +/* + * Jaguar compatible TCP control block, for xtcpcb + * Does not have the old fields + */ +struct otcpcb { + struct tsegqe_head t_segq; + int t_dupacks; /* consecutive dup acks recd */ + struct tcptemp *unused; /* unused now: was t_template */ + + int t_timer[TCPT_NTIMERS]; /* tcp timers */ + + struct inpcb *t_inpcb; /* back pointer to internet pcb */ + int t_state; /* state of this connection */ + u_int t_flags; +#define TF_ACKNOW 0x00001 /* ack peer immediately */ +#define TF_DELACK 0x00002 /* ack, but try to delay it */ +#define TF_NODELAY 0x00004 /* don't delay packets to coalesce */ +#define TF_NOOPT 0x00008 /* don't use tcp options */ +#define TF_SENTFIN 0x00010 /* have sent FIN */ +#define TF_REQ_SCALE 0x00020 /* have/will request window scaling */ +#define TF_RCVD_SCALE 0x00040 /* other side has requested scaling */ +#define TF_REQ_TSTMP 0x00080 /* have/will request timestamps */ +#define TF_RCVD_TSTMP 0x00100 /* a timestamp was received in SYN */ +#define TF_SACK_PERMIT 0x00200 /* other side said I could SACK */ +#define TF_NEEDSYN 0x00400 /* send SYN (implicit state) */ +#define TF_NEEDFIN 0x00800 /* send FIN (implicit state) */ +#define TF_NOPUSH 0x01000 /* don't push */ +#define TF_REQ_CC 0x02000 /* have/will request CC */ +#define TF_RCVD_CC 0x04000 /* a CC was received in SYN */ +#define TF_SENDCCNEW 0x08000 /* send CCnew instead of CC in SYN */ +#define TF_MORETOCOME 0x10000 /* More data to be appended to sock */ +#define TF_LQ_OVERFLOW 0x20000 /* listen queue overflow */ +#define TF_RXWIN0SENT 0x40000 /* sent a receiver win 0 in 
response */ +#define TF_SLOWLINK 0x80000 /* route is a on a modem speed link */ + + int t_force; /* 1 if forcing out a byte */ + + tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_max; /* highest sequence number sent; + * used to recognize retransmits + */ + tcp_seq snd_nxt; /* send next */ + tcp_seq snd_up; /* send urgent pointer */ + + tcp_seq snd_wl1; /* window update seg seq number */ + tcp_seq snd_wl2; /* window update seg ack number */ + tcp_seq iss; /* initial send sequence number */ + tcp_seq irs; /* initial receive sequence number */ + + tcp_seq rcv_nxt; /* receive next */ + tcp_seq rcv_adv; /* advertised window */ + u_long rcv_wnd; /* receive window */ + tcp_seq rcv_up; /* receive urgent pointer */ + + u_long snd_wnd; /* send window */ + u_long snd_cwnd; /* congestion-controlled window */ + u_long snd_ssthresh; /* snd_cwnd size threshold for + * for slow start exponential to + * linear switch + */ + u_int t_maxopd; /* mss plus options */ + + u_long t_rcvtime; /* inactivity time */ + u_long t_starttime; /* time connection was established */ + int t_rtttime; /* round trip time */ + tcp_seq t_rtseq; /* sequence number being timed */ + + int t_rxtcur; /* current retransmit value (ticks) */ + u_int t_maxseg; /* maximum segment size */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + + int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ + u_int t_rttmin; /* minimum rtt allowed */ + u_long t_rttupdated; /* number of times rtt sampled */ + u_long max_sndwnd; /* largest window peer has offered */ + + int t_softerror; /* possible error not yet reported */ +/* out-of-band data */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ +#define TCPOOB_HAVEDATA 0x01 +#define TCPOOB_HADDATA 0x02 +/* RFC 1323 variables */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char request_r_scale; /* pending window scaling */ + u_char requested_s_scale; + u_long ts_recent; /* timestamp echo data */ + + u_long ts_recent_age; /* when last updated */ + tcp_seq last_ack_sent; +/* RFC 1644 variables */ + tcp_cc cc_send; /* send connection count */ + tcp_cc cc_recv; /* receive connection count */ + tcp_seq snd_recover; /* for use in fast recovery */ +/* experimental */ + u_long snd_cwnd_prev; /* cwnd prior to retransmit */ + u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ + u_long t_badrxtwin; /* window for retransmit recovery */ }; + /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. @@ -356,7 +468,11 @@ struct xtcpcb { size_t xt_len; struct inpcb xt_inp; - struct tcpcb xt_tp; +#if KERNEL + struct otcpcb xt_tp; +#else + struct tcpcb xt_tp; +#endif struct xsocket xt_socket; u_quad_t xt_alignment_hack; }; diff -urN xnu-344.49/bsd/netinet/udp_usrreq.c xnu-517/bsd/netinet/udp_usrreq.c --- xnu-344.49/bsd/netinet/udp_usrreq.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet/udp_usrreq.c Sat Oct 25 00:25:55 2003 @@ -104,8 +104,6 @@ #define DBG_FNC_UDP_INPUT NETDBG_CODE(DBG_NETUDP, (5 << 8)) #define DBG_FNC_UDP_OUTPUT NETDBG_CODE(DBG_NETUDP, (6 << 8) | 1) - -#define __STDC__ 1 /* * UDP protocol implementation. * Per RFC 768, August, 1980. 
@@ -135,6 +133,8 @@ #endif extern int apple_hwcksum_rx; +extern int esp_udp_encap_port; +extern u_long route_generation; struct udpstat udpstat; /* from udp_var.h */ SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD, @@ -429,6 +429,53 @@ udp_append(last, ip, m, iphlen + sizeof(struct udphdr)); return; } + + /* + * UDP to port 4500 with a payload where the first four bytes are + * not zero is a UDP encapsulated IPSec packet. Packets where + * the payload is one byte and that byte is 0xFF are NAT keepalive + * packets. Decapsulate the ESP packet and carry on with IPSec input + * or discard the NAT keep-alive. + */ + if (ipsec_bypass == 0 && (esp_udp_encap_port & 0xFFFF) != 0 && + uh->uh_dport == ntohs((u_short)esp_udp_encap_port)) { + int payload_len = len - sizeof(struct udphdr) > 4 ? 4 : len - sizeof(struct udphdr); + if (m->m_len < iphlen + sizeof(struct udphdr) + payload_len) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr) + payload_len)) == 0) { + udpstat.udps_hdrops++; + KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); + return; + } + ip = mtod(m, struct ip *); + uh = (struct udphdr *)((caddr_t)ip + iphlen); + } + /* Check for NAT keepalive packet */ + if (payload_len == 1 && *(u_int8_t*)((caddr_t)uh + sizeof(struct udphdr)) == 0xFF) { + m_freem(m); + KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); + return; + } + else if (payload_len == 4 && *(u_int32_t*)((caddr_t)uh + sizeof(struct udphdr)) != 0) { + /* UDP encapsulated IPSec packet to pass through NAT */ + size_t stripsiz; + + stripsiz = sizeof(struct udphdr); + + ip = mtod(m, struct ip *); + ovbcopy((caddr_t)ip, (caddr_t)(((u_char *)ip) + stripsiz), iphlen); + m->m_data += stripsiz; + m->m_len -= stripsiz; + m->m_pkthdr.len -= stripsiz; + ip = mtod(m, struct ip *); + ip->ip_len = ip->ip_len - stripsiz; + ip->ip_p = IPPROTO_ESP; + + KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); + esp4_input(m, iphlen); + return; + } + } + /* * Locate pcb for datagram. 
*/ @@ -757,6 +804,24 @@ goto release; } + /* If there was a routing change, discard cached route and check + * that we have a valid source address. + * Reacquire a new source address if INADDR_ANY was specified + */ + + if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->generation_id != route_generation) { + if (ifa_foraddr(inp->inp_laddr.s_addr) == NULL) { /* src address is gone */ + if (inp->inp_flags & INP_INADDR_ANY) + inp->inp_faddr.s_addr = INADDR_ANY; /* new src will be set later */ + else { + error = EADDRNOTAVAIL; + goto release; + } + } + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)0; + } + if (addr) { laddr = inp->inp_laddr; if (inp->inp_faddr.s_addr != INADDR_ANY) { @@ -778,6 +843,8 @@ goto release; } } + + /* * Calculate data length and get a mbuf * for UDP and IP headers. @@ -785,9 +852,7 @@ M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); if (m == 0) { error = ENOBUFS; - if (addr) - splx(s); - goto release; + goto abort; } /* @@ -825,7 +890,7 @@ #if IPSEC if (ipsec_bypass == 0 && ipsec_setsocket(m, inp->inp_socket) != 0) { error = ENOBUFS; - goto release; + goto abort; } #endif /*IPSEC*/ error = ip_output(m, inp->inp_options, &inp->inp_route, @@ -839,6 +904,13 @@ } KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_END, error, 0,0,0,0); return (error); + +abort: + if (addr) { + in_pcbdisconnect(inp); + inp->inp_laddr = laddr; /* XXX rehash? 
*/ + splx(s); + } release: m_freem(m); diff -urN xnu-344.49/bsd/netinet6/Makefile xnu-517/bsd/netinet6/Makefile --- xnu-344.49/bsd/netinet6/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/netinet6/Makefile Tue Oct 21 21:24:55 2003 @@ -27,7 +27,7 @@ esp.h in6.h in6_prefix.h \ ipcomp.h mld6_var.h raw_ip6.h esp6.h \ in6_gif.h in6_var.h ip6_mroute.h ipcomp6.h \ - nd6.h scope6_var.h + nd6.h scope6_var.h ip6_fw.h diff -urN xnu-344.49/bsd/netinet6/ah6.h xnu-517/bsd/netinet6/ah6.h --- xnu-344.49/bsd/netinet6/ah6.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ah6.h Sat Oct 25 00:25:55 2003 @@ -42,7 +42,7 @@ #ifdef __APPLE_API_PRIVATE struct secasvar; -extern int ah6_input __P((struct mbuf **, int *, int)); +extern int ah6_input __P((struct mbuf **, int *)); extern int ah6_output __P((struct mbuf *, u_char *, struct mbuf *, struct ipsecrequest *)); extern int ah6_calccksum __P((struct mbuf *, caddr_t, size_t, diff -urN xnu-344.49/bsd/netinet6/ah_input.c xnu-517/bsd/netinet6/ah_input.c --- xnu-344.49/bsd/netinet6/ah_input.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ah_input.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/netinet6/ah_input.c,v 1.1.2.4 2001/07/03 11:01:49 ume Exp $ */ -/* $KAME: ah_input.c,v 1.59 2001/05/16 04:01:27 jinmei Exp $ */ +/* $FreeBSD: src/sys/netinet6/ah_input.c,v 1.1.2.6 2002/04/28 05:40:26 suz Exp $ */ +/* $KAME: ah_input.c,v 1.67 2002/01/07 11:39:56 kjc Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. @@ -417,14 +417,6 @@ goto fail; } -#if 0 /* XXX should we call ipfw rather than ipsec_in_reject? */ - /* drop it if it does not match the default policy */ - if (ipsec4_in_reject(m, NULL)) { - ipsecstat.in_polvio++; - goto fail; - } -#endif - #if 1 /* * Should the inner packet be considered authentic? 
@@ -505,9 +497,9 @@ goto fail; } m_adj(n, stripsiz); - m_cat(m, n); /* m_cat does not update m_pkthdr.len */ m->m_pkthdr.len += n->m_pkthdr.len; + m_cat(m, n); } #endif @@ -567,9 +559,9 @@ #if INET6 int -ah6_input(mp, offp, proto) +ah6_input(mp, offp) struct mbuf **mp; - int *offp, proto; + int *offp; { struct mbuf *m = *mp; int off = *offp; @@ -842,14 +834,6 @@ goto fail; } -#if 0 /* XXX should we call ipfw rather than ipsec_in_reject? */ - /* drop it if it does not match the default policy */ - if (ipsec6_in_reject(m, NULL)) { - ipsec6stat.in_polvio++; - goto fail; - } -#endif - #if 1 /* * should the inner packet be considered authentic? @@ -874,7 +858,7 @@ } IF_ENQUEUE(&ip6intrq, m); m = NULL; - schednetisr(NETISR_IPV6); /*can be skipped but to make sure*/ + schednetisr(NETISR_IPV6); /* can be skipped but to make sure */ splx(s); nxt = IPPROTO_DONE; } else { @@ -924,9 +908,9 @@ goto fail; } m_adj(n, stripsiz); - m_cat(m, n); /* m_cat does not update m_pkthdr.len */ m->m_pkthdr.len += n->m_pkthdr.len; + m_cat(m, n); } #endif ip6 = mtod(m, struct ip6_hdr *); @@ -975,7 +959,7 @@ struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; int off; - struct sockaddr_in6 sa6_src, sa6_dst; + struct sockaddr_in6 *sa6_src, *sa6_dst; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) @@ -1021,9 +1005,11 @@ * Check to see if we have a valid SA corresponding to * the address in the ICMP message payload. */ + sa6_src = ip6cp->ip6c_src; + sa6_dst = (struct sockaddr_in6 *)sa; sav = key_allocsa(AF_INET6, - (caddr_t)&sa6_src.sin6_addr, - (caddr_t)&sa6_dst.sin6_addr, + (caddr_t)&sa6_src->sin6_addr, + (caddr_t)&sa6_dst->sin6_addr, IPPROTO_AH, ahp->ah_spi); if (sav) { if (sav->state == SADB_SASTATE_MATURE || diff -urN xnu-344.49/bsd/netinet6/dest6.c xnu-517/bsd/netinet6/dest6.c --- xnu-344.49/bsd/netinet6/dest6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/dest6.c Sat Oct 25 00:25:55 2003 @@ -54,9 +54,9 @@ * Destination options header processing. 
*/ int -dest6_input(mp, offp, proto) +dest6_input(mp, offp) struct mbuf **mp; - int *offp, proto; + int *offp; { struct mbuf *m = *mp; int off = *offp, dstoptlen, optlen; diff -urN xnu-344.49/bsd/netinet6/esp6.h xnu-517/bsd/netinet6/esp6.h --- xnu-344.49/bsd/netinet6/esp6.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/esp6.h Sat Oct 25 00:25:55 2003 @@ -42,7 +42,7 @@ #ifdef __APPLE_API_PRIVATE extern int esp6_output __P((struct mbuf *, u_char *, struct mbuf *, struct ipsecrequest *)); -extern int esp6_input __P((struct mbuf **, int *, int)); +extern int esp6_input __P((struct mbuf **, int *)); extern void esp6_ctlinput __P((int, struct sockaddr *, void *)); #endif /* __APPLE_API_PRIVATE */ diff -urN xnu-344.49/bsd/netinet6/esp_core.c xnu-517/bsd/netinet6/esp_core.c --- xnu-344.49/bsd/netinet6/esp_core.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/esp_core.c Sat Oct 25 00:25:55 2003 @@ -1,4 +1,4 @@ -/* $FreeBSD: src/sys/netinet6/esp_core.c,v 1.1.2.2 2001/07/03 11:01:49 ume Exp $ */ +/* $FreeBSD: src/sys/netinet6/esp_core.c,v 1.1.2.4 2002/03/26 10:12:29 ume Exp $ */ /* $KAME: esp_core.c,v 1.50 2000/11/02 12:27:38 itojun Exp $ */ /* @@ -77,6 +77,11 @@ #include +#include +#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIPSEC, 1) +#define DBG_LAYER_END NETDBG_CODE(DBG_NETIPSEC, 3) +#define DBG_FNC_ESPAUTH NETDBG_CODE(DBG_NETIPSEC, (8 << 8)) + static int esp_null_mature __P((struct secasvar *)); static int esp_null_decrypt __P((struct mbuf *, size_t, struct secasvar *, const struct esp_algorithm *, int)); @@ -219,6 +224,8 @@ sav->schedlen = (*algo->schedlen)(algo); if (sav->schedlen < 0) return EINVAL; + +//#### that malloc should be replaced by a saved buffer... 
sav->sched = _MALLOC(sav->schedlen, M_SECA, M_DONTWAIT); if (!sav->sched) { sav->schedlen = 0; @@ -229,6 +236,7 @@ if (error) { ipseclog((LOG_ERR, "esp_schedule %s: error %d\n", algo->name, error)); + bzero(sav->sched, sav->schedlen); FREE(sav->sched, M_SECA); sav->sched = NULL; sav->schedlen = 0; @@ -470,13 +478,13 @@ u_int8_t *s; u_int8_t *d; { - /* HOLY COW! BF_encrypt() takes values in host byteorder */ + /* HOLY COW! BF_decrypt() takes values in host byteorder */ BF_LONG t[2]; bcopy(s, t, sizeof(t)); t[0] = ntohl(t[0]); t[1] = ntohl(t[1]); - BF_encrypt(t, (BF_KEY *)sav->sched, BF_DECRYPT); + BF_decrypt(t, (BF_KEY *)sav->sched); t[0] = htonl(t[0]); t[1] = htonl(t[1]); bcopy(t, d, sizeof(t)); @@ -496,7 +504,7 @@ bcopy(s, t, sizeof(t)); t[0] = ntohl(t[0]); t[1] = ntohl(t[1]); - BF_encrypt(t, (BF_KEY *)sav->sched, BF_ENCRYPT); + BF_encrypt(t, (BF_KEY *)sav->sched); t[0] = htonl(t[0]); t[1] = htonl(t[1]); bcopy(t, d, sizeof(t)); @@ -592,9 +600,8 @@ /* assumption: d has a good alignment */ p = (des_key_schedule *)sav->sched; bcopy(s, d, sizeof(DES_LONG) * 2); - des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, p[2], DES_DECRYPT); - des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, p[1], DES_ENCRYPT); - des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, p[0], DES_DECRYPT); + des_ecb3_encrypt((des_cblock *)d, (des_cblock *)d, + p[0], p[1], p[2], DES_DECRYPT); return 0; } @@ -610,9 +617,8 @@ /* assumption: d has a good alignment */ p = (des_key_schedule *)sav->sched; bcopy(s, d, sizeof(DES_LONG) * 2); - des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, p[0], DES_ENCRYPT); - des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, p[1], DES_DECRYPT); - des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, p[2], DES_ENCRYPT); + des_ecb3_encrypt((des_cblock *)d, (des_cblock *)d, + p[0], p[1], p[2], DES_ENCRYPT); return 0; } @@ -637,8 +643,8 @@ { struct mbuf *s; struct mbuf *d, *d0, *dp; - int soff, doff; /*offset from the head of chain, to head of this mbuf */ - int sn, dn; /*offset 
from the head of the mbuf, to meat */ + int soff, doff; /* offset from the head of chain, to head of this mbuf */ + int sn, dn; /* offset from the head of the mbuf, to meat */ size_t ivoff, bodyoff; u_int8_t iv[MAXIVLEN], *ivp; u_int8_t sbuf[MAXIVLEN], *sp; @@ -841,8 +847,8 @@ { struct mbuf *s; struct mbuf *d, *d0, *dp; - int soff, doff; /*offset from the head of chain, to head of this mbuf */ - int sn, dn; /*offset from the head of the mbuf, to meat */ + int soff, doff; /* offset from the head of chain, to head of this mbuf */ + int sn, dn; /* offset from the head of the mbuf, to meat */ size_t ivoff, bodyoff; u_int8_t iv[MAXIVLEN], *ivp; u_int8_t sbuf[MAXIVLEN], *sp; @@ -1067,16 +1073,20 @@ "esp_auth: mbuf length < skip + length\n")); return EINVAL; } + + KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_START, skip,length,0,0,0); /* * length of esp part (excluding authentication data) must be 4n, * since nexthdr must be at offset 4n+3. */ if (length % 4) { ipseclog((LOG_ERR, "esp_auth: length is not multiple of 4\n")); + KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 1,0,0,0,0); return EINVAL; } if (!sav) { ipseclog((LOG_DEBUG, "esp_auth: NULL SA passed\n")); + KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 2,0,0,0,0); return EINVAL; } algo = ah_algorithm_lookup(sav->alg_auth); @@ -1084,6 +1094,7 @@ ipseclog((LOG_ERR, "esp_auth: bad ESP auth algorithm passed: %d\n", sav->alg_auth)); + KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 3,0,0,0,0); return EINVAL; } @@ -1095,6 +1106,7 @@ ipseclog((LOG_DEBUG, "esp_auth: AH_MAXSUMSIZE is too small: siz=%lu\n", (u_long)siz)); + KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 4,0,0,0,0); return EINVAL; } @@ -1113,8 +1125,10 @@ } error = (*algo->init)(&s, sav); - if (error) + if (error) { + KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 5,0,0,0,0); return error; + } while (0 < length) { if (!m) @@ -1134,5 +1148,6 @@ (*algo->result)(&s, sumbuf); bcopy(sumbuf, sum, siz); /*XXX*/ + KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 6,0,0,0,0); return 0; 
} diff -urN xnu-344.49/bsd/netinet6/esp_input.c xnu-517/bsd/netinet6/esp_input.c --- xnu-344.49/bsd/netinet6/esp_input.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/esp_input.c Sat Oct 25 00:25:55 2003 @@ -89,6 +89,11 @@ #include +#include +#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIPSEC, 1) +#define DBG_LAYER_END NETDBG_CODE(DBG_NETIPSEC, 3) +#define DBG_FNC_ESPIN NETDBG_CODE(DBG_NETIPSEC, (6 << 8)) +#define DBG_FNC_DECRYPT NETDBG_CODE(DBG_NETIPSEC, (7 << 8)) #define IPLEN_FLIPPED #if INET @@ -116,6 +121,7 @@ size_t esplen; int s; + KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_START, 0,0,0,0,0); /* sanity check for alignment. */ if (off % 4 != 0 || m->m_pkthdr.len % 4 != 0) { ipseclog((LOG_ERR, "IPv4 ESP input: packet alignment problem " @@ -308,14 +314,17 @@ */ if (!algo->decrypt) panic("internal error: no decrypt function"); + KERNEL_DEBUG(DBG_FNC_DECRYPT | DBG_FUNC_START, 0,0,0,0,0); if ((*algo->decrypt)(m, off, sav, algo, ivlen)) { /* m is already freed */ m = NULL; ipseclog((LOG_ERR, "decrypt fail in IPv4 ESP input: %s\n", ipsec_logsastr(sav))); ipsecstat.in_inval++; + KERNEL_DEBUG(DBG_FNC_DECRYPT | DBG_FUNC_END, 1,0,0,0,0); goto bad; } + KERNEL_DEBUG(DBG_FNC_DECRYPT | DBG_FUNC_END, 2,0,0,0,0); ipsecstat.in_esphist[sav->alg_enc]++; m->m_flags |= M_DECRYPTED; @@ -378,20 +387,15 @@ goto bad; } -#if 0 /* XXX should call ipfw rather than ipsec_in_reject, shouldn't it ? 
*/ - /* drop it if it does not match the default policy */ - if (ipsec4_in_reject(m, NULL)) { - ipsecstat.in_polvio++; - goto bad; - } -#endif - key_sa_recordxfer(sav, m); if (ipsec_addhist(m, IPPROTO_ESP, spi) != 0 || ipsec_addhist(m, IPPROTO_IPV4, 0) != 0) { ipsecstat.in_nomem++; goto bad; } + + /* Clear the csum flags, they can't be valid for the inner headers */ + m->m_pkthdr.csum_flags = 0; s = splimp(); if (IF_QFULL(&ipintrq)) { @@ -404,6 +408,7 @@ schednetisr(NETISR_IP); /*can be skipped but to make sure*/ splx(s); nxt = IPPROTO_DONE; + KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_END, 2,0,0,0,0); } else { /* * strip off ESP header and IV. @@ -433,6 +438,17 @@ ipsecstat.in_nomem++; goto bad; } + + /* + * Set the csum valid flag, if we authenticated the + * packet, the payload shouldn't be corrupt unless + * it was corrupted before being signed on the other + * side. + */ + if (nxt == IPPROTO_TCP || nxt == IPPROTO_UDP) { + m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xFFFF; + } if (nxt != IPPROTO_DONE) { if ((ip_protox[nxt]->pr_flags & PR_LASTHDR) != 0 && @@ -440,6 +456,7 @@ ipsecstat.in_polvio++; goto bad; } + KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_END, 3,0,0,0,0); (*ip_protox[nxt]->pr_input)(m, off); } else m_freem(m); @@ -462,15 +479,16 @@ } if (m) m_freem(m); + KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_END, 4,0,0,0,0); return; } #endif /* INET */ #if INET6 int -esp6_input(mp, offp, proto) +esp6_input(mp, offp) struct mbuf **mp; - int *offp, proto; + int *offp; { struct mbuf *m = *mp; int off = *offp; @@ -752,14 +770,6 @@ goto bad; } -#if 0 /* XXX should call ipfw rather than ipsec_in_reject, shouldn't it ? 
*/ - /* drop it if it does not match the default policy */ - if (ipsec6_in_reject(m, NULL)) { - ipsec6stat.in_polvio++; - goto bad; - } -#endif - key_sa_recordxfer(sav, m); if (ipsec_addhist(m, IPPROTO_ESP, spi) != 0 || ipsec_addhist(m, IPPROTO_IPV6, 0) != 0) { @@ -814,9 +824,9 @@ goto bad; } m_adj(n, stripsiz); - m_cat(m, n); /* m_cat does not update m_pkthdr.len */ m->m_pkthdr.len += n->m_pkthdr.len; + m_cat(m, n); } #ifndef PULLDOWN_TEST @@ -855,10 +865,10 @@ m_freem(m); } else { m_copydata(m, 0, maxlen, mtod(n, caddr_t)); - m_adj(m, maxlen); n->m_len = maxlen; n->m_pkthdr.len = m->m_pkthdr.len; n->m_next = m; + m_adj(m, maxlen); m->m_flags &= ~M_PKTHDR; } m = n; @@ -910,7 +920,7 @@ struct ip6_hdr *ip6; struct mbuf *m; int off; - struct sockaddr_in6 sa6_src, sa6_dst; + struct sockaddr_in6 *sa6_src, *sa6_dst; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) @@ -974,10 +984,12 @@ * Check to see if we have a valid SA corresponding to * the address in the ICMP message payload. */ + sa6_src = ip6cp->ip6c_src; + sa6_dst = (struct sockaddr_in6 *)sa; sav = key_allocsa(AF_INET6, - (caddr_t)&sa6_src.sin6_addr, - (caddr_t)&sa6_dst, IPPROTO_ESP, - espp->esp_spi); + (caddr_t)&sa6_src->sin6_addr, + (caddr_t)&sa6_dst->sin6_addr, + IPPROTO_ESP, espp->esp_spi); if (sav) { if (sav->state == SADB_SASTATE_MATURE || sav->state == SADB_SASTATE_DYING) diff -urN xnu-344.49/bsd/netinet6/esp_output.c xnu-517/bsd/netinet6/esp_output.c --- xnu-344.49/bsd/netinet6/esp_output.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/esp_output.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/netinet6/esp_output.c,v 1.1.2.2 2001/07/03 11:01:50 ume Exp $ */ -/* $KAME: esp_output.c,v 1.43 2001/03/01 07:10:45 itojun Exp $ */ +/* $FreeBSD: src/sys/netinet6/esp_output.c,v 1.1.2.3 2002/04/28 05:40:26 suz Exp $ */ +/* $KAME: esp_output.c,v 1.44 2001/07/26 06:53:15 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
@@ -56,6 +56,7 @@ #include #include #include +#include /* for nat traversal */ #if INET6 #include @@ -80,9 +81,18 @@ #include +#include +#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIPSEC, 1) +#define DBG_LAYER_END NETDBG_CODE(DBG_NETIPSEC, 3) +#define DBG_FNC_ESPOUT NETDBG_CODE(DBG_NETIPSEC, (4 << 8)) +#define DBG_FNC_ENCRYPT NETDBG_CODE(DBG_NETIPSEC, (5 << 8)) + static int esp_output __P((struct mbuf *, u_char *, struct mbuf *, struct ipsecrequest *, int)); +extern int esp_udp_encap_port; +extern u_int32_t natt_now; + /* * compute ESP header size. */ @@ -96,6 +106,7 @@ size_t ivlen; size_t authlen; size_t hdrsiz; + size_t maxpad; /* sanity check */ if (isr == NULL) @@ -120,16 +131,15 @@ if (ivlen < 0) goto estimate; - /* - * XXX - * right now we don't calcurate the padding size. simply - * treat the padding size as constant, for simplicity. - * - * XXX variable size padding support - */ + if (algo->padbound) + maxpad = algo->padbound; + else + maxpad = 4; + maxpad += 1; /* maximum 'extendsiz' is padbound + 1, see esp_output */ + if (sav->flags & SADB_X_EXT_OLD) { /* RFC 1827 */ - hdrsiz = sizeof(struct esp) + ivlen + 9; + hdrsiz = sizeof(struct esp) + ivlen + maxpad; } else { /* RFC 2406 */ aalgo = ah_algorithm_lookup(sav->alg_auth); @@ -137,21 +147,28 @@ authlen = (aalgo->sumsiz)(sav); else authlen = 0; - hdrsiz = sizeof(struct newesp) + ivlen + 9 + authlen; + hdrsiz = sizeof(struct newesp) + ivlen + maxpad + authlen; } + + /* + * If the security association indicates that NATT is required, + * add the size of the NATT encapsulation header: + */ + if ((sav->flags & SADB_X_EXT_NATT) != 0) hdrsiz += sizeof(struct udphdr) + 4; return hdrsiz; estimate: /* * ASSUMING: - * sizeof(struct newesp) > sizeof(struct esp). + * sizeof(struct newesp) > sizeof(struct esp). 
(8) * esp_max_ivlen() = max ivlen for CBC mode - * 9 = (maximum padding length without random padding length) + * 17 = (maximum padding length without random padding length) * + (Pad Length field) + (Next Header field). * 16 = maximum ICV we support. + * sizeof(struct udphdr) in case NAT traversal is used */ - return sizeof(struct newesp) + esp_max_ivlen() + 9 + 16; + return sizeof(struct newesp) + esp_max_ivlen() + 17 + 16 + sizeof(struct udphdr); } /* @@ -197,7 +214,11 @@ size_t extendsiz; int error = 0; struct ipsecstat *stat; + struct udphdr *udp = NULL; + int udp_encapsulate = (sav->flags & SADB_X_EXT_NATT && af == AF_INET && + (esp_udp_encap_port & 0xFFFF) != 0); + KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_START, sav->ivlen,0,0,0,0); switch (af) { #if INET case AF_INET: @@ -213,6 +234,7 @@ #endif default: ipseclog((LOG_ERR, "esp_output: unsupported af %d\n", af)); + KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 1,0,0,0,0); return 0; /* no change at all */ } @@ -246,6 +268,7 @@ panic("esp_output: should not reach here"); } m_freem(m); + KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 2,0,0,0,0); return EINVAL; } @@ -254,6 +277,7 @@ ipseclog((LOG_ERR, "esp_output: unsupported algorithm: " "SPI=%u\n", (u_int32_t)ntohl(sav->spi))); m_freem(m); + KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 3,0,0,0,0); return EINVAL; } spi = sav->spi; @@ -276,9 +300,9 @@ #if INET6 struct ip6_hdr *ip6 = NULL; #endif - size_t esplen; /*sizeof(struct esp/newesp)*/ - size_t esphlen; /*sizeof(struct esp/newesp) + ivlen*/ - size_t hlen = 0; /*ip header len*/ + size_t esplen; /* sizeof(struct esp/newesp) */ + size_t esphlen; /* sizeof(struct esp/newesp) + ivlen */ + size_t hlen = 0; /* ip header len */ if (sav->flags & SADB_X_EXT_OLD) { /* RFC 1827 */ @@ -298,6 +322,7 @@ ipseclog((LOG_DEBUG, "esp%d_output: md is not in chain\n", afnumber)); m_freem(m); + KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 4,0,0,0,0); return EINVAL; } @@ -334,11 +359,16 @@ mprev->m_next = md; espoff = m->m_pkthdr.len - 
plen; + + if (udp_encapsulate) { + esphlen += sizeof(struct udphdr); + espoff += sizeof(struct udphdr); + } /* * grow the mbuf to accomodate ESP header. * before: IP ... payload - * after: IP ... ESP IV payload + * after: IP ... [UDP] ESP IV payload */ if (M_LEADINGSPACE(md) < esphlen || (md->m_flags & M_EXT) != 0) { MGET(n, M_DONTWAIT, MT_DATA); @@ -351,16 +381,25 @@ mprev->m_next = n; n->m_next = md; m->m_pkthdr.len += esphlen; - esp = mtod(n, struct esp *); + if (udp_encapsulate) { + udp = mtod(n, struct udphdr *); + esp = (struct esp *)((caddr_t)udp + sizeof(struct udphdr)); + } else { + esp = mtod(n, struct esp *); + } } else { md->m_len += esphlen; md->m_data -= esphlen; m->m_pkthdr.len += esphlen; esp = mtod(md, struct esp *); + if (udp_encapsulate) { + udp = mtod(md, struct udphdr *); + esp = (struct esp *)((caddr_t)udp + sizeof(struct udphdr)); + } else { + esp = mtod(md, struct esp *); + } } - nxt = *nexthdrp; - *nexthdrp = IPPROTO_ESP; switch (af) { #if INET case AF_INET: @@ -397,6 +436,7 @@ ipsec_logsastr(sav))); stat->out_inval++; m_freem(m); + KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 5,0,0,0,0); return EINVAL; } } @@ -523,6 +563,22 @@ extend[i] = (i + 1) & 0xff; break; } + + nxt = *nexthdrp; + if (udp_encapsulate) { + *nexthdrp = IPPROTO_UDP; + + /* Fill out the UDP header */ + udp->uh_sport = ntohs((u_short)esp_udp_encap_port); + udp->uh_dport = ntohs(sav->remote_ike_port); +// udp->uh_len set later, after all length tweaks are complete + udp->uh_sum = 0; + + /* Update last sent so we know if we need to send keepalive */ + sav->natt_last_activity = natt_now; + } else { + *nexthdrp = IPPROTO_ESP; + } /* initialize esp trailer. 
*/ esptail = (struct esptail *) @@ -571,13 +627,16 @@ */ if (!algo->encrypt) panic("internal error: no encrypt function"); + KERNEL_DEBUG(DBG_FNC_ENCRYPT | DBG_FUNC_START, 0,0,0,0,0); if ((*algo->encrypt)(m, espoff, plen + extendsiz, sav, algo, ivlen)) { /* m is already freed */ ipseclog((LOG_ERR, "packet encryption failure\n")); stat->out_inval++; error = EINVAL; + KERNEL_DEBUG(DBG_FNC_ENCRYPT | DBG_FUNC_END, 1,error,0,0,0); goto fail; } + KERNEL_DEBUG(DBG_FNC_ENCRYPT | DBG_FUNC_END, 2,0,0,0,0); /* * calculate ICV if required. @@ -618,7 +677,7 @@ while (n->m_next) n = n->m_next; - if (!(n->m_flags & M_EXT) && siz < M_TRAILINGSPACE(n)) { /*XXX*/ + if (!(n->m_flags & M_EXT) && siz < M_TRAILINGSPACE(n)) { /* XXX */ n->m_len += siz; m->m_pkthdr.len += siz; p = mtod(n, u_char *) + n->m_len - siz; @@ -666,6 +725,13 @@ #endif } } + + if (udp_encapsulate) { + struct ip *ip; + ip = mtod(m, struct ip *); + udp->uh_ulen = htons(ntohs(ip->ip_len) - (IP_VHL_HL(ip->ip_vhl) << 2)); + } + noantireplay: if (!m) { @@ -675,10 +741,12 @@ stat->out_success++; stat->out_esphist[sav->alg_enc]++; key_sa_recordxfer(sav, m); + KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 6,0,0,0,0); return 0; fail: #if 1 + KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 7,error,0,0,0); return error; #else panic("something bad in esp_output"); diff -urN xnu-344.49/bsd/netinet6/frag6.c xnu-517/bsd/netinet6/frag6.c --- xnu-344.49/bsd/netinet6/frag6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/frag6.c Sat Oct 25 00:25:55 2003 @@ -128,9 +128,9 @@ * Fragment input */ int -frag6_input(mp, offp, proto) +frag6_input(mp, offp) struct mbuf **mp; - int *offp, proto; + int *offp; { struct mbuf *m = *mp, *t; struct ip6_hdr *ip6; diff -urN xnu-344.49/bsd/netinet6/icmp6.c xnu-517/bsd/netinet6/icmp6.c --- xnu-344.49/bsd/netinet6/icmp6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/icmp6.c Sat Oct 25 00:25:55 2003 @@ -374,7 +374,7 @@ m->m_pkthdr.rcvif = NULL; icmp6stat.icp6s_outhist[type]++; - icmp6_reflect(m, 
sizeof(struct ip6_hdr)); /*header order: IPv6 - ICMPv6*/ + icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; @@ -389,9 +389,9 @@ * Process a received ICMP6 message. */ int -icmp6_input(mp, offp, proto) +icmp6_input(mp, offp) struct mbuf **mp; - int *offp, proto; + int *offp; { struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; @@ -402,7 +402,7 @@ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); - /* m might change if M_LOOP. So, call mtod after this */ + /* m might change if M_LOOP. So, call mtod after this */ #endif /* @@ -706,9 +706,9 @@ bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); - bcopy(hostname, p + 4, maxhlen); /*meaningless TTL*/ + bcopy(hostname, p + 4, maxhlen); /* meaningless TTL */ noff = sizeof(struct ip6_hdr); - M_COPY_PKTHDR(n, m); /* just for recvif */ + M_COPY_PKTHDR(n, m); /* just for rcvif */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; @@ -859,7 +859,7 @@ static int icmp6_notify_error(m, off, icmp6len, code) struct mbuf *m; - int off, icmp6len; + int off, icmp6len, code; { struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; @@ -899,7 +899,7 @@ struct ip6_rthdr0 *rth0; int rthlen; - while (1) { /* XXX: should avoid inf. loop explicitly? */ + while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { @@ -1013,7 +1013,7 @@ default: /* * This case includes ESP and the No Next - * Header. In such cases going to the notify + * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update @@ -1562,7 +1562,7 @@ } panic("should not reach here"); - /*NOTREACHED*/ + /* NOTREACHED */ fail: if (m) @@ -1713,7 +1713,7 @@ /* * check if anycast is okay. - * XXX: just experimental. not in the spec. + * XXX: just experimental. 
not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) @@ -2114,7 +2114,7 @@ if (ia == NULL && IN6_IS_ADDR_LINKLOCAL(&t) && (m->m_flags & M_LOOP)) { /* * This is the case if the dst is our link-local address - * and the sender is also ourseleves. + * and the sender is also ourselves. */ src = &t; } @@ -2125,7 +2125,7 @@ /* * This case matches to multicasts, our anycast, or unicasts - * that we do not own. Select a source address based on the + * that we do not own. Select a source address based on the * source address of the erroneous packet. */ bzero(&ro, sizeof(ro)); @@ -2361,7 +2361,7 @@ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); - if (!is_onlink) { /* better router case. perform rtredirect. */ + if (!is_onlink) { /* better router case. perform rtredirect. */ /* perform rtredirect */ struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; @@ -2423,14 +2423,14 @@ icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); - /* if we are not router, we don't send icmp6 redirect */ - if (!ip6_forwarding || ip6_accept_rtadv) - goto fail; - /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; + /* if we are not router, we don't send icmp6 redirect */ + if (!ip6_forwarding || ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD)) + goto fail; + /* * Address check: * the source address must identify a neighbor, and @@ -2549,7 +2549,7 @@ if (!rt_router) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; - len = (len + 7) & ~7; /*round by 8*/ + len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; @@ -2808,8 +2808,8 @@ timersub(&tv, lasttime, &delta); /* - * check for 0,0 is so that the message will be seen at least once. 
- * if more than one second have passed since the last update of + * Check for 0,0 so that the message will be seen at least once. + * If more than one second has passed since the last update of * lasttime, reset the counter. * * we do increment *curpps even in *curpps < maxpps case, as some may @@ -2827,7 +2827,7 @@ else rv = 0; -#if 1 /*DIAGNOSTIC?*/ +#if 1 /* DIAGNOSTIC? */ /* be careful about wrap-around */ if (*curpps + 1 > *curpps) *curpps = *curpps + 1; @@ -2862,7 +2862,7 @@ { int ret; - ret = 0; /*okay to send*/ + ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count, diff -urN xnu-344.49/bsd/netinet6/in6.c xnu-517/bsd/netinet6/in6.c --- xnu-344.49/bsd/netinet6/in6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6.c Sat Oct 25 00:25:55 2003 @@ -182,7 +182,6 @@ */ if (cmd == RTM_ADD && nrt && ifa != nrt->rt_ifa) { rtsetifa(nrt, ifa); - nrt->rt_dlt = ifa->ifa_dlt; } /* @@ -249,7 +248,7 @@ */ /* - * Delete the entry only if exact one ifa exists. More than one ifa + * Delete the entry only if exact one ifa exists. More than one ifa * can exist if we assign a same single address to multiple * (probably p2p) interfaces. * XXX: we should avoid such a configuration in IPv6... @@ -265,9 +264,9 @@ if (ia_count == 1) { /* * Before deleting, check if a corresponding loopbacked host - * route surely exists. With this check, we can avoid to + * route surely exists. With this check, we can avoid to * delete an interface direct route whose destination is same - * as the address being removed. This can happen when remofing + * as the address being removed. This can happen when remofing * a subnet-router anycast address on an interface attahced * to a shared medium. 
*/ @@ -398,7 +397,7 @@ case SIOCSIFINFO_FLAGS: if (!privileged) return(EPERM); - /*fall through*/ + /* fall through */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGDRLST_IN6: @@ -421,7 +420,7 @@ return(EOPNOTSUPP); } - switch(cmd) { + switch (cmd) { case SIOCSSCOPE6: if (!privileged) return(EPERM); @@ -440,7 +439,7 @@ case SIOCDLIFADDR: if (!privileged) return(EPERM); - /*fall through*/ + /* fall through */ case SIOCGLIFADDR: return in6_lifaddr_ioctl(so, cmd, data, ifp, p); } @@ -449,35 +448,92 @@ switch (cmd) { - case SIOCPROTOATTACH: - in6_if_up(ifp); - break; - case SIOCPROTODETACH: - in6_purgeif(ifp); + case SIOCAUTOCONF_START: + ifp->if_eflags |= IFEF_ACCEPT_RTADVD; + return (0); + + case SIOCAUTOCONF_STOP: + { + struct ifaddr *ifa, *nifa = NULL; + + ifp->if_eflags &= ~IFEF_ACCEPT_RTADVD; + + /* nuke prefix list. this may try to remove some of ifaddrs as well */ + in6_purgeprefix(ifp); + + /* removed autoconfigured address from interface */ + + for (ifa = TAILQ_FIRST(&ifp->if_addrlist); ifa != NULL; ifa = nifa) + { + nifa = TAILQ_NEXT(ifa, ifa_list); + if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET6) + continue; + if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_AUTOCONF) + in6_purgeaddr(ifa); + } + return (0); + } + + + case SIOCLL_START: + + /* NOTE: All the interface specific DLIL attachements should be done here + * They are currently done in in6_ifattach() for the interfaces that need it + */ + + if (ifp->if_type == IFT_PPP && ifra->ifra_addr.sin6_family == AF_INET6 && + ifra->ifra_dstaddr.sin6_family == AF_INET6) + in6_if_up(ifp, ifra); /* PPP may provide LinkLocal addresses */ + else + in6_if_up(ifp, 0); + + return(0); + + case SIOCLL_STOP: + { + struct ifaddr *ifa, *nifa = NULL; + + /* removed link local addresses from interface */ + + for (ifa = TAILQ_FIRST(&ifp->if_addrlist); ifa != NULL; ifa = nifa) + { + nifa = TAILQ_NEXT(ifa, ifa_list); + if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET6) + continue; 
+ if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) + in6_purgeaddr(ifa); + } + return (0); + } + + + case SIOCPROTOATTACH_IN6: + switch (ifp->if_type) { - case IFT_ETHER: - error = ether_detach_inet6(ifp); - break; - case IFT_GIF: - error = gif_detach_proto_family(ifp, PF_INET6); - break; - case IFT_STF: - error = stf_detach_inet6(ifp); - break; - case IFT_LOOP: /* do not detach loopback */ - break; - default: - printf("SIOCPROTODETACH: %s%d unknown type, can't detach\n", - ifp->if_name, ifp->if_unit); - return(ENOENT); +#if IFT_BRIDGE /*OpenBSD 2.8*/ + /* some of the interfaces are inherently not IPv6 capable */ + case IFT_BRIDGE: + return; +#endif + default: + + if (error = dlil_plumb_protocol(PF_INET6, ifp, &dl_tag)) + printf("SIOCPROTOATTACH_IN6: %s error=%d\n", + if_name(ifp), error); break; + } - if (error) { - printf("SIOCPROTODETACH: %s%d ether_detach_inet6 error=%x\n", - ifp->if_name, ifp->if_unit, error); - return(error); - } - break; + return (error); + + + case SIOCPROTODETACH_IN6: + + in6_purgeif(ifp); /* Cleanup interface routes and addresses */ + + if (error = dlil_unplumb_protocol(PF_INET6, ifp)) + printf("SIOCPROTODETACH_IN6: %s error=%d\n", + if_name(ifp), error); + return(error); } #endif @@ -648,10 +704,17 @@ { int i, error = 0; struct nd_prefix pr0, *pr; - + if (dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &dl_tag) == EPROTONOSUPPORT) { - in6_if_up(ifp); /* no dl_tag, the interface is not "up" for IPv6 yet */ + /* Address is added without previous IPv6 configurator support (gif, stf etc...) */ + if (error = dlil_plumb_protocol(PF_INET6, ifp, &dl_tag)) { + printf("SIOCAIFADDR_IN6: %s can't plumb protocol error=%d\n", + if_name(ifp), error); + return (error); + } + in6_if_up(ifp, NULL); } + /* * first, make or update the interface address structure, @@ -685,10 +748,11 @@ ifra->ifra_prefixmask.sin6_addr.s6_addr32[i]; } /* - * XXX: since we don't have enough APIs, we just set inifinity - * to lifetimes. 
They can be overridden by later advertised - * RAs (when accept_rtadv is non 0), but we'd rather intend - * such a behavior. + * XXX: since we don't have an API to set prefix (not address) + * lifetimes, we just use the same lifetimes as addresses. + * The (temporarily) installed lifetimes can be overridden by + * later advertised RAs (when accept_rtadv is non 0), which is + * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = @@ -745,12 +809,9 @@ * other addresses detached. */ pfxlist_onlink_check(); + in6_post_msg(ifp, KEV_INET6_NEW_USER_ADDR, ia); } - dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &dl_tag); - ia->ia_ifa.ifa_dlt = dl_tag; - - in6_post_msg(ifp, KEV_INET6_NEW_USER_ADDR, ia); break; } @@ -803,9 +864,7 @@ default: #ifdef __APPLE__ - error = dlil_ioctl(0, ifp, cmd, (caddr_t)data); - if (error == EOPNOTSUPP) - error = 0; + error = dlil_ioctl(PF_INET6, ifp, cmd, (caddr_t)data); return error; #else @@ -870,7 +929,7 @@ } else { /* - * In this case, ia must not be NULL. We just use its prefix + * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); @@ -1226,7 +1285,7 @@ struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; /* stop DAD processing */ - nd6_dad_stoptimer(ifa); + nd6_dad_stop(ifa); /* * delete route to the destination of the address being purged. 
@@ -1402,7 +1461,7 @@ /* address must be specified on GET with IFLR_PREFIX */ if ((iflr->flags & IFLR_PREFIX) == 0) break; - /*FALLTHROUGH*/ + /* FALLTHROUGH */ case SIOCALIFADDR: case SIOCDLIFADDR: /* address must be specified on ADD and DELETE */ @@ -1418,10 +1477,10 @@ if (sa->sa_len && sa->sa_len != sizeof(struct sockaddr_in6)) return EINVAL; break; - default: /*shouldn't happen*/ + default: /* shouldn't happen */ #if 0 panic("invalid cmd to in6_lifaddr_ioctl"); - /*NOTREACHED*/ + /* NOTREACHED */ #else return EOPNOTSUPP; #endif @@ -1523,7 +1582,7 @@ } else { if (cmd == SIOCGLIFADDR) { /* on getting an address, take the 1st match */ - cmp = 0; /*XXX*/ + cmp = 0; /* XXX */ } else { /* on deleting an address, do exact match */ in6_len2mask(&mask, 128); @@ -1596,7 +1655,7 @@ in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); - iflr->flags = ia->ia6_flags; /*XXX*/ + iflr->flags = ia->ia6_flags; /* XXX */ return 0; } else { @@ -1626,7 +1685,7 @@ } } - return EOPNOTSUPP; /*just for safety*/ + return EOPNOTSUPP; /* just for safety */ } /* @@ -1663,9 +1722,7 @@ if (ifacount <= 1 && #ifdef __APPLE__ - (error = dlil_ioctl(0, ifp, SIOCSIFADDR, (caddr_t)ia))) { - if (error == EOPNOTSUPP) - error = 0; + (error = dlil_ioctl(PF_INET6, ifp, SIOCSIFADDR, (caddr_t)ia))) { if (error) { splx(s); return(error); @@ -1703,7 +1760,7 @@ ia->ia_ifa.ifa_flags |= RTF_CLONING; } - /* Add ownaddr as loopback rtentry, if necessary(ex. on p2p link). */ + /* Add ownaddr as loopback rtentry, if necessary (ex. on p2p link). */ if (newhost) { /* set the rtrequest function to create llinfo */ ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; @@ -1786,7 +1843,7 @@ struct ifmultiaddr *ifma = in6m->in6m_ifma; int s = splnet(); - if (ifma->ifma_refcount == 1) { + if (ifma && ifma->ifma_refcount == 1) { /* * No remaining claims to this record; let MLD6 know * that we are leaving the multicast group. @@ -1797,7 +1854,8 @@ FREE(in6m, M_IPMADDR); } /* XXX - should be separate API for when we have an ifma? 
*/ - if_delmulti(ifma->ifma_ifp, ifma->ifma_addr); + if (ifma) + if_delmultiaddr(ifma); splx(s); } @@ -1864,8 +1922,8 @@ static char ip6buf[8][48]; int i; char *cp; - u_short *a = (u_short *)addr; - u_char *d; + const u_short *a = (const u_short *)addr; + const u_char *d; int dcolon = 0; ip6round = (ip6round + 1) & 7; @@ -1894,7 +1952,7 @@ a++; continue; } - d = (u_char *)a; + d = (const u_char *)a; *cp++ = digits[*d >> 4]; *cp++ = digits[*d++ & 0xf]; *cp++ = digits[*d >> 4]; @@ -2310,7 +2368,7 @@ int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = 0; - struct in6_ifaddr *dep[2]; /*last-resort: deprecated*/ + struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; @@ -2389,8 +2447,9 @@ * perform DAD when interface becomes IFF_UP. */ void -in6_if_up(ifp) +in6_if_up(ifp, ifra) struct ifnet *ifp; + struct in6_aliasreq *ifra; { struct ifaddr *ifa; struct in6_ifaddr *ia; @@ -2402,7 +2461,7 @@ /* * special cases, like 6to4, are handled in in6_ifattach */ - in6_ifattach(ifp, NULL); + in6_ifattach(ifp, NULL, ifra); dad_delay = 0; TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) @@ -2473,7 +2532,7 @@ } /* - * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be + * Convert sockaddr_in6 to sockaddr_in. 
Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void diff -urN xnu-344.49/bsd/netinet6/in6.h xnu-517/bsd/netinet6/in6.h --- xnu-344.49/bsd/netinet6/in6.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6.h Sat Oct 25 00:25:55 2003 @@ -418,10 +418,11 @@ #define IPV6_CHECKSUM 26 /* int; checksum offset for raw socket */ #define IPV6_V6ONLY 27 /* bool; only bind INET6 at wildcard bind */ -#ifndef _KERNEL +#ifndef KERNEL #define IPV6_BINDV6ONLY IPV6_V6ONLY #endif + #if 1 /*IPSEC*/ #define IPV6_IPSEC_POLICY 28 /* struct; get/set security policy */ #endif @@ -587,13 +588,14 @@ struct cmsghdr; struct mbuf; struct ifnet; +struct in6_aliasreq; int in6_cksum __P((struct mbuf *, u_int8_t, u_int32_t, u_int32_t)); int in6_localaddr __P((struct in6_addr *)); int in6_addrscope __P((struct in6_addr *)); struct in6_ifaddr *in6_ifawithscope __P((struct ifnet *, struct in6_addr *)); struct in6_ifaddr *in6_ifawithifp __P((struct ifnet *, struct in6_addr *)); -extern void in6_if_up __P((struct ifnet *)); +extern void in6_if_up __P((struct ifnet *, struct in6_aliasreq *)); struct sockaddr; void in6_sin6_2_sin __P((struct sockaddr_in *sin, diff -urN xnu-344.49/bsd/netinet6/in6_gif.c xnu-517/bsd/netinet6/in6_gif.c --- xnu-344.49/bsd/netinet6/in6_gif.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6_gif.c Sat Oct 25 00:25:55 2003 @@ -282,53 +282,45 @@ } /* - * we know that we are in IFF_UP, outer address available, and outer family - * matched the physical addr family. see gif_encapcheck(). + * validate outer address. 
*/ -int -gif_encapcheck6(m, off, proto, arg) - const struct mbuf *m; - int off; - int proto; - void *arg; -{ - struct ip6_hdr ip6; +static int +gif_validate6(ip6, sc, ifp) + const struct ip6_hdr *ip6; struct gif_softc *sc; + struct ifnet *ifp; +{ struct sockaddr_in6 *src, *dst; - int addrmatch; - /* sanity check done in caller */ - sc = (struct gif_softc *)arg; src = (struct sockaddr_in6 *)sc->gif_psrc; dst = (struct sockaddr_in6 *)sc->gif_pdst; - /* LINTED const cast */ - m_copydata((struct mbuf *)m, 0, sizeof(ip6), (caddr_t)&ip6); - - /* check for address match */ - addrmatch = 0; - if (IN6_ARE_ADDR_EQUAL(&src->sin6_addr, &ip6.ip6_dst)) - addrmatch |= 1; - if (IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6.ip6_src)) - addrmatch |= 2; - if (addrmatch != 3) + /* + * Check for address match. Note that the check is for an incoming + * packet. We should compare the *source* address in our configuration + * and the *destination* address of the packet, and vice versa. + */ + if (!IN6_ARE_ADDR_EQUAL(&src->sin6_addr, &ip6->ip6_dst) || + !IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6->ip6_src)) return 0; /* martian filters on outer source - done in ip6_input */ /* ingress filters on outer source */ - if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && - (m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.rcvif) { + if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && ifp) { struct sockaddr_in6 sin6; struct rtentry *rt; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); - sin6.sin6_addr = ip6.ip6_src; - /* XXX scopeid */ + sin6.sin6_addr = ip6->ip6_src; +#ifndef SCOPEDROUTING + sin6.sin6_scope_id = 0; /* XXX */ +#endif + rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); - if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) { + if (!rt || rt->rt_ifp != ifp) { #if 0 log(LOG_WARNING, "%s: packet from %s dropped " "due to ingress filter\n", if_name(&sc->gif_if), @@ -342,4 +334,30 @@ } return 128 * 2; +} + +/* + * we know that we are in IFF_UP, outer address available, 
and outer family + * matched the physical addr family. see gif_encapcheck(). + * sanity check for arg should have been done in the caller. + */ +int +gif_encapcheck6(m, off, proto, arg) + const struct mbuf *m; + int off; + int proto; + void *arg; +{ + struct ip6_hdr ip6; + struct gif_softc *sc; + struct ifnet *ifp; + + /* sanity check done in caller */ + sc = (struct gif_softc *)arg; + + /* LINTED const cast */ + m_copydata(m, 0, sizeof(ip6), (caddr_t)&ip6); + ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL; + + return gif_validate6(&ip6, sc, ifp); } diff -urN xnu-344.49/bsd/netinet6/in6_ifattach.c xnu-517/bsd/netinet6/in6_ifattach.c --- xnu-344.49/bsd/netinet6/in6_ifattach.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6_ifattach.c Sat Oct 25 00:25:55 2003 @@ -1,4 +1,5 @@ -/* $KAME: in6_ifattach.c,v 1.41 2000/03/16 07:05:34 jinmei Exp $ */ +/* $FreeBSD: src/sys/netinet6/in6_ifattach.c,v 1.8 2002/04/19 04:46:22 suz Exp $ */ +/* $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
@@ -72,9 +73,6 @@ int ip6_auto_linklocal = 1; /* enable by default */ #endif -#ifndef __APPLE__ -struct callout in6_tmpaddrtimer_ch; -#endif extern struct inpcbinfo udbinfo; extern struct inpcbinfo ripcbinfo; @@ -83,7 +81,7 @@ static int generate_tmp_ifid __P((u_int8_t *, const u_int8_t *, u_int8_t *)); static int get_hw_ifid __P((struct ifnet *, struct in6_addr *)); static int get_ifid __P((struct ifnet *, struct ifnet *, struct in6_addr *)); -static int in6_ifattach_linklocal __P((struct ifnet *, struct ifnet *)); +static int in6_ifattach_linklocal __P((struct ifnet *, struct ifnet *, struct in6_aliasreq *)); static int in6_ifattach_loopback __P((struct ifnet *)); #define EUI64_GBIT 0x01 @@ -107,7 +105,7 @@ static int get_rand_ifid(ifp, in6) struct ifnet *ifp; - struct in6_addr *in6; /*upper 64bits are preserved */ + struct in6_addr *in6; /* upper 64bits are preserved */ { MD5_CTX ctxt; u_int8_t digest[16]; @@ -158,8 +156,9 @@ val32 = random() ^ tv.tv_usec; bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } - } else + } else { bcopy(seed0, seed, 8); + } /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ @@ -229,7 +228,7 @@ static int get_hw_ifid(ifp, in6) struct ifnet *ifp; - struct in6_addr *in6; /*upper 64bits are preserved */ + struct in6_addr *in6; /* upper 64bits are preserved */ { struct ifaddr *ifa; struct sockaddr_dl *sdl; @@ -362,7 +361,7 @@ static int get_ifid(ifp0, altifp, in6) struct ifnet *ifp0; - struct ifnet *altifp; /*secondary EUI64 source*/ + struct ifnet *altifp; /* secondary EUI64 source */ struct in6_addr *in6; { struct ifnet *ifp; @@ -424,50 +423,56 @@ } static int -in6_ifattach_linklocal(ifp, altifp) +in6_ifattach_linklocal(ifp, altifp, ifra_passed) struct ifnet *ifp; struct ifnet *altifp; /* secondary EUI64 source */ + struct in6_aliasreq *ifra_passed; { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefix pr0; - int i, error; + int i, dl_tag, error; /* * configure link-local 
address. */ bzero(&ifra, sizeof(ifra)); + dlil_plumb_protocol(PF_INET6, ifp, &dl_tag); + /* * in6_update_ifa() does not use ifra_name, but we accurately set it * for safety. */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); - ifra.ifra_addr.sin6_family = AF_INET6; - ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); - ifra.ifra_addr.sin6_addr.s6_addr16[0] = htons(0xfe80); + if (ifp->if_type == IFT_PPP && ifra_passed != NULL) /* PPP provided both addresses for us */ + bcopy(&ifra_passed->ifra_addr, &(ifra.ifra_addr), sizeof(struct sockaddr_in6)); + else { + ifra.ifra_addr.sin6_family = AF_INET6; + ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); + ifra.ifra_addr.sin6_addr.s6_addr16[0] = htons(0xfe80); #if SCOPEDROUTING - ifra.ifra_addr.sin6_addr.s6_addr16[1] = 0 + ifra.ifra_addr.sin6_addr.s6_addr16[1] = 0 #else - ifra.ifra_addr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); /* XXX */ + ifra.ifra_addr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); /* XXX */ #endif - ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; - if ((ifp->if_flags & IFF_LOOPBACK) != 0) { - ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; - ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); - } else { - if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { - nd6log((LOG_ERR, - "%s: no ifid available\n", if_name(ifp))); - return -1; + ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; + if ((ifp->if_flags & IFF_LOOPBACK) != 0) { + ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; + ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); + } else { + if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { + nd6log((LOG_ERR, + " %s: no ifid available\n", if_name(ifp))); + return -1; + } } - } #if SCOPEDROUTING - ifra.ifra_addr.sin6_scope_id = - in6_addr2scopeid(ifp, &ifra.ifra_addr.sin6_addr); + ifra.ifra_addr.sin6_scope_id = + in6_addr2scopeid(ifp, &ifra.ifra_addr.sin6_addr); #endif - + } ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; 
ifra.ifra_prefixmask.sin6_addr = in6mask64; @@ -481,7 +486,7 @@ /* * Do not let in6_update_ifa() do DAD, since we need a random delay - * before sending an NS at the first time the inteface becomes up. + * before sending an NS at the first time the interface becomes up. * Instead, in6_if_up() will start DAD with a proper random delay. */ ifra.ifra_flags |= IN6_IFF_NODAD; @@ -489,7 +494,8 @@ /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set NULL to the 3rd argument, because - * we know there's no other link-local address on the interface. + * we know there's no other link-local address on the interface + * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL)) != 0) { /* @@ -600,15 +606,15 @@ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; - /* we don't need to perfrom DAD on loopback interfaces. */ + /* we don't need to perform DAD on loopback interfaces. */ ifra.ifra_flags |= IN6_IFF_NODAD; /* skip registration to the prefix list. XXX should be temporary. */ ifra.ifra_flags |= IN6_IFF_NOPFX; /* - * We can set NULL to the 3rd arg. See comments in - * in6_ifattach_linklocal(). + * We are sure that this is a newly assigned address, so we can set + * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL)) != 0) { log(LOG_ERR, "in6_ifattach_loopback: failed to configure " @@ -647,7 +653,7 @@ while (p && *p && *p != '.' 
&& p - name < namelen) p++; if (p - name > sizeof(n) - 1) - return -1; /*label too long*/ + return -1; /* label too long */ l = p - name; strncpy(n, name, l); n[(int)l] = '\0'; @@ -734,40 +740,15 @@ * XXX multiple link-local address case */ void -in6_ifattach(ifp, altifp) +in6_ifattach(ifp, altifp, ifra) struct ifnet *ifp; struct ifnet *altifp; /* secondary EUI64 source */ + struct in6_aliasreq *ifra; { static size_t if_indexlim = 8; struct in6_ifaddr *ia; struct in6_addr in6; - u_long dl_tag; - switch (ifp->if_type) { -#if IFT_BRIDGE /*OpenBSD 2.8*/ - /* some of the interfaces are inherently not IPv6 capable */ - case IFT_BRIDGE: - return; -#endif -#ifdef __APPLE__ - case IFT_ETHER: - dl_tag = ether_attach_inet6(ifp); - break; - - case IFT_LOOP: - dl_tag = lo_attach_inet6(ifp); -#if NGIF > 0 - case IFT_GIF: - dl_tag = gif_attach_proto_family(ifp, PF_INET6); - break; -#endif -#if NSTF > 0 - case IFT_STF: - dl_tag = stf_attach_inet6(ifp); - break; -#endif -#endif - } /* * We have some arrays that should be indexed by if_index. @@ -810,6 +791,9 @@ icmp6_ifstatmax = if_indexlim; } + /* initialize NDP variables */ + nd6_ifattach(ifp); + /* initialize scope identifiers */ scope6_ifattach(ifp); @@ -820,8 +804,10 @@ #if IFT_STF case IFT_STF: /* - * 6to4 interface is a very speical kind of beast. - * no multicast, no linklocal (based on 03 draft). + * 6to4 interface is a very special kind of beast. + * no multicast, no linklocal. RFC2529 specifies how to make + * linklocals for 6to4 interface, but there's no use and + * it is rather harmful to have one. */ goto statinit; #endif @@ -839,9 +825,6 @@ return; } - /* initialize NDP variables */ - nd6_ifattach(ifp); - /* * assign loopback address for loopback interface. * XXX multiple loopback interface case. 
@@ -860,9 +843,12 @@ if (ip6_auto_linklocal) { ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { - if (in6_ifattach_linklocal(ifp, altifp) == 0) { + if (in6_ifattach_linklocal(ifp, altifp, ifra) == 0) { /* linklocal address assigned */ } else { + log(LOG_INFO, "in6_ifattach: " + "%s failed to attach a linklocal address.\n", + if_name(ifp)); /* failed to assign linklocal address. bark? */ } } @@ -886,7 +872,6 @@ _MALLOC(sizeof(struct icmp6_ifstat), M_IFADDR, M_WAITOK); bzero(icmp6_ifstat[ifp->if_index], sizeof(struct icmp6_ifstat)); } - } /* @@ -969,6 +954,12 @@ IFAFREE(&oia->ia_ifa); } +#ifndef __APPLE__ + +/* This is a cause for reentrency, as those multicast addresses are + * freed both from the interface detaching and triggered by the closing of the socket + * Let the socket do the cleanup and not force it from the interface level + */ /* leave from all multicast groups joined */ in6_pcbpurgeif0(LIST_FIRST(udbinfo.listhead), ifp); in6_pcbpurgeif0(LIST_FIRST(ripcbinfo.listhead), ifp); @@ -979,6 +970,7 @@ in6_delmulti(in6m); in6m = NULL; } +#endif /* __APPLE__ */ /* * remove neighbor management table. 
we call it twice just to make diff -urN xnu-344.49/bsd/netinet6/in6_ifattach.h xnu-517/bsd/netinet6/in6_ifattach.h --- xnu-344.49/bsd/netinet6/in6_ifattach.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6_ifattach.h Sat Oct 25 00:25:55 2003 @@ -37,7 +37,7 @@ #ifdef __APPLE_API_PRIVATE void in6_nigroup_attach __P((const char *, int)); void in6_nigroup_detach __P((const char *, int)); -void in6_ifattach __P((struct ifnet *, struct ifnet *)); +void in6_ifattach __P((struct ifnet *, struct ifnet *, struct in6_aliasreq *)); void in6_ifdetach __P((struct ifnet *)); void in6_get_tmpifid __P((struct ifnet *, u_int8_t *, const u_int8_t *, int)); void in6_tmpaddrtimer __P((void *)); diff -urN xnu-344.49/bsd/netinet6/in6_pcb.c xnu-517/bsd/netinet6/in6_pcb.c --- xnu-344.49/bsd/netinet6/in6_pcb.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6_pcb.c Sat Oct 25 00:25:55 2003 @@ -342,7 +342,8 @@ int error; /* - * Call inner routine, to assign local interface address. + * Call inner routine, to assign local interface address. + * in6_pcbladdr() may automatically fill in sin6_scope_id. 
*/ if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) return(error); @@ -618,12 +619,56 @@ /* Check and free IPv4 related resources in case of mapped addr */ if (inp->inp_options) (void)m_free(inp->inp_options); - ip_freemoptions(inp->inp_moptions); + ip_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; zfree(ipi->ipi_zone, inp); } +struct sockaddr * +in6_sockaddr(port, addr_p) + in_port_t port; + struct in6_addr *addr_p; +{ + struct sockaddr_in6 *sin6; + + MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, M_SONAME, M_WAITOK); + bzero(sin6, sizeof *sin6); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_port = port; + sin6->sin6_addr = *addr_p; + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) + sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]); + else + sin6->sin6_scope_id = 0; /*XXX*/ + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) + sin6->sin6_addr.s6_addr16[1] = 0; + + return (struct sockaddr *)sin6; +} + +struct sockaddr * +in6_v4mapsin6_sockaddr(port, addr_p) + in_port_t port; + struct in_addr *addr_p; +{ + struct sockaddr_in sin; + struct sockaddr_in6 *sin6_p; + + bzero(&sin, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + sin.sin_port = port; + sin.sin_addr = *addr_p; + + MALLOC(sin6_p, struct sockaddr_in6 *, sizeof *sin6_p, M_SONAME, + M_WAITOK); + in6_sin_2_v4mapsin6(&sin, sin6_p); + + return (struct sockaddr *)sin6_p; +} + /* * The calling convention of in6_setsockaddr() and in6_setpeeraddr() was * modified to match the pru_sockaddr() and pru_peeraddr() entry points @@ -641,34 +686,20 @@ { int s; register struct inpcb *inp; - register struct sockaddr_in6 *sin6; - - /* - * Do the malloc first in case it blocks. 
- */ - MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, M_SONAME, M_WAITOK); - bzero(sin6, sizeof *sin6); - sin6->sin6_family = AF_INET6; - sin6->sin6_len = sizeof(*sin6); + struct in6_addr addr; + in_port_t port; s = splnet(); inp = sotoinpcb(so); if (!inp) { splx(s); - _FREE(sin6, M_SONAME); return EINVAL; } - sin6->sin6_port = inp->inp_lport; - sin6->sin6_addr = inp->in6p_laddr; + port = inp->inp_lport; + addr = inp->in6p_laddr; splx(s); - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]); - else - sin6->sin6_scope_id = 0; /*XXX*/ - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - sin6->sin6_addr.s6_addr16[1] = 0; - *nam = (struct sockaddr *)sin6; + *nam = in6_sockaddr(port, &addr); return 0; } @@ -679,34 +710,20 @@ { int s; struct inpcb *inp; - register struct sockaddr_in6 *sin6; - - /* - * Do the malloc first in case it blocks. - */ - MALLOC(sin6, struct sockaddr_in6 *, sizeof(*sin6), M_SONAME, M_WAITOK); - bzero((caddr_t)sin6, sizeof (*sin6)); - sin6->sin6_family = AF_INET6; - sin6->sin6_len = sizeof(struct sockaddr_in6); + struct in6_addr addr; + in_port_t port; s = splnet(); inp = sotoinpcb(so); if (!inp) { splx(s); - _FREE(sin6, M_SONAME); return EINVAL; } - sin6->sin6_port = inp->inp_fport; - sin6->sin6_addr = inp->in6p_faddr; + port = inp->inp_fport; + addr = inp->in6p_faddr; splx(s); - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]); - else - sin6->sin6_scope_id = 0; /*XXX*/ - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - sin6->sin6_addr.s6_addr16[1] = 0; - *nam = (struct sockaddr *)sin6; + *nam = in6_sockaddr(port, &addr); return 0; } @@ -723,6 +740,7 @@ if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else + /* scope issues will be handled in in6_setsockaddr(). 
*/ error = in6_setsockaddr(so, nam); return error; @@ -741,6 +759,7 @@ if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else + /* scope issues will be handled in in6_setpeeraddr(). */ error = in6_setpeeraddr(so, nam); return error; @@ -760,9 +779,11 @@ void in6_pcbnotify(head, dst, fport_arg, src, lport_arg, cmd, notify) struct inpcbhead *head; - struct sockaddr *dst, *src; + struct sockaddr *dst; + const struct sockaddr *src; u_int fport_arg, lport_arg; int cmd; +// struct inpcb *(*notify) __P((struct inpcb *, int)); void (*notify) __P((struct inpcb *, int)); { struct inpcb *inp, *ninp; @@ -781,7 +802,7 @@ /* * note that src can be NULL when we get notify by local fragmentation. */ - sa6_src = (src == NULL) ? sa6_any : *(struct sockaddr_in6 *)src; + sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; flowinfo = sa6_src.sin6_flowinfo; /* diff -urN xnu-344.49/bsd/netinet6/in6_pcb.h xnu-517/bsd/netinet6/in6_pcb.h --- xnu-344.49/bsd/netinet6/in6_pcb.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6_pcb.h Sat Oct 25 00:25:55 2003 @@ -90,9 +90,14 @@ struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *)); void in6_pcbnotify __P((struct inpcbhead *, struct sockaddr *, - u_int, struct sockaddr *, u_int, int, + u_int, const struct sockaddr *, u_int, int, void (*)(struct inpcb *, int))); -void in6_rtchange __P((struct inpcb *, int)); +void + in6_rtchange __P((struct inpcb *, int)); +struct sockaddr * + in6_sockaddr __P((in_port_t port, struct in6_addr *addr_p)); +struct sockaddr * + in6_v4mapsin6_sockaddr __P((in_port_t port, struct in_addr *addr_p)); int in6_setpeeraddr __P((struct socket *so, struct sockaddr **nam)); int in6_setsockaddr __P((struct socket *so, struct sockaddr **nam)); int in6_mapped_sockaddr __P((struct socket *so, struct sockaddr **nam)); diff -urN xnu-344.49/bsd/netinet6/in6_proto.c xnu-517/bsd/netinet6/in6_proto.c --- xnu-344.49/bsd/netinet6/in6_proto.c Thu Sep 18 21:01:31 2003 +++ 
xnu-517/bsd/netinet6/in6_proto.c Sat Oct 25 00:25:55 2003 @@ -1,4 +1,4 @@ -/* $FreeBSD: src/sys/netinet6/in6_proto.c,v 1.6.2.7 2001/07/24 19:10:18 brooks Exp $ */ +/* $FreeBSD: src/sys/netinet6/in6_proto.c,v 1.19 2002/10/16 02:25:05 sam Exp $ */ /* $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /* @@ -141,6 +141,8 @@ extern int in6_inithead __P((void **, int)); void in6_dinit(void); +static int rip6_pr_output(struct mbuf *m, struct socket *so, struct sockaddr_in6 *, struct mbuf *); + struct ip6protosw inet6sw[] = { { 0, &inet6domain, IPPROTO_IPV6, 0, 0, 0, 0, 0, @@ -165,13 +167,13 @@ 0, &tcp6_usrreqs, }, { SOCK_RAW, &inet6domain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR, - rip6_input, rip6_output, rip6_ctlinput, rip6_ctloutput, + rip6_input, rip6_pr_output, rip6_ctlinput, rip6_ctloutput, 0, 0, 0, 0, 0, 0, &rip6_usrreqs }, { SOCK_RAW, &inet6domain, IPPROTO_ICMPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - icmp6_input, rip6_output, rip6_ctlinput, rip6_ctloutput, + icmp6_input, rip6_pr_output, rip6_ctlinput, rip6_ctloutput, 0, icmp6_init, icmp6_fasttimo, 0, 0, 0, &rip6_usrreqs @@ -220,27 +222,27 @@ #endif /* IPSEC */ #if INET { SOCK_RAW, &inet6domain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - encap6_input, rip6_output, 0, rip6_ctloutput, + encap6_input, rip6_pr_output, 0, rip6_ctloutput, 0, encap_init, 0, 0, 0, 0, &rip6_usrreqs }, #endif /*INET*/ { SOCK_RAW, &inet6domain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - encap6_input, rip6_output, 0, rip6_ctloutput, + encap6_input, rip6_pr_output, 0, rip6_ctloutput, 0, encap_init, 0, 0, 0, 0, &rip6_usrreqs }, { SOCK_RAW, &inet6domain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - pim6_input, rip6_output, 0, rip6_ctloutput, + pim6_input, rip6_pr_output, 0, rip6_ctloutput, 0, 0, 0, 0, 0, 0, &rip6_usrreqs }, /* raw wildcard */ { SOCK_RAW, &inet6domain, 0, PR_ATOMIC|PR_ADDR, - rip6_input, rip6_output, 0, rip6_ctloutput, + rip6_input, rip6_pr_output, 0, rip6_ctloutput, 0, 0, 0, 0, 0, 0, &rip6_usrreqs @@ -252,7 +254,7 @@ struct 
domain inet6domain = { AF_INET6, "internet6", in6_dinit, 0, 0, - inet6sw, 0, + (struct protosw *)inet6sw, 0, in6_inithead, offsetof(struct sockaddr_in6, sin6_addr) << 3, sizeof(struct sockaddr_in6) , sizeof(struct sockaddr_in6), 0 }; @@ -278,6 +280,11 @@ } } +int rip6_pr_output(struct mbuf *m, struct socket *so, struct sockaddr_in6 *sin6, struct mbuf *m1) +{ + panic("rip6_pr_output\n"); + return 0; +} /* * Internet configuration info @@ -298,8 +305,7 @@ int ip6_sendredirects = IPV6_SENDREDIRECTS; int ip6_defhlim = IPV6_DEFHLIM; int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; -//int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ -int ip6_accept_rtadv = 1; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ +int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */ int ip6_log_interval = 5; int ip6_hdrnestlimit = 50; /* appropriate? */ @@ -310,10 +316,7 @@ int ip6_use_deprecated = 1; /* allow deprecated addr (RFC2462 5.5.4) */ int ip6_rr_prune = 5; /* router renumbering prefix * walk list every 5 sec. 
*/ -int ip6_v6only = 0; -#ifdef __APPLE__ -int ip6_auto_on = 1; /* Start IPv6 per interface triggered by IPv4 address assignment */ -#endif +int ip6_v6only = 0; /* Mapped addresses on by default - Radar 3347718 */ u_int32_t ip6_id = 0UL; int ip6_keepfaith = 0; @@ -342,7 +345,7 @@ int icmp6_rediraccept = 1; /* accept and process redirects */ int icmp6_redirtimeout = 10 * 60; /* 10 minutes */ int icmp6errppslim = 100; /* 100pps */ -int icmp6_nodeinfo = 1; /* enable/disable NI response */ +int icmp6_nodeinfo = 3; /* enable/disable NI response */ /* UDP on IP6 parameters */ int udp6_sendspace = 9216; /* really max datagram size */ @@ -475,6 +478,3 @@ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_RW, &nd6_debug, 0, ""); -#ifdef __APPLE__ -SYSCTL_INT(_net_inet6_ip6, OID_AUTO, auto_on, CTLFLAG_RW, &ip6_auto_on,0, ""); -#endif diff -urN xnu-344.49/bsd/netinet6/in6_src.c xnu-517/bsd/netinet6/in6_src.c --- xnu-344.49/bsd/netinet6/in6_src.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6_src.c Sat Oct 25 00:25:55 2003 @@ -237,7 +237,10 @@ */ if (ro) { if (ro->ro_rt && - !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst)) { + (!(ro->ro_rt->rt_flags & RTF_UP) || + satosin6(&ro->ro_dst)->sin6_family != AF_INET6 || + !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, + dst))) { rtfree(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } diff -urN xnu-344.49/bsd/netinet6/in6_var.h xnu-517/bsd/netinet6/in6_var.h --- xnu-344.49/bsd/netinet6/in6_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/in6_var.h Sat Oct 25 00:25:55 2003 @@ -85,7 +85,7 @@ * hour rule for hosts). they should never be modified by nd6_timeout or * anywhere else. 
* userland -> kernel: accept pltime/vltime - * kernel -> userland: throuw up everything + * kernel -> userland: throw up everything * in kernel: modify preferred/expire only */ struct in6_addrlifetime { @@ -467,6 +467,20 @@ #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ +#ifdef KERNEL_PRIVATE +/* + * temporary control calls to attach/detach IP to/from an ethernet interface + */ +#define SIOCPROTOATTACH_IN6 _IOWR('i', 110, struct in6_aliasreq) /* attach proto to interface */ +#define SIOCPROTODETACH_IN6 _IOWR('i', 111, struct in6_ifreq) /* detach proto from interface */ + +#define SIOCLL_START _IOWR('i', 130, struct in6_aliasreq) /* start aquiring linklocal on interface */ +#define SIOCLL_STOP _IOWR('i', 131, struct in6_ifreq) /* deconfigure linklocal from interface */ +#define SIOCAUTOCONF_START _IOWR('i', 132, struct in6_ifreq) /* accept rtadvd on this interface */ +#define SIOCAUTOCONF_STOP _IOWR('i', 133, struct in6_ifreq) /* stop accepting rtadv for this interface */ +#endif KERNEL_PRIVATE + + #define IN6_IFF_ANYCAST 0x01 /* anycast address */ #define IN6_IFF_TENTATIVE 0x02 /* tentative address */ #define IN6_IFF_DUPLICATED 0x04 /* DAD detected duplicate */ @@ -564,7 +578,7 @@ /* * Structure used by macros below to remember position when stepping through - * all of eht in6_multi records. + * all of the in6_multi records. 
*/ struct in6_multistep { struct in6_ifaddr *i_ia; diff -urN xnu-344.49/bsd/netinet6/ip6_forward.c xnu-517/bsd/netinet6/ip6_forward.c --- xnu-344.49/bsd/netinet6/ip6_forward.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ip6_forward.c Sat Oct 25 00:25:55 2003 @@ -1,4 +1,4 @@ -/* $FreeBSD: src/sys/netinet6/ip6_forward.c,v 1.4.2.4 2001/07/03 11:01:53 ume Exp $ */ +/* $FreeBSD: src/sys/netinet6/ip6_forward.c,v 1.16 2002/10/16 02:25:05 sam Exp $ */ /* $KAME: ip6_forward.c,v 1.69 2001/05/17 03:48:30 itojun Exp $ */ /* @@ -268,7 +268,7 @@ break; default: printf("ip6_output (ipsec): error code %d\n", error); - /*fall through*/ + /* fall through */ case ENOENT: /* don't show these error codes to the user */ break; @@ -344,7 +344,7 @@ * for the reason that the destination is beyond the scope of the * source address, discard the packet and return an icmp6 destination * unreachable error with Code 2 (beyond scope of source address). - * [draft-ietf-ipngwg-icmp-v3-00.txt, Section 3.1] + * [draft-ietf-ipngwg-icmp-v3-02.txt, Section 3.1] */ if (in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_src) != in6_addr2scopeid(rt->rt_ifp, &ip6->ip6_src)) { @@ -380,7 +380,7 @@ #endif mtu = rt->rt_ifp->if_mtu; -#if IPSEC_IPV6FWD +#if IPSEC /* * When we do IPsec tunnel ingress, we need to play * with if_mtu value (decrement IPsec header size @@ -482,11 +482,11 @@ #endif { printf("ip6_forward: outgoing interface is loopback. " - "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", - ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst), - ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), - if_name(rt->rt_ifp)); + "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", + ip6_sprintf(&ip6->ip6_src), + ip6_sprintf(&ip6->ip6_dst), + ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), + if_name(rt->rt_ifp)); } /* we can just use rcvif in forwarding. 
*/ diff -urN xnu-344.49/bsd/netinet6/ip6_fw.h xnu-517/bsd/netinet6/ip6_fw.h --- xnu-344.49/bsd/netinet6/ip6_fw.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ip6_fw.h Sat Oct 25 00:25:55 2003 @@ -42,9 +42,8 @@ #include #include -#ifdef __APPLE_API_PRIVATE -#define IP6_FW_CURRENT_API_VERSION 20 /* Version of this API */ +#define IPV6_FW_CURRENT_API_VERSION 20 /* Version of this API */ /* @@ -212,6 +211,7 @@ * Main firewall chains definitions and global var's definitions. */ #ifdef KERNEL +#ifdef __APPLE_API_PRIVATE #define M_IP6FW M_IPFW @@ -223,14 +223,15 @@ /* Firewall hooks */ struct ip6_hdr; +struct sockopt; typedef int ip6_fw_chk_t __P((struct ip6_hdr**, struct ifnet*, u_short *, struct mbuf**)); -typedef int ip6_fw_ctl_t __P((int, struct mbuf**)); +typedef int ip6_fw_ctl_t __P((struct sockopt *)); extern ip6_fw_chk_t *ip6_fw_chk_ptr; extern ip6_fw_ctl_t *ip6_fw_ctl_ptr; extern int ip6_fw_enable; -#endif /* KERNEL */ #endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ #endif /* _IP6_FW_H */ diff -urN xnu-344.49/bsd/netinet6/ip6_input.c xnu-517/bsd/netinet6/ip6_input.c --- xnu-344.49/bsd/netinet6/ip6_input.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ip6_input.c Sat Oct 25 00:25:55 2003 @@ -199,6 +199,7 @@ #endif nd6_init(); frag6_init(); + icmp6_init(); /* * in many cases, random() here does NOT return random number * as initialization during bootstrap time occur in fixed order. @@ -207,7 +208,7 @@ ip6_flow_seq = random() ^ tv.tv_usec; microtime(&tv); ip6_desync_factor = (random() ^ tv.tv_usec) % MAX_TEMP_DESYNC_FACTOR; - timeout(ip6_init2, (caddr_t)0, 1 * hz); + timeout(ip6_init2, (caddr_t)0, 2 * hz); } static void @@ -222,7 +223,7 @@ * to route local address of p2p link to loopback, * assign loopback address first. 
*/ - in6_ifattach(&loif[0], NULL); + in6_ifattach(&loif[0], NULL, NULL); #ifdef __APPLE__ /* nd6_timer_init */ @@ -323,7 +324,7 @@ ip6_delaux(m); /* - * mbuf statistics by kazu + * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) @@ -334,7 +335,7 @@ #define M2MMAX (sizeof(ip6stat.ip6s_m2m)/sizeof(ip6stat.ip6s_m2m[0])) if (m->m_next) { if (m->m_flags & M_LOOP) { - ip6stat.ip6s_m2m[loif[0].if_index]++; /*XXX*/ + ip6stat.ip6s_m2m[loif[0].if_index]++; /* XXX */ } else if (m->m_pkthdr.rcvif->if_index < M2MMAX) ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; else @@ -366,7 +367,7 @@ n = NULL; } } - if (!n) { + if (n == NULL) { m_freem(m); return; /*ENOBUFS*/ } @@ -433,6 +434,7 @@ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } + /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack @@ -660,7 +662,7 @@ && ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) { /* XXX do we need more sanity checks? */ ours = 1; - deliverifp = ip6_forward_rt.ro_rt->rt_ifp; /*faith*/ + deliverifp = ip6_forward_rt.ro_rt->rt_ifp; /* faith */ goto hbhcheck; } } @@ -718,7 +720,7 @@ ip6 = mtod(m, struct ip6_hdr *); /* - * if the payload length field is 0 and the next header field + * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. 
*/ @@ -1652,8 +1654,10 @@ } } else { n = m_aux_add(m, AF_INET6, -1); - n->m_len = sizeof(struct ip6aux); - bzero(mtod(n, caddr_t), n->m_len); + if (n) { + n->m_len = sizeof(struct ip6aux); + bzero(mtod(n, caddr_t), n->m_len); + } } return n; } diff -urN xnu-344.49/bsd/netinet6/ip6_mroute.c xnu-517/bsd/netinet6/ip6_mroute.c --- xnu-344.49/bsd/netinet6/ip6_mroute.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ip6_mroute.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/netinet6/ip6_mroute.c,v 1.2.2.4 2001/07/03 11:01:53 ume Exp $ */ -/* $KAME: ip6_mroute.c,v 1.46 2001/04/04 05:17:30 itojun Exp $ */ +/* $FreeBSD: src/sys/netinet6/ip6_mroute.c,v 1.16.2.1 2002/12/18 21:39:40 suz Exp $ */ +/* $KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $ */ /* * Copyright (C) 1998 WIDE Project. @@ -100,7 +100,7 @@ #define RTE_FOUND 0x2 struct mf6c *mf6ctable[MF6CTBLSIZ]; -u_char nexpire[MF6CTBLSIZ]; +u_char n6expire[MF6CTBLSIZ]; static struct mif6 mif6table[MAXMIFS]; #if MRT6DEBUG u_int mrt6debug = 0; /* debug level */ @@ -144,11 +144,6 @@ static mifi_t reg_mif_num = (mifi_t)-1; static struct pim6stat pim6stat; - -/* - * one-back cache used by ipip_input to locate a tunnel's mif - * given a datagram's src ip address. 
- */ static int pim6; /* @@ -414,7 +409,7 @@ ip6_mrouter_ver = cmd; bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); - bzero((caddr_t)nexpire, sizeof(nexpire)); + bzero((caddr_t)n6expire, sizeof(n6expire)); pim6 = 0;/* used for stubbing out/in pim stuff */ @@ -685,7 +680,8 @@ if (rt) { #if MRT6DEBUG if (mrt6debug & DEBUG_MFC) - log(LOG_DEBUG,"add_m6fc update o %s g %s p %x\n", + log(LOG_DEBUG, + "add_m6fc no upcall h %d o %s g %s p %x\n", ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); @@ -738,7 +734,7 @@ rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; /* Don't clean this guy up */ - nexpire[hash]--; + n6expire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mf6c_stall; rte != NULL; ) { @@ -785,7 +781,7 @@ rt->mf6c_wrong_if = 0; if (rt->mf6c_expire) - nexpire[hash]--; + n6expire[hash]--; rt->mf6c_expire = 0; } } @@ -1149,7 +1145,7 @@ rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst; rt->mf6c_expire = UPCALL_EXPIRE; - nexpire[hash]++; + n6expire[hash]++; rt->mf6c_parent = MF6C_INCOMPLETE_PARENT; /* link into table */ @@ -1217,7 +1213,7 @@ s = splnet(); for (i = 0; i < MF6CTBLSIZ; i++) { - if (nexpire[i] == 0) + if (n6expire[i] == 0) continue; nptr = &mf6ctable[i]; while ((mfc = *nptr) != NULL) { @@ -1247,7 +1243,7 @@ rte = n; } while (rte != NULL); mrt6stat.mrt6s_cache_cleanups++; - nexpire[i]--; + n6expire[i]--; *nptr = mfc->mf6c_next; FREE(mfc, M_MRTABLE); @@ -1283,7 +1279,7 @@ /* * Macro to send packet on mif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is - * seperate. + * separate. */ #define MC6_SEND(ip6, mifp, m) do { \ @@ -1510,7 +1506,7 @@ * Put the packet into the sending queue of the outgoing interface * if it would fit in the MTU of the interface. 
*/ - if (mb_copy->m_pkthdr.len < ifp->if_mtu || ifp->if_mtu < IPV6_MMTU) { + if (mb_copy->m_pkthdr.len <= ifp->if_mtu || ifp->if_mtu < IPV6_MMTU) { dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; @@ -1617,7 +1613,7 @@ #if MRT6DEBUG if (mrt6debug) log(LOG_WARNING, - "register_send: ip_mrouter socket queue full\n"); + "register_send: ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return ENOBUFS; diff -urN xnu-344.49/bsd/netinet6/ip6_output.c xnu-517/bsd/netinet6/ip6_output.c --- xnu-344.49/bsd/netinet6/ip6_output.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ip6_output.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/netinet6/ip6_output.c,v 1.13.2.10 2001/07/15 18:18:34 ume Exp $ */ -/* $KAME: ip6_output.c,v 1.180 2001/05/21 05:37:50 jinmei Exp $ */ +/* $FreeBSD: src/sys/netinet6/ip6_output.c,v 1.43 2002/10/31 19:45:48 ume Exp $ */ +/* $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
@@ -82,6 +82,7 @@ #include #include +#include #include #include #include @@ -108,6 +109,8 @@ static u_long lo_dl_tag = 0; +extern u_long route_generation; + struct ip6_exthdrs { struct mbuf *ip6e_ip6; @@ -119,7 +122,7 @@ static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *sopt)); -static int ip6_setmoptions __P((int, struct ip6_moptions **, struct mbuf *)); +static int ip6_setmoptions __P((int, struct inpcb *, struct mbuf *)); static int ip6_getmoptions __P((int, struct ip6_moptions *, struct mbuf **)); static int ip6_copyexthdr __P((struct mbuf **, caddr_t, int)); static int ip6_insertfraghdr __P((struct mbuf *, struct mbuf *, int, @@ -127,6 +130,10 @@ static int ip6_insert_jumboopt __P((struct ip6_exthdrs *, u_int32_t)); static int ip6_splithdr __P((struct mbuf *, struct ip6_exthdrs *)); +extern int ip_createmoptions(struct ip_moptions **imop); +extern int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); +extern int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); + /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). @@ -314,7 +321,8 @@ /* * we treat dest2 specially. this makes IPsec processing - * much easier. + * much easier. the goal here is to make mprev point the + * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. @@ -392,7 +400,7 @@ break; default: printf("ip6_output (ipsec): error code %d\n", error); - /*fall through*/ + /* fall through */ case ENOENT: /* don't show these error codes to the user */ error = 0; @@ -468,7 +476,9 @@ * and is still up. If not, free it and try again. 
*/ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - !IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6->ip6_dst))) { + dst->sin6_family != AF_INET6 || + !IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6->ip6_dst) || + ro->ro_rt->generation_id != route_generation)) { rtfree(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } @@ -521,7 +531,7 @@ break; default: printf("ip6_output (ipsec): error code %d\n", error); - /*fall through*/ + /* fall through */ case ENOENT: /* don't show these error codes to the user */ error = 0; @@ -532,7 +542,7 @@ exthdrs.ip6e_ip6 = m; } -#endif /*IPSEC*/ +#endif /* IPSEC */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* Unicast */ @@ -785,9 +795,8 @@ * We eventually have sockaddr_in6 and use the sin6_scope_id * field of the structure here. * We rely on the consistency between two scope zone ids - * of source add destination, which should already be assured - * larger scopes than link will be supported in the near - * future. + * of source and destination, which should already be assured. + * Larger scopes than link will be supported in the future. */ origifp = NULL; if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) @@ -825,7 +834,7 @@ */ if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; - m->m_pkthdr.rcvif = NULL; /*XXX*/ + m->m_pkthdr.rcvif = NULL; /* XXX */ /* If ipfw says divert, we have to just drop packet */ if ((*ip6_fw_chk_ptr)(&ip6, ifp, &port, &m)) { m_freem(m); @@ -963,7 +972,8 @@ /* * Loop through length of segment after first fragment, - * make new header and copy data of each part and link onto chain. + * make new header and copy data of each part and link onto + * chain. */ m0 = m; for (off = hlen; off < tlen; off += len) { @@ -1240,6 +1250,8 @@ return(0); } +extern int load_ipfw(); + /* * IP6 socket option processing. 
*/ @@ -1255,14 +1267,14 @@ int optlen; struct proc *p; - if (sopt) { + if (sopt == NULL) + panic("ip6_ctloutput: arg soopt is NULL"); + else { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; p = sopt->sopt_p; - } else { - panic("ip6_ctloutput: arg soopt is NULL"); } error = optval = 0; @@ -1358,12 +1370,11 @@ error = EINVAL; break; } - /* - * XXX: BINDV6ONLY should be integrated - * into V6ONLY. - */ - OPTSET(IN6P_BINDV6ONLY); OPTSET(IN6P_IPV6_V6ONLY); + if (optval) + in6p->in6p_vflag &= ~INP_IPV4; + else + in6p->in6p_vflag |= INP_IPV4; break; } break; @@ -1430,9 +1441,7 @@ m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); - error = ip6_setmoptions(sopt->sopt_name, - &in6p->in6p_moptions, - m); + error = ip6_setmoptions(sopt->sopt_name, in6p, m); (void)m_free(m); } break; @@ -1474,7 +1483,7 @@ if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; - if (error = soopt_mcopyin(sopt, m)) /* XXX */ + if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; if (m) { req = mtod(m, caddr_t); @@ -1491,21 +1500,12 @@ case IPV6_FW_DEL: case IPV6_FW_FLUSH: case IPV6_FW_ZERO: - { - struct mbuf *m; - struct mbuf **mp = &m; - - if (ip6_fw_ctl_ptr == NULL) + { + if (ip6_fw_ctl_ptr == NULL && load_ipfw() != 0) return EINVAL; - /* XXX */ - if ((error = soopt_getm(sopt, &m)) != 0) - break; - /* XXX */ - if ((error = soopt_mcopyin(sopt, m)) != 0) - break; - error = (*ip6_fw_ctl_ptr)(optname, mp); - m = *mp; - } + + error = (*ip6_fw_ctl_ptr)(sopt); + } break; default: @@ -1550,8 +1550,7 @@ break; case IPV6_V6ONLY: - /* XXX: see the setopt case. 
*/ - optval = OPTBIT(IN6P_BINDV6ONLY); + optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: @@ -1649,20 +1648,12 @@ #endif /* KAME IPSEC */ case IPV6_FW_GET: - { - struct mbuf *m; - struct mbuf **mp = &m; - - if (ip6_fw_ctl_ptr == NULL) - { + { + if (ip6_fw_ctl_ptr == NULL && load_ipfw() != 0) return EINVAL; + + error = (*ip6_fw_ctl_ptr)(sopt); } - error = (*ip6_fw_ctl_ptr)(optname, mp); - if (error == 0) - error = soopt_mcopyout(sopt, m); /* XXX */ - if (error == 0 && m) - m_freem(m); - } break; default: @@ -1708,7 +1699,8 @@ if (!m || m->m_len == 0) { /* - * Only turning off any previous options. + * Only turning off any previous options, regardless of + * whether the opt is just created or given. */ if (opt) FREE(opt, M_IP6OPT); @@ -1720,6 +1712,7 @@ priv = 1; if ((error = ip6_setpktoptions(m, opt, priv, 1)) != 0) { ip6_clearpktopts(opt, 1, -1); /* XXX: discard all options */ + FREE(opt, M_IP6OPT); return(error); } *pktopt = opt; @@ -1811,7 +1804,7 @@ dst = _MALLOC(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL && canwait == M_NOWAIT) - goto bad; + return (NULL); bzero(dst, sizeof(*dst)); dst->ip6po_hlim = src->ip6po_hlim; @@ -1837,13 +1830,13 @@ return(dst); bad: - printf("ip6_copypktopts: copy failed"); if (dst->ip6po_pktinfo) FREE(dst->ip6po_pktinfo, M_IP6OPT); if (dst->ip6po_nexthop) FREE(dst->ip6po_nexthop, M_IP6OPT); if (dst->ip6po_hbh) FREE(dst->ip6po_hbh, M_IP6OPT); if (dst->ip6po_dest1) FREE(dst->ip6po_dest1, M_IP6OPT); if (dst->ip6po_dest2) FREE(dst->ip6po_dest2, M_IP6OPT); if (dst->ip6po_rthdr) FREE(dst->ip6po_rthdr, M_IP6OPT); + FREE(dst, M_IP6OPT); return(NULL); } #undef PKTOPT_EXTHDRCPY @@ -1864,16 +1857,18 @@ * Set the IP6 multicast options in response to user setsockopt(). 
*/ static int -ip6_setmoptions(optname, im6op, m) +ip6_setmoptions(optname, in6p, m) int optname; - struct ip6_moptions **im6op; + struct inpcb* in6p; struct mbuf *m; { int error = 0; u_int loop, ifindex; struct ipv6_mreq *mreq; struct ifnet *ifp; + struct ip6_moptions **im6op = &in6p->in6p_moptions; struct ip6_moptions *im6o = *im6op; + struct ip_moptions *imo; struct route_in6 ro; struct sockaddr_in6 *dst; struct in6_multi_mship *imm; @@ -1895,6 +1890,18 @@ im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP; LIST_INIT(&im6o->im6o_memberships); } + + if (in6p->inp_moptions == NULL) { + /* + * No IPv4 multicast option buffer attached to the pcb; + * call ip_createmoptions to allocate one and initialize + * to default values. + */ + error = ip_createmoptions(&in6p->inp_moptions); + if (error != 0) + return error; + } + imo = in6p->inp_moptions; switch (optname) { @@ -1917,6 +1924,7 @@ break; } im6o->im6o_multicast_ifp = ifp; + imo->imo_multicast_ifp = ifp; break; case IPV6_MULTICAST_HOPS: @@ -1932,10 +1940,13 @@ bcopy(mtod(m, u_int *), &optval, sizeof(optval)); if (optval < -1 || optval >= 256) error = EINVAL; - else if (optval == -1) + else if (optval == -1) { im6o->im6o_multicast_hlim = ip6_defmcasthlim; - else + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + } else { im6o->im6o_multicast_hlim = optval; + imo->imo_multicast_ttl = optval; + } break; } @@ -1954,6 +1965,7 @@ break; } im6o->im6o_multicast_loop = loop; + imo->imo_multicast_loop = loop; break; case IPV6_JOIN_GROUP: @@ -1966,6 +1978,15 @@ break; } mreq = mtod(m, struct ipv6_mreq *); + /* + * If the interface is specified, validate it. + */ + if (mreq->ipv6mr_interface < 0 + || if_index < mreq->ipv6mr_interface) { + error = ENXIO; /* XXX EINVAL? 
*/ + break; + } + if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { /* * We use the unspecified address to specify to accept @@ -1977,19 +1998,38 @@ error = EACCES; break; } + } else if (IN6_IS_ADDR_V4MAPPED(&mreq->ipv6mr_multiaddr)) { + struct ip_mreq v4req; + + v4req.imr_multiaddr.s_addr = mreq->ipv6mr_multiaddr.s6_addr32[3]; + v4req.imr_interface.s_addr = INADDR_ANY; + + /* Find an IPv4 address on the specified interface. */ + if (mreq->ipv6mr_interface != 0) { + struct in_ifaddr *ifa; + + ifp = ifindex2ifnet[mreq->ipv6mr_interface]; + + TAILQ_FOREACH(ifa, &in_ifaddrhead, ia_link) { + if (ifa->ia_ifp == ifp) { + v4req.imr_interface = IA_SIN(ifa)->sin_addr; + break; + } + } + + if (v4req.imr_multiaddr.s_addr == 0) { + /* Interface has no IPv4 address. */ + error = EINVAL; + break; + } + } + + error = ip_addmembership(imo, &v4req); + break; } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { error = EINVAL; break; } - - /* - * If the interface is specified, validate it. - */ - if (mreq->ipv6mr_interface < 0 - || if_index < mreq->ipv6mr_interface) { - error = ENXIO; /* XXX EINVAL? */ - break; - } /* * If no interface was explicitly specified, choose an * appropriate one according to the given multicast address. @@ -2078,15 +2118,6 @@ break; } mreq = mtod(m, struct ipv6_mreq *); - if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { - if (suser(p->p_ucred, &p->p_acflag)) { - error = EACCES; - break; - } - } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { - error = EINVAL; - break; - } /* * If an interface address was specified, get a pointer * to its ifnet structure. 
@@ -2097,6 +2128,35 @@ break; } ifp = ifindex2ifnet[mreq->ipv6mr_interface]; + + if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { + if (suser(p->p_ucred, &p->p_acflag)) { + error = EACCES; + break; + } + } else if (IN6_IS_ADDR_V4MAPPED(&mreq->ipv6mr_multiaddr)) { + struct ip_mreq v4req; + + v4req.imr_multiaddr.s_addr = mreq->ipv6mr_multiaddr.s6_addr32[3]; + v4req.imr_interface.s_addr = INADDR_ANY; + + if (ifp != NULL) { + struct in_ifaddr *ifa; + + TAILQ_FOREACH(ifa, &in_ifaddrhead, ia_link) { + if (ifa->ia_ifp == ifp) { + v4req.imr_interface = IA_SIN(ifa)->sin_addr; + break; + } + } + } + + error = ip_dropmembership(imo, &v4req); + break; + } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { + error = EINVAL; + break; + } /* * Put interface index into the multicast address, * if the address has link-local scope. @@ -2145,6 +2205,14 @@ FREE(*im6op, M_IPMOPTS); *im6op = NULL; } + if (imo->imo_multicast_ifp == NULL && + imo->imo_multicast_vif == -1 && + imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && + imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && + imo->imo_num_memberships == 0) { + ip_freemoptions(imo); + in6p->inp_moptions = 0; + } return(error); } @@ -2491,12 +2559,13 @@ if (lo_dl_tag == 0) dlil_find_dltag(APPLE_IF_FAM_LOOPBACK, 0, PF_INET, &lo_dl_tag); - if (lo_dl_tag) - dlil_output(lo_dl_tag, copym, 0, (struct sockaddr *)&dst, 0); - else + if (lo_dl_tag) { + copym->m_pkthdr.rcvif = ifp; + dlil_output(lo_dl_tag, copym, 0, (struct sockaddr *)dst, 0); + } else m_free(copym); #else - (void)if_simloop(ifp, copym, dst->sin6_family, NULL); + (void)if_simloop(ifp, copym, dst->sin6_family, NULL); #endif } diff -urN xnu-344.49/bsd/netinet6/ip6_var.h xnu-517/bsd/netinet6/ip6_var.h --- xnu-344.49/bsd/netinet6/ip6_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ip6_var.h Sat Oct 25 00:25:55 2003 @@ -334,10 +334,10 @@ struct ip6_pktopts *ip6_copypktopts __P((struct ip6_pktopts *, int)); int ip6_optlen __P((struct inpcb *)); -int 
route6_input __P((struct mbuf **, int *, int)); +int route6_input __P((struct mbuf **, int *)); void frag6_init __P((void)); -int frag6_input __P((struct mbuf **, int *, int)); +int frag6_input __P((struct mbuf **, int *)); void frag6_slowtimo __P((void)); void frag6_drain __P((void)); @@ -349,8 +349,8 @@ int rip6_usrreq __P((struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *, struct proc *)); -int dest6_input __P((struct mbuf **, int *, int)); -int none_input __P((struct mbuf **, int *, int)); +int dest6_input __P((struct mbuf **, int *)); +int none_input __P((struct mbuf **, int *)); #endif /* KERNEL */ #endif /* __APPLE_API_PRIVATE */ diff -urN xnu-344.49/bsd/netinet6/ip6protosw.h xnu-517/bsd/netinet6/ip6protosw.h --- xnu-344.49/bsd/netinet6/ip6protosw.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ip6protosw.h Sat Oct 25 00:25:55 2003 @@ -129,8 +129,7 @@ int (*pr_output) __P((struct mbuf *m, struct socket *so, struct sockaddr_in6 *, struct mbuf *)); /* output to protocol (from above) */ - void (*pr_ctlinput)__P((int, struct sockaddr *, struct ip6_hdr *, - struct mbuf *, int)); + void (*pr_ctlinput)__P((int, struct sockaddr *, void *)); /* control input (from below) */ int (*pr_ctloutput)__P((struct socket *, struct sockopt *)); /* control output (from above) */ diff -urN xnu-344.49/bsd/netinet6/ipsec.c xnu-517/bsd/netinet6/ipsec.c --- xnu-344.49/bsd/netinet6/ipsec.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ipsec.c Sat Oct 25 00:25:55 2003 @@ -104,6 +104,14 @@ int ipsec_debug = 0; #endif +#include +#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIPSEC, 1) +#define DBG_LAYER_END NETDBG_CODE(DBG_NETIPSEC, 3) +#define DBG_FNC_GETPOL_SOCK NETDBG_CODE(DBG_NETIPSEC, (1 << 8)) +#define DBG_FNC_GETPOL_ADDR NETDBG_CODE(DBG_NETIPSEC, (2 << 8)) +#define DBG_FNC_IPSEC_OUT NETDBG_CODE(DBG_NETIPSEC, (3 << 8)) + + struct ipsecstat ipsecstat; int ip4_ah_cleartos = 1; int ip4_ah_offsetmask = 0; /* maybe IP_DF? 
*/ @@ -115,7 +123,9 @@ struct secpolicy ip4_def_policy; int ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ int ip4_esp_randpad = -1; +int esp_udp_encap_port = 0; static int sysctl_def_policy SYSCTL_HANDLER_ARGS; +extern u_int32_t natt_now; SYSCTL_DECL(_net_inet_ipsec); #if INET6 @@ -151,6 +161,15 @@ int ipsec_bypass = 1; SYSCTL_INT(_net_inet_ipsec, OID_AUTO, bypass, CTLFLAG_RD, &ipsec_bypass,0, ""); +/* + * NAT Traversal requires a UDP port for encapsulation, + * esp_udp_encap_port controls which port is used. Racoon + * must set this port to the port racoon is using locally + * for nat traversal. + */ +SYSCTL_INT(_net_inet_ipsec, OID_AUTO, esp_port, + CTLFLAG_RW, &esp_udp_encap_port, 0, ""); + #if INET6 struct ipsecstat ipsec6stat; int ip6_esp_trans_deflev = IPSEC_LEVEL_USE; @@ -219,6 +238,7 @@ static struct mbuf *ipsec_addaux __P((struct mbuf *)); static struct mbuf *ipsec_findaux __P((struct mbuf *)); static void ipsec_optaux __P((struct mbuf *, struct mbuf *)); +void ipsec_send_natt_keepalive(struct secasvar *sav); static int sysctl_def_policy SYSCTL_HANDLER_ARGS @@ -282,6 +302,8 @@ return ipsec4_getpolicybyaddr(m, dir, 0, error); } + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_START, 0,0,0,0,0); + switch (so->so_proto->pr_domain->dom_family) { case AF_INET: /* set spidx in pcb */ @@ -296,8 +318,10 @@ default: panic("ipsec4_getpolicybysock: unsupported address family\n"); } - if (*error) + if (*error) { + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 1,*error,0,0,0); return NULL; + } /* sanity check */ if (pcbsp == NULL) @@ -324,6 +348,7 @@ case IPSEC_POLICY_BYPASS: currsp->refcnt++; *error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 2,*error,0,0,0); return currsp; case IPSEC_POLICY_ENTRUST: @@ -336,6 +361,7 @@ printf("DP ipsec4_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 3,*error,0,0,0); return kernsp; } @@ -349,17 +375,20 @@ } ip4_def_policy.refcnt++; 
*error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 4,*error,0,0,0); return &ip4_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 5,*error,0,0,0); return currsp; default: ipseclog((LOG_ERR, "ipsec4_getpolicybysock: " "Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 6,*error,0,0,0); return NULL; } /* NOTREACHED */ @@ -375,6 +404,7 @@ printf("DP ipsec4_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 7,*error,0,0,0); return kernsp; } @@ -385,6 +415,7 @@ "Illegal policy for non-priviliged defined %d\n", currsp->policy)); *error = EINVAL; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 8,*error,0,0,0); return NULL; case IPSEC_POLICY_ENTRUST: @@ -397,17 +428,20 @@ } ip4_def_policy.refcnt++; *error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 9,*error,0,0,0); return &ip4_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 10,*error,0,0,0); return currsp; default: ipseclog((LOG_ERR, "ipsec4_getpolicybysock: " "Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; + KERNEL_DEBUG(DBG_FNC_GETPOL_SOCK | DBG_FUNC_END, 11,*error,0,0,0); return NULL; } /* NOTREACHED */ @@ -442,14 +476,17 @@ { struct secpolicyindex spidx; + KERNEL_DEBUG(DBG_FNC_GETPOL_ADDR | DBG_FUNC_START, 0,0,0,0,0); bzero(&spidx, sizeof(spidx)); /* make a index to look for a policy */ *error = ipsec_setspidx_mbuf(&spidx, dir, AF_INET, m, (flag & IP_FORWARDING) ? 
0 : 1); - if (*error != 0) + if (*error != 0) { + KERNEL_DEBUG(DBG_FNC_GETPOL_ADDR | DBG_FUNC_END, 1,*error,0,0,0); return NULL; + } sp = key_allocsp(&spidx, dir); } @@ -460,6 +497,7 @@ printf("DP ipsec4_getpolicybyaddr called " "to allocate SP:%p\n", sp)); *error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_ADDR | DBG_FUNC_END, 2,*error,0,0,0); return sp; } @@ -473,6 +511,7 @@ } ip4_def_policy.refcnt++; *error = 0; + KERNEL_DEBUG(DBG_FNC_GETPOL_ADDR | DBG_FUNC_END, 3,*error,0,0,0); return &ip4_def_policy; } @@ -803,9 +842,11 @@ goto bad; spidx->dir = IPSEC_DIR_INBOUND; - KEYDEBUG(KEYDEBUG_IPSEC_DUMP, - printf("ipsec_setspidx_mbuf: end\n"); - kdebug_secpolicyindex(spidx)); + spidx = &pcb->in6p_sp->sp_out->spidx; + error = ipsec_setspidx(m, spidx, 1); + if (error) + goto bad; + spidx->dir = IPSEC_DIR_OUTBOUND; return 0; @@ -1872,7 +1913,7 @@ size_t siz, clen; KEYDEBUG(KEYDEBUG_IPSEC_DATA, - printf("ipsec_in_reject: using SP\n"); + printf("ipsec_hdrsiz: using SP\n"); kdebug_secpolicy(sp)); /* check policy */ @@ -2105,13 +2146,13 @@ ip->ip_off &= htons(~IP_OFFMASK); ip->ip_off &= htons(~IP_MF); switch (ip4_ipsec_dfbit) { - case 0: /*clear DF bit*/ + case 0: /* clear DF bit */ ip->ip_off &= htons(~IP_DF); break; - case 1: /*set DF bit*/ + case 1: /* set DF bit */ ip->ip_off |= htons(IP_DF); break; - default: /*copy DF bit*/ + default: /* copy DF bit */ break; } ip->ip_p = IPPROTO_IPIP; @@ -2381,7 +2422,7 @@ } /* - * shift variable length bunffer to left. + * shift variable length buffer to left. * IN: bitmap: pointer to the buffer * nbit: the number of to shift. * wsize: buffer size (bytes). 
@@ -2559,6 +2600,8 @@ if (!state->dst) panic("state->dst == NULL in ipsec4_output"); + KERNEL_DEBUG(DBG_FNC_IPSEC_OUT | DBG_FUNC_START, 0,0,0,0,0); + KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec4_output: applyed SP\n"); kdebug_secpolicy(sp)); @@ -2746,11 +2789,13 @@ ip = mtod(state->m, struct ip *); } + KERNEL_DEBUG(DBG_FNC_IPSEC_OUT | DBG_FUNC_END, 0,0,0,0,0); return 0; bad: m_freem(state->m); state->m = NULL; + KERNEL_DEBUG(DBG_FNC_IPSEC_OUT | DBG_FUNC_END, error,0,0,0,0); return error; } #endif @@ -2776,17 +2821,17 @@ struct sockaddr_in6 *sin6; if (!state) - panic("state == NULL in ipsec6_output"); + panic("state == NULL in ipsec6_output_trans"); if (!state->m) - panic("state->m == NULL in ipsec6_output"); + panic("state->m == NULL in ipsec6_output_trans"); if (!nexthdrp) - panic("nexthdrp == NULL in ipsec6_output"); + panic("nexthdrp == NULL in ipsec6_output_trans"); if (!mprev) - panic("mprev == NULL in ipsec6_output"); + panic("mprev == NULL in ipsec6_output_trans"); if (!sp) - panic("sp == NULL in ipsec6_output"); + panic("sp == NULL in ipsec6_output_trans"); if (!tun) - panic("tun == NULL in ipsec6_output"); + panic("tun == NULL in ipsec6_output_trans"); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_output_trans: applyed SP\n"); @@ -2947,11 +2992,11 @@ int s; if (!state) - panic("state == NULL in ipsec6_output"); + panic("state == NULL in ipsec6_output_tunnel"); if (!state->m) - panic("state->m == NULL in ipsec6_output"); + panic("state->m == NULL in ipsec6_output_tunnel"); if (!sp) - panic("sp == NULL in ipsec6_output"); + panic("sp == NULL in ipsec6_output_tunnel"); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_output_tunnel: applyed SP\n"); @@ -2966,9 +3011,48 @@ break; } - for (/*already initialized*/; isr; isr = isr->next) { - /* When tunnel mode, SA peers must be specified. 
*/ - bcopy(&isr->saidx, &saidx, sizeof(saidx)); + for (/* already initialized */; isr; isr = isr->next) { + if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { + /* When tunnel mode, SA peers must be specified. */ + bcopy(&isr->saidx, &saidx, sizeof(saidx)); + } else { + /* make SA index to look for a proper SA */ + struct sockaddr_in6 *sin6; + + bzero(&saidx, sizeof(saidx)); + saidx.proto = isr->saidx.proto; + saidx.mode = isr->saidx.mode; + saidx.reqid = isr->saidx.reqid; + + ip6 = mtod(state->m, struct ip6_hdr *); + sin6 = (struct sockaddr_in6 *)&saidx.src; + if (sin6->sin6_len == 0) { + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = IPSEC_PORT_ANY; + bcopy(&ip6->ip6_src, &sin6->sin6_addr, + sizeof(ip6->ip6_src)); + if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { + /* fix scope id for comparing SPD */ + sin6->sin6_addr.s6_addr16[1] = 0; + sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); + } + } + sin6 = (struct sockaddr_in6 *)&saidx.dst; + if (sin6->sin6_len == 0) { + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = IPSEC_PORT_ANY; + bcopy(&ip6->ip6_dst, &sin6->sin6_addr, + sizeof(ip6->ip6_dst)); + if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { + /* fix scope id for comparing SPD */ + sin6->sin6_addr.s6_addr16[1] = 0; + sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); + } + } + } + if (key_checkrequest(isr, &saidx) == ENOENT) { /* * IPsec processing is required, but no SA found. @@ -3354,6 +3438,14 @@ sp = key_gettunnel((struct sockaddr *)&osrc, (struct sockaddr *)&odst, (struct sockaddr *)&isrc, (struct sockaddr *)&idst); + /* + * when there is no suitable inbound policy for the packet of the ipsec + * tunnel mode, the kernel never decapsulate the tunneled packet + * as the ipsec tunnel mode even when the system wide policy is "none". + * then the kernel leaves the generic tunnel module to process this + * packet. 
if there is no rule of the generic tunnel, the packet + * is rejected and the statistics will be counted up. + */ if (!sp) return 0; key_freesp(sp); @@ -3577,7 +3669,7 @@ if (!n) return ENOBUFS; if (M_TRAILINGSPACE(n) < sizeof(*p)) - return ENOSPC; /*XXX*/ + return ENOSPC; /* XXX */ p = (struct ipsec_history *)(mtod(n, caddr_t) + n->m_len); n->m_len += sizeof(*p); bzero(p, sizeof(*p)); @@ -3619,4 +3711,43 @@ if ((n) && n->m_len > sizeof(struct socket *)) n->m_len = sizeof(struct socket *); ipsec_optaux(m, n); +} + +__private_extern__ void +ipsec_send_natt_keepalive( + struct secasvar *sav) +{ + struct mbuf *m; + struct udphdr *uh; + struct ip *ip; + + if ((esp_udp_encap_port & 0xFFFF) == 0 || sav->remote_ike_port == 0) return; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) return; + + /* + * Create a UDP packet complete with IP header. + * We must do this because UDP output requires + * an inpcb which we don't have. UDP packet + * contains one byte payload. The byte is set + * to 0xFF. 
+ */ + ip = (struct ip*)m_mtod(m); + uh = (struct udphdr*)((char*)m_mtod(m) + sizeof(struct ip)); + m->m_len = sizeof(struct udpiphdr) + 1; + bzero(m_mtod(m), m->m_len); + ip->ip_len = ntohs(m->m_len); + ip->ip_ttl = ip_defttl; + ip->ip_p = IPPROTO_UDP; + ip->ip_src = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr; + ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.dst)->sin_addr; + uh->uh_sport = ntohs((u_short)esp_udp_encap_port); + uh->uh_dport = ntohs(sav->remote_ike_port); + uh->uh_ulen = htons(1 + sizeof(struct udphdr)); + uh->uh_sum = 0; + *(u_int8_t*)((char*)m_mtod(m) + sizeof(struct ip) + sizeof(struct udphdr)) = 0xFF; + + if (ip_output(m, NULL, &sav->sah->sa_route, IP_NOIPSEC, NULL) == 0) + sav->natt_last_activity = natt_now; } diff -urN xnu-344.49/bsd/netinet6/ipsec.h xnu-517/bsd/netinet6/ipsec.h --- xnu-344.49/bsd/netinet6/ipsec.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/ipsec.h Sat Oct 25 00:25:55 2003 @@ -45,10 +45,9 @@ #ifdef __APPLE_API_PRIVATE /* * Security Policy Index - * NOTE: Ensure to be same address family and upper layer protocol. - * NOTE: ul_proto, port number, uid, gid: - * ANY: reserved for waldcard. - * 0 to (~0 - 1): is one of the number of each value. + * Ensure that both address families in the "src" and "dst" are same. + * When the value of the ul_proto is ICMPv6, the port field in "src" + * specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code. 
*/ struct secpolicyindex { u_int8_t dir; /* direction of packet flow, see blow */ @@ -224,7 +223,7 @@ #define IPSECCTL_DEF_ESP_NETLEV 4 /* int; ESP tunnel mode */ #define IPSECCTL_DEF_AH_TRANSLEV 5 /* int; AH transport mode */ #define IPSECCTL_DEF_AH_NETLEV 6 /* int; AH tunnel mode */ -#if 0 /*obsolete, do not reuse*/ +#if 0 /* obsolete, do not reuse */ #define IPSECCTL_INBOUND_CALL_IKE 7 #endif #define IPSECCTL_AH_CLEARTOS 8 diff -urN xnu-344.49/bsd/netinet6/mld6.c xnu-517/bsd/netinet6/mld6.c --- xnu-344.49/bsd/netinet6/mld6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/mld6.c Sat Oct 25 00:25:55 2003 @@ -102,6 +102,7 @@ static struct ip6_pktopts ip6_opts; static int mld6_timers_are_running; +static int mld6_init_done = 0 ; /* XXX: These are necessary for KAME's link-local hack */ static struct in6_addr mld6_all_nodes_linklocal = IN6ADDR_LINKLOCAL_ALLNODES_INIT; static struct in6_addr mld6_all_routers_linklocal = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; @@ -115,6 +116,10 @@ struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf; u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD); + if (mld6_init_done) + return; + + mld6_init_done = 1; mld6_timers_are_running = 0; /* ip6h_nxt will be fill in later */ diff -urN xnu-344.49/bsd/netinet6/nd6.c xnu-517/bsd/netinet6/nd6.c --- xnu-344.49/bsd/netinet6/nd6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/nd6.c Sat Oct 25 00:25:55 2003 @@ -1,4 +1,4 @@ -/* $FreeBSD: src/sys/netinet6/nd6.c,v 1.2.2.9 2001/07/11 09:39:04 ume Exp $ */ +/* $FreeBSD: src/sys/netinet6/nd6.c,v 1.20 2002/08/02 20:49:14 rwatson Exp $ */ /* $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ /* @@ -200,9 +200,24 @@ #ifndef MIN #define MIN(a,b) ((a) < (b) ? 
(a) : (b)) #endif - struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; - u_long oldmaxmtu = ndi->maxmtu; - u_long oldlinkmtu = ndi->linkmtu; + + struct nd_ifinfo *ndi; + u_long oldmaxmtu, oldlinkmtu, dl_tag; + + /* + * Make sure IPv6 is enabled for the interface first, + * because this can be called directly from SIOCSIFMTU for IPv4 + */ + + if (ifp->if_index >= nd_ifinfo_indexlim) { + if (dlil_find_dltag(ifp->if_family, ifp->if_unit, PF_INET6, &dl_tag) != EPROTONOSUPPORT) + nd6log((LOG_INFO, "setmtu for ifp=% but nd6 is not attached\n", if_name(ifp))); + return; /* we're out of bound for nd_ifinfo */ + } + + ndi = &nd_ifinfo[ifp->if_index]; + oldmaxmtu = ndi->maxmtu; + oldlinkmtu = ndi->linkmtu; switch (ifp->if_type) { case IFT_ARCNET: /* XXX MTU handling needs more work */ @@ -438,7 +453,6 @@ timeout(nd6_timer_funneled, (caddr_t)0, nd6_prune * hz); ln = llinfo_nd6.ln_next; - /* XXX BSD/OS separates this code -- itojun */ while (ln && ln != &llinfo_nd6) { struct rtentry *rt; struct sockaddr_in6 *dst; @@ -461,15 +475,24 @@ ln = next; continue; } - + /* sanity check */ - if (!rt) - panic("rt=0 in nd6_timer(ln=%p)\n", ln); - if (rt->rt_llinfo && (struct llinfo_nd6 *)rt->rt_llinfo != ln) - panic("rt_llinfo(%p) is not equal to ln(%p)\n", + if (!rt) { + printf("rt=0 in nd6_timer(ln=%p)\n", ln); + ln = next; + continue; + } + if (rt->rt_llinfo && (struct llinfo_nd6 *)rt->rt_llinfo != ln) { + printf("rt_llinfo(%p) is not equal to ln(%p)\n", rt->rt_llinfo, ln); - if (!dst) - panic("dst=0 in nd6_timer(ln=%p)\n", ln); + ln = next; + continue; + } + if (!dst) { + printf("dst=0 in nd6_timer(ln=%p)\n", ln); + ln = next; + continue; + } switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: @@ -481,6 +504,7 @@ ln, 0); } else { struct mbuf *m = ln->ln_hold; + ln->ln_hold = NULL; if (m) { if (rt->rt_ifp) { /* @@ -572,7 +596,7 @@ /* * If the expiring address is temporary, try * regenerating a new one. 
This would be useful when - * we suspended a laptop PC, then turned on after a + * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the @@ -589,7 +613,8 @@ if (regen) goto addrloop; /* XXX: see below */ - } else if (IFA6_IS_DEPRECATED(ia6)) { + } + if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; @@ -610,15 +635,15 @@ * has changed while we are still in * the loop. Although the change * would not cause disaster (because - * it's not an addition, but a - * deletion,) we'd rather restart the + * it's not a deletion, but an + * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } - } else if (IFA6_IS_DEPRECATED(ia6)) { + } else { /* * A new RA might have made a deprecated address * preferred. @@ -634,14 +659,6 @@ * check prefix lifetime. * since pltime is just for autoconf, pltime processing for * prefix is not necessary. - * - * we offset expire time by NDPR_KEEP_EXPIRE, so that we - * can use the old prefix information to validate the - * next prefix information to come. See prelist_update() - * for actual validation. - * - * I don't think such an offset is necessary. - * (jinmei@kame.net, 20010130). */ if (pr->ndpr_expire && pr->ndpr_expire < time_second) { struct nd_prefix *t; @@ -772,7 +789,7 @@ if (nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); - if (!ip6_forwarding && ip6_accept_rtadv) { /* XXX: too restrictive? */ + if (!ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { /* refresh default router list */ bzero(&drany, sizeof(drany)); defrouter_delreq(&drany, 0); @@ -848,10 +865,10 @@ return(NULL); /* - * Create a new route. RTF_LLINFO is necessary + * Create a new route. 
RTF_LLINFO is necessary * to create a Neighbor Cache entry for the * destination in nd6_rtrequest which will be - * called in rtequest via ifa->ifa_rtrequest. + * called in rtrequest via ifa->ifa_rtrequest. */ if ((e = rtrequest(RTM_ADD, (struct sockaddr *)&sin6, ifa->ifa_addr, @@ -877,20 +894,26 @@ rtunref(rt); /* * Validation for the entry. + * Note that the check for rt_llinfo is necessary because a cloned + * route from a parent route that has the L flag (e.g. the default + * route to a p2p interface) may have the flag, too, while the + * destination is not actually a neighbor. * XXX: we can't use rt->rt_ifp to check for the interface, since * it might be the loopback interface if the entry is for our * own address on a non-loopback interface. Instead, we should - * use rt->rt_ifa->ifa_ifp, which would specify the REAL interface. + * use rt->rt_ifa->ifa_ifp, which would specify the REAL + * interface. */ - if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || - rt->rt_gateway->sa_family != AF_LINK || - (ifp && rt->rt_ifa->ifa_ifp != ifp)) { + if ((ifp->if_type !=IFT_PPP) && ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || + rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL || + + (ifp && rt->rt_ifa->ifa_ifp != ifp))) { if (create) { log(LOG_DEBUG, "nd6_lookup: failed to lookup %s (if = %s)\n", ip6_sprintf(addr6), ifp ? if_name(ifp) : "unspec"); /* xxx more logs... kazu */ } - return(0); + return(NULL); } return(rt); } @@ -943,7 +966,7 @@ * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ - if (nd6_lookup(&addr->sin6_addr, 0, ifp)) + if (nd6_lookup(&addr->sin6_addr, 0, ifp) != NULL) return(1); return(0); @@ -967,13 +990,13 @@ * even though it is not harmful, it was not really necessary. */ - if (!ip6_forwarding && ip6_accept_rtadv) { /* XXX: too restrictive? 
*/ + if (!ip6_forwarding && (ip6_accept_rtadv || (rt->rt_ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { int s; s = splnet(); dr = defrouter_lookup(&((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, rt->rt_ifp); - if (ln->ln_router || dr) { + if (ln && ln->ln_router || dr) { /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. @@ -992,7 +1015,7 @@ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of - * prefixes coreectly. + * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. */ @@ -1027,9 +1050,12 @@ * Before deleting the entry, remember the next entry as the * return value. We need this because pfxlist_onlink_check() above * might have freed other entries (particularly the old next entry) as - * a side effect (XXX). + * a side effect (XXX). */ - next = ln->ln_next; + if (ln) + next = ln->ln_next; + else + next = 0; /* * Detach the route from the routing tree and the list of neighbor @@ -1106,7 +1132,7 @@ struct ifnet *ifp = rt->rt_ifp; struct ifaddr *ifa; - if (rt->rt_flags & RTF_GATEWAY) + if ((rt->rt_flags & RTF_GATEWAY)) return; if (nd6_need_cache(ifp) == 0 && (rt->rt_flags & RTF_HOST) == 0) { @@ -1120,6 +1146,27 @@ return; } + if (req == RTM_RESOLVE && + (nd6_need_cache(ifp) == 0 || /* stf case */ + !nd6_is_addr_neighbor((struct sockaddr_in6 *)rt_key(rt), ifp))) { + /* + * FreeBSD and BSD/OS often make a cloned host route based + * on a less-specific route (e.g. the default route). + * If the less specific route does not have a "gateway" + * (this is the case when the route just goes to a p2p or an + * stf interface), we'll mistakenly make a neighbor cache for + * the host route, and will see strange neighbor solicitation + * for the corresponding destination. In order to avoid the + * confusion, we check if the destination of the route is + * a neighbor in terms of neighbor discovery, and stop the + * process if not. 
Additionally, we remove the LLINFO flag + * so that ndp(8) will not try to get the neighbor information + * of the destination. + */ + rt->rt_flags &= ~RTF_LLINFO; + return; + } + switch (req) { case RTM_ADD: /* @@ -1132,7 +1179,7 @@ if (rt->rt_flags & (RTF_CLONING | RTF_LLINFO)) { /* * Case 1: This route should come from - * a route to interface. RTF_LLINFO flag is set + * a route to interface. RTF_LLINFO flag is set * for a host route whose destination should be * treated as on-link. */ @@ -1147,13 +1194,13 @@ if (ln && ln->ln_expire == 0) { /* kludge for desktops */ #if 0 - printf("nd6_request: time.tv_sec is zero; " + printf("nd6_rtequest: time.tv_sec is zero; " "treat it as 1\n"); #endif ln->ln_expire = 1; } #endif - if (rt->rt_flags & RTF_CLONING) + if ((rt->rt_flags & RTF_CLONING)) break; } /* @@ -1255,7 +1302,7 @@ SDL(gate)->sdl_alen = ifp->if_addrlen; } if (nd6_useloopback) { - rt->rt_ifp = &loif[0]; /*XXX*/ + rt->rt_ifp = &loif[0]; /* XXX */ /* * Make sure rt_ifa be equal to the ifaddr * corresponding to the address. @@ -1322,6 +1369,7 @@ rt->rt_flags &= ~RTF_LLINFO; if (ln->ln_hold) m_freem(ln->ln_hold); + ln->ln_hold = NULL; Free((caddr_t)ln); } } @@ -1503,7 +1551,7 @@ /* do we really have to remove addresses as well? */ for (ia = in6_ifaddr; ia; ia = ia_next) { - /* ia might be removed. keep the next ptr. */ + /* ia might be removed. keep the next ptr. */ ia_next = ia->ia_next; if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) @@ -1673,7 +1721,7 @@ * 1 -- y -- (7) * STALE */ - if (lladdr) { /*(3-5) and (7)*/ + if (lladdr) { /* (3-5) and (7) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? 
@@ -1683,17 +1731,17 @@ } if (!is_newentry) { - if ((!olladdr && lladdr) /*(3)*/ - || (olladdr && lladdr && llchange)) { /*(5)*/ + if ((!olladdr && lladdr) /* (3) */ + || (olladdr && lladdr && llchange)) { /* (5) */ do_update = 1; newstate = ND6_LLINFO_STALE; - } else /*(1-2,4)*/ + } else /* (1-2,4) */ do_update = 0; } else { do_update = 1; - if (!lladdr) /*(6)*/ + if (!lladdr) /* (6) */ newstate = ND6_LLINFO_NOSTATE; - else /*(7)*/ + else /* (7) */ newstate = ND6_LLINFO_STALE; } @@ -1762,7 +1810,7 @@ /* * New entry must have is_router flag cleared. */ - if (is_newentry) /*(6-7)*/ + if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_REDIRECT: @@ -1773,7 +1821,7 @@ */ if (code == ND_REDIRECT_ROUTER) ln->ln_router = 1; - else if (is_newentry) /*(6-7)*/ + else if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_ROUTER_SOLICIT: @@ -1786,8 +1834,8 @@ /* * Mark an entry with lladdr as a router. */ - if ((!is_newentry && (olladdr || lladdr)) /*(2-5)*/ - || (is_newentry && lladdr)) { /*(7)*/ + if ((!is_newentry && (olladdr || lladdr)) /* (2-5) */ + || (is_newentry && lladdr)) { /* (7) */ ln->ln_router = 1; } break; @@ -1808,13 +1856,12 @@ * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ - if (do_update && ln->ln_router && !ip6_forwarding && ip6_accept_rtadv) + if (do_update && ln->ln_router && !ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) defrouter_select(); return rt; } - static void nd6_slowtimo(ignored_arg) void *ignored_arg; @@ -1880,7 +1927,7 @@ goto sendpkt; /* - * next hop determination. This routine is derived from ether_outpout. + * next hop determination. This routine is derived from ether_outpout. */ if (rt) { if ((rt->rt_flags & RTF_UP) == 0) { @@ -1903,9 +1950,8 @@ /* * We skip link-layer address resolution and NUD * if the gateway is not a neighbor from ND point - * of view, regardless the value of the - * nd_ifinfo.flags. 
- * The second condition is a bit tricky: we skip + * of view, regardless of the value of nd_ifinfo.flags. + * The second condition is a bit tricky; we skip * if the gateway is our own address, which is * sometimes used to install a route to a p2p link. */ @@ -1946,7 +1992,7 @@ else { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), - * the condition below is not very efficient. But we believe + * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ if (nd6_is_addr_neighbor(dst, ifp) && @@ -1988,7 +2034,7 @@ /* * If the neighbor cache entry has a state other than INCOMPLETE - * (i.e. its link-layer address is already reloved), just + * (i.e. its link-layer address is already resolved), just * send the packet. */ if (ln->ln_state > ND6_LLINFO_INCOMPLETE) @@ -1996,11 +2042,12 @@ /* * There is a neighbor cache entry, but no ethernet address - * response yet. Replace the held mbuf (if any) with this + * response yet. Replace the held mbuf (if any) with this * latest one. * - * XXX Does the code conform to rate-limiting rule? - * (RFC 2461 7.2.2) + * This code conforms to the rate-limiting rule described in Section + * 7.2.2 of RFC 2461, because the timer is set correctly after sending + * an NS below. 
*/ if (ln->ln_state == ND6_LLINFO_NOSTATE) ln->ln_state = ND6_LLINFO_INCOMPLETE; @@ -2023,14 +2070,15 @@ /* Make sure the HW checksum flags are cleaned before sending the packet */ - m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.csum_data = 0; m->m_pkthdr.csum_flags = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { + m->m_pkthdr.rcvif = origifp; /* forwarding rules require the original scope_id */ return (dlil_output(ifptodlt(origifp, PF_INET6), m, (caddr_t)rt, (struct sockaddr *)dst,0)); } + m->m_pkthdr.rcvif = (struct ifnet *)0; return (dlil_output(ifptodlt(ifp, PF_INET6), m, (caddr_t)rt, (struct sockaddr *)dst, 0)); #else if ((ifp->if_flags & IFF_LOOPBACK) != 0) { @@ -2108,27 +2156,23 @@ *desten = 0; return(1); default: - m_freem(m); - return(0); + return(0); /* caller will free mbuf */ } } if (rt == NULL) { /* this could happen, if we could not allocate memory */ - m_freem(m); - return(0); + return(0); /* caller will free mbuf */ } if (rt->rt_gateway->sa_family != AF_LINK) { printf("nd6_storelladdr: something odd happens\n"); - m_freem(m); - return(0); + return(0); /* caller will free mbuf */ } sdl = SDL(rt->rt_gateway); if (sdl->sdl_alen == 0) { /* this should be impossible, but we bark here for debugging */ printf("nd6_storelladdr: sdl_alen == 0\n"); - m_freem(m); - return(0); + return(0); /* caller will free mbuf */ } bcopy(LLADDR(sdl), desten, sdl->sdl_alen); diff -urN xnu-344.49/bsd/netinet6/nd6.h xnu-517/bsd/netinet6/nd6.h --- xnu-344.49/bsd/netinet6/nd6.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/nd6.h Sat Oct 25 00:25:55 2003 @@ -80,7 +80,7 @@ int recalctm; /* BaseReacable re-calculation timer */ u_int8_t chlim; /* CurHopLimit */ u_int8_t receivedra; - /* the followings are for privacy extension for addrconf */ + /* the following 3 members are for privacy extension for addrconf */ u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ u_int8_t randomid[8]; /* current 
random ID */ @@ -205,7 +205,7 @@ struct nd_defrouter { TAILQ_ENTRY(nd_defrouter) dr_entry; struct in6_addr rtaddr; - u_char flags; + u_char flags; /* flags on RA message */ u_short rtlifetime; u_long expire; u_long advint; /* Mobile IPv6 addition (milliseconds) */ @@ -250,7 +250,7 @@ */ struct inet6_ndpr_msghdr { u_short inpm_msglen; /* to skip over non-understood messages */ - u_char inpm_version; /* future binary compatability */ + u_char inpm_version; /* future binary compatibility */ u_char inpm_type; /* message type */ struct in6_addr inpm_prefix; u_long prm_vltim; @@ -312,7 +312,7 @@ struct nd_opt_hdr *zero; struct nd_opt_hdr *src_lladdr; struct nd_opt_hdr *tgt_lladdr; - struct nd_opt_prefix_info *pi_beg;/* multiple opts, start */ + struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */ struct nd_opt_rd_hdr *rh; struct nd_opt_mtu *mtu; struct nd_opt_hdr *six; diff -urN xnu-344.49/bsd/netinet6/nd6_nbr.c xnu-517/bsd/netinet6/nd6_nbr.c --- xnu-344.49/bsd/netinet6/nd6_nbr.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/nd6_nbr.c Sat Oct 25 00:25:55 2003 @@ -134,11 +134,11 @@ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be solicited node multicast address. 
*/ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL - /*don't check ifindex portion*/ + /* don't check ifindex portion */ && daddr6.s6_addr32[1] == 0 && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && daddr6.s6_addr8[12] == 0xff) { - ; /*good*/ + ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); @@ -164,7 +164,7 @@ } if (ndopts.nd_opts_src_lladdr) { - lladdr = (char *)(ndopts.nd_opts_src_lladdr +1); + lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } @@ -253,9 +253,9 @@ } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { - log(LOG_INFO, - "nd6_ns_input: duplicate IP6 address %s\n", - ip6_sprintf(&saddr6)); + nd6log((LOG_INFO, + "nd6_ns_input: duplicate IP6 address %s\n", + ip6_sprintf(&saddr6))); goto freeit; } @@ -384,7 +384,7 @@ icmp6len = sizeof(*nd_ns); m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len; - m->m_data += max_linkhdr; /*or MH_ALIGN() equivalent?*/ + m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? */ /* fill neighbor solicitation packet */ ip6 = mtod(m, struct ip6_hdr *); @@ -434,7 +434,7 @@ * - saddr6 belongs to the outgoing interface. * Otherwise, we perform a scope-wise match. */ - struct ip6_hdr *hip6; /*hold ip6*/ + struct ip6_hdr *hip6; /* hold ip6 */ struct in6_addr *saddr6; if (ln && ln->ln_hold) { @@ -451,7 +451,10 @@ else { ia = in6_ifawithifp(ifp, &ip6->ip6_dst); if (ia == NULL) { - m_freem(m); /*XXX*/ + if (ln && ln->ln_hold) + m_freem(ln->ln_hold); + ln->ln_hold = NULL; + m_freem(m); return; } ip6->ip6_src = ia->ia_addr.sin6_addr; @@ -624,7 +627,7 @@ goto freeit; } - /* Just for safety, maybe unnecessery. */ + /* Just for safety, maybe unnecessary. */ if (ifa) { log(LOG_ERR, "nd6_na_input: duplicate IP6 address %s\n", @@ -769,11 +772,18 @@ int s; in6 = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; + + /* + * Lock to protect the default router list. 
+ * XXX: this might be unnecessary, since this function + * is only called under the network software interrupt + * context. However, we keep it just for safety. + */ s = splnet(); dr = defrouter_lookup(in6, rt->rt_ifp); if (dr) defrtrlist_del(dr); - else if (!ip6_forwarding && ip6_accept_rtadv) { + else if (!ip6_forwarding && (ip6_accept_rtadv || (rt->rt_ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { /* * Even if the neighbor is not in the default * router list, the neighbor may be used @@ -791,7 +801,7 @@ ln->ln_asked = 0; if (ln->ln_hold) { /* - * we assume ifp is not a p2p here, so just set the 2nd + * we assume ifp is not a loopback here, so just set the 2nd * argument as the 1st one. */ nd6_output(ifp, ifp, ln->ln_hold, @@ -832,7 +842,7 @@ struct ip6_moptions im6o; int icmp6len; int maxlen; - caddr_t mac; + caddr_t mac = NULL; struct ifnet *outif = NULL; /* estimate the size of message */ @@ -867,7 +877,7 @@ icmp6len = sizeof(*nd_na); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len; - m->m_data += max_linkhdr; /*or MH_ALIGN() equivalent?*/ + m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? */ /* fill neighbor advertisement packet */ ip6 = mtod(m, struct ip6_hdr *); @@ -910,7 +920,6 @@ * target lladdr option SHOULD NOT be included. */ if (tlladdr) { - mac = NULL; /* * sdl0 != NULL indicates proxy NA. If we do proxy, use * lladdr in sdl0. 
If we are not proxying (sending NA for @@ -992,9 +1001,6 @@ int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 - struct callout_handle dad_timer; -#endif }; static struct dadq_head dadq; @@ -1031,6 +1037,7 @@ callout_reset(&dp->dad_timer_ch, ticks, (void (*) __P((void *)))nd6_dad_timer, (void *)dp->dad_ifa); } + static void nd6_dad_stoptimer(dp) struct dadq *dp; @@ -1096,9 +1103,6 @@ return; } bzero(dp, sizeof(*dp)); -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 - callout_init(&dp->dad_timer_ch); -#endif TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), @@ -1115,11 +1119,8 @@ dp->dad_count = ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; - if (!tick) { + if (tick == NULL) { nd6_dad_ns_output(dp, ifa); -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 - dp->dad_timer = -#endif timeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa, nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000); } else { @@ -1130,9 +1131,6 @@ else ntick = *tick + random() % (hz / 2); *tick = ntick; -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 - dp->dad_timer = -#endif timeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa, ntick); } @@ -1188,7 +1186,7 @@ struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; - s = splnet(); /*XXX*/ + s = splnet(); /* XXX */ /* Sanity check */ if (ia == NULL) { @@ -1233,9 +1231,6 @@ * We have more NS to go. Send NS packet for DAD. */ nd6_dad_ns_output(dp, ifa); -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 - dp->dad_timer = -#endif timeout((void (*) __P((void *)))nd6_dad_timer_funnel, (void *)ifa, nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000); } else { @@ -1256,7 +1251,7 @@ } if (dp->dad_ns_icount) { -#if 0 /*heuristics*/ +#if 0 /* heuristics */ /* * if * - we have sent many(?) 
DAD NS, and diff -urN xnu-344.49/bsd/netinet6/nd6_rtr.c xnu-517/bsd/netinet6/nd6_rtr.c --- xnu-344.49/bsd/netinet6/nd6_rtr.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/nd6_rtr.c Sat Oct 25 00:25:55 2003 @@ -1,4 +1,4 @@ -/* $FreeBSD: src/sys/netinet6/nd6_rtr.c,v 1.2.2.3 2001/07/03 11:01:54 ume Exp $ */ +/* $FreeBSD: src/sys/netinet6/nd6_rtr.c,v 1.11 2002/04/19 04:46:23 suz Exp $ */ /* $KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $ */ /* @@ -126,7 +126,7 @@ union nd_opts ndopts; /* If I'm not a router, ignore it. */ - if (ip6_accept_rtadv != 0 || ip6_forwarding != 1) + if (ip6_accept_rtadv != 0 || (ifp->if_eflags & IFEF_ACCEPT_RTADVD) || ip6_forwarding != 1) goto freeit; /* Sanity checks */ @@ -215,7 +215,7 @@ union nd_opts ndopts; struct nd_defrouter *dr; - if (ip6_accept_rtadv == 0) + if (ip6_accept_rtadv == 0 && ((ifp->if_eflags & IFEF_ACCEPT_RTADVD) == 0)) goto freeit; if (ip6->ip6_hlim != 255) { @@ -267,7 +267,7 @@ dr0.advints_lost = 0; /* Mobile IPv6 */ /* unspecified or not? (RFC 2461 6.3.4) */ if (advreachable) { - NTOHL(advreachable); + advreachable = ntohl(advreachable); if (advreachable <= MAX_REACHABLE_TIME && ndi->basereachable != advreachable) { ndi->basereachable = advreachable; @@ -396,7 +396,7 @@ skip: /* - * Src linkaddress + * Source link layer address */ { char *lladdr = NULL; @@ -451,7 +451,7 @@ info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_IFP] = - (struct sockaddr *)TAILQ_FIRST(&rt->rt_ifp->if_addrlist); + TAILQ_FIRST(&rt->rt_ifp->if_addrlist)->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; rt_missmsg(cmd, &info, rt->rt_flags, 0); @@ -530,7 +530,7 @@ nd6_rtmsg(RTM_ADD, newrt); rtunref(newrt); } - in6_post_msg(ifp, KEV_INET6_DEFROUTER, &def); + in6_post_msg(ifp, KEV_INET6_DEFROUTER, (struct in6_ifaddr *)ifa); } } @@ -598,7 +598,7 @@ * Flush all the routing table entries that use the router * as a next hop. 
*/ - if (!ip6_forwarding && ip6_accept_rtadv) { + if (!ip6_forwarding && (ip6_accept_rtadv || (dr->ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { /* above is a good condition? */ rt6_flush(&dr->rtaddr, dr->ifp); } @@ -1735,6 +1735,7 @@ int in6_tmpifadd(ia0, forcegen) const struct in6_ifaddr *ia0; /* corresponding public address */ + int forcegen; { struct ifnet *ifp = ia0->ia_ifa.ifa_ifp; struct in6_ifaddr *newia; @@ -1830,6 +1831,16 @@ } newia->ia6_ndpr = ia0->ia6_ndpr; newia->ia6_ndpr->ndpr_refcnt++; + + /* + * A newly added address might affect the status of other addresses. + * XXX: when the temporary address is generated with a new public + * address, the onlink check is redundant. However, it would be safe + * to do the check explicitly everywhere a new address is generated, + * and, in fact, we surely need the check when we create a new + * temporary address due to deprecation of an old temporary address. + */ + pfxlist_onlink_check(); return(0); } diff -urN xnu-344.49/bsd/netinet6/raw_ip6.c xnu-517/bsd/netinet6/raw_ip6.c --- xnu-344.49/bsd/netinet6/raw_ip6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/raw_ip6.c Sat Oct 25 00:25:55 2003 @@ -553,7 +553,9 @@ inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; MALLOC(inp->in6p_icmp6filt, struct icmp6_filter *, - sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); + sizeof(struct icmp6_filter), M_PCB, M_WAITOK); + if (inp->in6p_icmp6filt == NULL) + return (ENOMEM); ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); return 0; } diff -urN xnu-344.49/bsd/netinet6/route6.c xnu-517/bsd/netinet6/route6.c --- xnu-344.49/bsd/netinet6/route6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/route6.c Sat Oct 25 00:25:55 2003 @@ -48,9 +48,9 @@ struct ip6_rthdr0 *)); int -route6_input(mp, offp, proto) +route6_input(mp, offp) struct mbuf **mp; - int *offp, proto; /* proto is unused */ + int *offp; { struct ip6_hdr *ip6; struct mbuf *m = *mp; diff -urN xnu-344.49/bsd/netinet6/scope6.c xnu-517/bsd/netinet6/scope6.c --- 
xnu-344.49/bsd/netinet6/scope6.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/scope6.c Sat Oct 25 00:25:55 2003 @@ -1,4 +1,4 @@ -/* $FreeBSD: src/sys/netinet6/scope6.c,v 1.1.2.2 2001/07/03 11:01:55 ume Exp $ */ +/* $FreeBSD: src/sys/netinet6/scope6.c,v 1.3 2002/03/25 10:12:51 ume Exp $ */ /* $KAME: scope6.c,v 1.10 2000/07/24 13:29:31 itojun Exp $ */ /* @@ -222,7 +222,7 @@ } } - if (bcmp(&in6addr_loopback, addr, sizeof(addr) - 1) == 0) { + if (bcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) { if (addr->s6_addr8[15] == 1) /* loopback */ return IPV6_ADDR_SCOPE_NODELOCAL; if (addr->s6_addr8[15] == 0) /* unspecified */ diff -urN xnu-344.49/bsd/netinet6/tcp6_var.h xnu-517/bsd/netinet6/tcp6_var.h --- xnu-344.49/bsd/netinet6/tcp6_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/tcp6_var.h Sat Oct 25 00:25:55 2003 @@ -80,7 +80,7 @@ struct ip6_hdr; void tcp6_ctlinput __P((int, struct sockaddr *, void *)); void tcp6_init __P((void)); -int tcp6_input __P((struct mbuf **, int *, int)); +int tcp6_input __P((struct mbuf **, int *)); struct rtentry *tcp_rtlookup6 __P((struct inpcb *)); extern struct pr_usrreqs tcp6_usrreqs; diff -urN xnu-344.49/bsd/netinet6/udp6_usrreq.c xnu-517/bsd/netinet6/udp6_usrreq.c --- xnu-344.49/bsd/netinet6/udp6_usrreq.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/udp6_usrreq.c Sat Oct 25 00:25:55 2003 @@ -142,9 +142,9 @@ } int -udp6_input(mp, offp, proto) +udp6_input(mp, offp) struct mbuf **mp; - int *offp, proto; + int *offp; { struct mbuf *m = *mp; register struct ip6_hdr *ip6; @@ -547,6 +547,8 @@ return error; inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; + if (ip6_mapped_addr_on) + inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; /* just to be sure */ /* @@ -635,7 +637,7 @@ error = in6_pcbconnect(inp, nam, p); splx(s); if (error == 0) { - if (ip6_mapped_addr_on) { /* should be non mapped addr */ + if (ip6_mapped_addr_on || (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) 
{ /* should be non mapped addr */ inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; } @@ -711,7 +713,7 @@ } } - if (ip6_mapped_addr_on) { + if (ip6_mapped_addr_on || (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; struct sockaddr_in6 *sin6 = 0; diff -urN xnu-344.49/bsd/netinet6/udp6_var.h xnu-517/bsd/netinet6/udp6_var.h --- xnu-344.49/bsd/netinet6/udp6_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netinet6/udp6_var.h Sat Oct 25 00:25:55 2003 @@ -73,7 +73,7 @@ extern struct pr_usrreqs udp6_usrreqs; void udp6_ctlinput __P((int, struct sockaddr *, void *)); -int udp6_input __P((struct mbuf **, int *, int)); +int udp6_input __P((struct mbuf **, int *)); int udp6_output __P((struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct proc *p)); diff -urN xnu-344.49/bsd/netkey/key.c xnu-517/bsd/netkey/key.c --- xnu-344.49/bsd/netkey/key.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netkey/key.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ -/* $FreeBSD: src/sys/netkey/key.c,v 1.16.2.5 2001/07/03 11:01:58 ume Exp $ */ -/* $KAME: key.c,v 1.187 2001/05/24 07:41:22 itojun Exp $ */ +/* $FreeBSD: src/sys/netkey/key.c,v 1.16.2.13 2002/07/24 18:17:40 ume Exp $ */ +/* $KAME: key.c,v 1.191 2001/06/27 10:46:49 sakane Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
@@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -108,6 +109,8 @@ #define satosin(s) ((struct sockaddr_in *)s) #endif +#define FULLMASK 0xff + /* * Note on SA reference counting: * - SAs that are not in DEAD state will have (total external reference + 1) @@ -128,9 +131,12 @@ static u_int key_larval_lifetime = 30; /* interval to expire acquiring, 30(s)*/ static int key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/ static int key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/ +static int key_preferred_oldsa = 0; /* preferred old sa rather than new sa.*/ +static int natt_keepalive_interval = 29; /* interval between natt keepalives.*/ static u_int32_t acq_seq = 0; static int key_tick_init_random = 0; +__private_extern__ u_int32_t natt_now = 0; static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */ static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ @@ -144,18 +150,17 @@ struct key_cb key_cb; /* search order for SAs */ -static u_int saorder_state_valid[] = { +static const u_int saorder_state_valid_prefer_old[] = { SADB_SASTATE_DYING, SADB_SASTATE_MATURE, - /* - * This order is important because we must select a oldest SA - * for outbound processing. For inbound, This is not important. 
- */ }; -static u_int saorder_state_alive[] = { +static const u_int saorder_state_valid_prefer_new[] = { + SADB_SASTATE_MATURE, SADB_SASTATE_DYING, +}; +static const u_int saorder_state_alive[] = { /* except DEAD */ SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL }; -static u_int saorder_state_any[] = { +static const u_int saorder_state_any[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL, SADB_SASTATE_DEAD }; @@ -184,7 +189,7 @@ }; static const int maxsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ - sizeof(struct sadb_sa), /* SADB_EXT_SA */ + sizeof(struct sadb_sa_2), /* SADB_EXT_SA */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */ @@ -243,6 +248,10 @@ SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW, \ &key_blockacq_lifetime, 0, ""); +/* ESP auth */ +SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW, \ + &ipsec_esp_auth, 0, ""); + /* minimum ESP key length */ SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW, \ &ipsec_esp_keymin, 0, ""); @@ -251,6 +260,14 @@ SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW, \ &ipsec_ah_keymin, 0, ""); +/* perfered old SA rather than new SA */ +SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, prefered_oldsa, CTLFLAG_RW,\ + &key_preferred_oldsa, 0, ""); + +/* time between NATT keepalives in seconds, 0 disabled */ +SYSCTL_INT(_net_key, KEYCTL_NATT_KEEPALIVE_INTERVAL, natt_keepalive_interval, CTLFLAG_RW,\ + &natt_keepalive_interval, 0, ""); + #ifndef LIST_FOREACH #define LIST_FOREACH(elm, head, field) \ for (elm = LIST_FIRST(head); elm; elm = LIST_NEXT(elm, field)) @@ -271,20 +288,20 @@ #define KEY_CHKSASTATE(head, sav, name) \ do { \ - if ((head) != (sav)) { \ - printf("%s: state mismatched (TREE=%d SA=%d)\n", \ - (name), (head), (sav)); \ - continue; \ - } \ + if ((head) != (sav)) { \ + 
ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \ + (name), (head), (sav))); \ + continue; \ + } \ } while (0) #define KEY_CHKSPDIR(head, sp, name) \ do { \ - if ((head) != (sp)) { \ - printf("%s: direction mismatched (TREE=%d SP=%d), " \ - "anyway continue.\n", \ - (name), (head), (sp)); \ - } \ + if ((head) != (sp)) { \ + ipseclog((LOG_DEBUG, "%s: direction mismatched (TREE=%d SP=%d), " \ + "anyway continue.\n", \ + (name), (head), (sp))); \ + } \ } while (0) #if 1 @@ -396,21 +413,22 @@ static struct mbuf *key_setsadbident __P((u_int16_t, u_int16_t, caddr_t, int, u_int64_t)); #endif -static struct mbuf *key_setsadbxsa2(u_int8_t, u_int32_t); +static struct mbuf *key_setsadbxsa2 __P((u_int8_t, u_int32_t, u_int32_t)); static struct mbuf *key_setsadbxpolicy __P((u_int16_t, u_int8_t, u_int32_t)); static void *key_newbuf __P((const void *, u_int)); #if INET6 static int key_ismyaddr6 __P((struct sockaddr_in6 *)); #endif -static int key_cmpsaidx_exactly - __P((struct secasindex *, struct secasindex *)); -static int key_cmpsaidx_withmode - __P((struct secasindex *, struct secasindex *)); -static int key_cmpsaidx_withoutmode2 - __P((struct secasindex *, struct secasindex *)); -static int key_cmpsaidx_withoutmode - __P((struct secasindex *, struct secasindex *)); + +/* flags for key_cmpsaidx() */ +#define CMP_HEAD 1 /* protocol, addresses. */ +#define CMP_MODE_REQID 2 /* additionally HEAD, reqid, mode. */ +#define CMP_REQID 3 /* additionally HEAD, reaid. */ +#define CMP_EXACTLY 4 /* all elements. 
*/ +static int key_cmpsaidx + __P((struct secasindex *, struct secasindex *, int)); + static int key_cmpspidx_exactly __P((struct secpolicyindex *, struct secpolicyindex *)); static int key_cmpspidx_withmask @@ -479,6 +497,7 @@ static struct mbuf *key_alloc_mbuf __P((int)); extern int ipsec_bypass; +void ipsec_send_natt_keepalive(struct secasvar *sav); /* %%% IPsec policy management */ /* @@ -561,6 +580,12 @@ struct sockaddr *os, *od, *is, *id; struct secpolicyindex spidx; + if (isrc->sa_family != idst->sa_family) { + ipseclog((LOG_ERR, "protocol family mismatched %d != %d\n.", + isrc->sa_family, idst->sa_family)); + return NULL; + } + s = splnet(); /*called from softclock()*/ LIST_FOREACH(sp, &sptree[dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) @@ -692,11 +717,9 @@ /* there is no SA */ if ((error = key_acquire(saidx, isr->sp)) != 0) { - /* XXX What I do ? */ -#if IPSEC_DEBUG - printf("key_checkrequest: error %d returned " - "from key_acquire.\n", error); -#endif + /* XXX What should I do ? */ + ipseclog((LOG_DEBUG, "key_checkrequest: error %d returned " + "from key_acquire.\n", error)); return error; } @@ -716,11 +739,13 @@ struct secashead *sah; struct secasvar *sav; u_int stateidx, state; + const u_int *saorder_state_valid; + int arraysize; LIST_FOREACH(sah, &sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; - if (key_cmpsaidx_withmode(&sah->saidx, saidx)) + if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID)) goto found; } @@ -728,10 +753,19 @@ found: - /* search valid state */ - for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_valid); - stateidx++) { + /* + * search a valid state list for outbound packet. + * This search order is important. 
+ */ + if (key_preferred_oldsa) { + saorder_state_valid = saorder_state_valid_prefer_old; + arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); + } else { + saorder_state_valid = saorder_state_valid_prefer_new; + arraysize = _ARRAYLEN(saorder_state_valid_prefer_new); + } + + for (stateidx = 0; stateidx < arraysize; stateidx++) { state = saorder_state_valid[stateidx]; @@ -755,12 +789,16 @@ struct secashead *sah; u_int state; { - struct secasvar *sav, *candidate; + struct secasvar *sav, *nextsav, *candidate, *d; /* initilize */ candidate = NULL; - LIST_FOREACH(sav, &sah->savtree[state], chain) { + for (sav = LIST_FIRST(&sah->savtree[state]); + sav != NULL; + sav = nextsav) { + + nextsav = LIST_NEXT(sav, chain); /* sanity check */ KEY_CHKSASTATE(sav->state, state, "key_do_allocsa_policy"); @@ -778,11 +816,82 @@ panic("key_do_allocsa_policy: " "lifetime_current is NULL.\n"); - /* XXX What the best method is to compare ? */ - if (candidate->lft_c->sadb_lifetime_addtime > + /* What the best method is to compare ? */ + if (key_preferred_oldsa) { + if (candidate->lft_c->sadb_lifetime_addtime > + sav->lft_c->sadb_lifetime_addtime) { + candidate = sav; + } + continue; + /*NOTREACHED*/ + } + + /* prefered new sa rather than old sa */ + if (candidate->lft_c->sadb_lifetime_addtime < sav->lft_c->sadb_lifetime_addtime) { + d = candidate; candidate = sav; - continue; + } else + d = sav; + + /* + * prepared to delete the SA when there is more + * suitable candidate and the lifetime of the SA is not + * permanent. + */ + if (d->lft_c->sadb_lifetime_addtime != 0) { + struct mbuf *m, *result; + + key_sa_chgstate(d, SADB_SASTATE_DEAD); + + m = key_setsadbmsg(SADB_DELETE, 0, + d->sah->saidx.proto, 0, 0, d->refcnt - 1); + if (!m) + goto msgfail; + result = m; + + /* set sadb_address for saidx's. 
*/ + m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, + (struct sockaddr *)&d->sah->saidx.src, + d->sah->saidx.src.ss_len << 3, + IPSEC_ULPROTO_ANY); + if (!m) + goto msgfail; + m_cat(result, m); + + /* set sadb_address for saidx's destination. */ + m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, + (struct sockaddr *)&d->sah->saidx.dst, + d->sah->saidx.dst.ss_len << 3, + IPSEC_ULPROTO_ANY); + if (!m) + goto msgfail; + m_cat(result, m); + + /* create SA extension */ + m = key_setsadbsa(d); + if (!m) + goto msgfail; + m_cat(result, m); + + if (result->m_len < sizeof(struct sadb_msg)) { + result = m_pullup(result, + sizeof(struct sadb_msg)); + if (result == NULL) + goto msgfail; + } + + result->m_pkthdr.len = 0; + for (m = result; m; m = m->m_next) + result->m_pkthdr.len += m->m_len; + mtod(result, struct sadb_msg *)->sadb_msg_len = + PFKEY_UNIT64(result->m_pkthdr.len); + + if (key_sendup_mbuf(NULL, result, + KEY_SENDUP_REGISTERED)) + goto msgfail; + msgfail: + key_freesav(d); } } @@ -823,12 +932,26 @@ struct sockaddr_in sin; struct sockaddr_in6 sin6; int s; + const u_int *saorder_state_valid; + int arraysize; /* sanity check */ if (src == NULL || dst == NULL) panic("key_allocsa: NULL pointer is passed.\n"); /* + * when both systems employ similar strategy to use a SA. + * the search order is important even in the inbound case. + */ + if (key_preferred_oldsa) { + saorder_state_valid = saorder_state_valid_prefer_old; + arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); + } else { + saorder_state_valid = saorder_state_valid_prefer_new; + arraysize = _ARRAYLEN(saorder_state_valid_prefer_new); + } + + /* * searching SAD. * XXX: to be checked internal IP header somewhere. Also when * IPsec tunnel packet is received.
But ESP tunnel mode is @@ -836,10 +959,11 @@ */ s = splnet(); /*called from softclock()*/ LIST_FOREACH(sah, &sahtree, chain) { - /* search valid state */ - for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_valid); - stateidx++) { + /* + * search a valid state list for inbound packet. + * the search order is not important. + */ + for (stateidx = 0; stateidx < arraysize; stateidx++) { state = saorder_state_valid[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { /* sanity check */ @@ -883,8 +1007,9 @@ continue; break; default: - printf("key_allocsa: unknown address family=%d.\n", - family); + ipseclog((LOG_DEBUG, "key_allocsa: " + "unknown address family=%d.\n", + family)); continue; } @@ -919,8 +1044,9 @@ continue; break; default: - printf("key_allocsa: unknown address family=%d.\n", - family); + ipseclog((LOG_DEBUG, "key_allocsa: " + "unknown address family=%d.\n", + family)); continue; } @@ -1015,10 +1141,8 @@ break; #endif /* INET6 */ default: -#if IPSEC_DEBUG - printf("key_freeso: unknown address family=%d.\n", - so->so_proto->pr_domain->dom_family); -#endif + ipseclog((LOG_DEBUG, "key_freeso: unknown address family=%d.\n", + so->so_proto->pr_domain->dom_family)); return; } @@ -1216,9 +1340,7 @@ if (len < sizeof(*xpl0)) panic("key_msg2sp: invalid length.\n"); if (len != PFKEY_EXTLEN(xpl0)) { -#if IPSEC_DEBUG - printf("key_msg2sp: Invalid msg length.\n"); -#endif + ipseclog((LOG_DEBUG, "key_msg2sp: Invalid msg length.\n")); *error = EINVAL; return NULL; } @@ -1248,9 +1370,8 @@ /* validity check */ if (PFKEY_EXTLEN(xpl0) < sizeof(*xpl0)) { -#if IPSEC_DEBUG - printf("key_msg2sp: Invalid msg length.\n"); -#endif + ipseclog((LOG_DEBUG, + "key_msg2sp: Invalid msg length.\n")); key_freesp(newsp); *error = EINVAL; return NULL; @@ -1263,10 +1384,8 @@ /* length check */ if (xisr->sadb_x_ipsecrequest_len < sizeof(*xisr)) { -#if IPSEC_DEBUG - printf("key_msg2sp: " - "invalid ipsecrequest length.\n"); -#endif + ipseclog((LOG_DEBUG, "key_msg2sp: " + "invalid 
ipsecrequest length.\n")); key_freesp(newsp); *error = EINVAL; return NULL; @@ -1275,9 +1394,8 @@ /* allocate request buffer */ KMALLOC(*p_isr, struct ipsecrequest *, sizeof(**p_isr)); if ((*p_isr) == NULL) { -#if IPSEC_DEBUG - printf("key_msg2sp: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, + "key_msg2sp: No more memory.\n")); key_freesp(newsp); *error = ENOBUFS; return NULL; @@ -1293,10 +1411,9 @@ case IPPROTO_IPCOMP: break; default: -#if IPSEC_DEBUG - printf("key_msg2sp: invalid proto type=%u\n", - xisr->sadb_x_ipsecrequest_proto); -#endif + ipseclog((LOG_DEBUG, + "key_msg2sp: invalid proto type=%u\n", + xisr->sadb_x_ipsecrequest_proto)); key_freesp(newsp); *error = EPROTONOSUPPORT; return NULL; @@ -1309,10 +1426,9 @@ break; case IPSEC_MODE_ANY: default: -#if IPSEC_DEBUG - printf("key_msg2sp: invalid mode=%u\n", - xisr->sadb_x_ipsecrequest_mode); -#endif + ipseclog((LOG_DEBUG, + "key_msg2sp: invalid mode=%u\n", + xisr->sadb_x_ipsecrequest_mode)); key_freesp(newsp); *error = EINVAL; return NULL; @@ -1332,12 +1448,10 @@ */ if (xisr->sadb_x_ipsecrequest_reqid > IPSEC_MANUAL_REQID_MAX) { -#if IPSEC_DEBUG - printf("key_msg2sp: reqid=%d " - "range violation, " - "updated by kernel.\n", - xisr->sadb_x_ipsecrequest_reqid); -#endif + ipseclog((LOG_DEBUG, + "key_msg2sp: reqid=%d range " + "violation, updated by kernel.\n", + xisr->sadb_x_ipsecrequest_reqid)); xisr->sadb_x_ipsecrequest_reqid = 0; } @@ -1359,10 +1473,8 @@ break; default: -#if IPSEC_DEBUG - printf("key_msg2sp: invalid level=%u\n", - xisr->sadb_x_ipsecrequest_level); -#endif + ipseclog((LOG_DEBUG, "key_msg2sp: invalid level=%u\n", + xisr->sadb_x_ipsecrequest_level)); key_freesp(newsp); *error = EINVAL; return NULL; @@ -1378,10 +1490,8 @@ /* validity check */ if (paddr->sa_len > sizeof((*p_isr)->saidx.src)) { -#if IPSEC_DEBUG - printf("key_msg2sp: invalid request " - "address length.\n"); -#endif + ipseclog((LOG_DEBUG, "key_msg2sp: invalid request " + "address length.\n")); key_freesp(newsp); *error = 
EINVAL; return NULL; @@ -1395,10 +1505,8 @@ /* validity check */ if (paddr->sa_len > sizeof((*p_isr)->saidx.dst)) { -#if IPSEC_DEBUG - printf("key_msg2sp: invalid request " - "address length.\n"); -#endif + ipseclog((LOG_DEBUG, "key_msg2sp: invalid request " + "address length.\n")); key_freesp(newsp); *error = EINVAL; return NULL; @@ -1416,9 +1524,7 @@ /* validity check */ if (tlen < 0) { -#if IPSEC_DEBUG - printf("key_msg2sp: becoming tlen < 0.\n"); -#endif + ipseclog((LOG_DEBUG, "key_msg2sp: becoming tlen < 0.\n")); key_freesp(newsp); *error = EINVAL; return NULL; @@ -1430,9 +1536,7 @@ } break; default: -#if IPSEC_DEBUG - printf("key_msg2sp: invalid policy type.\n"); -#endif + ipseclog((LOG_DEBUG, "key_msg2sp: invalid policy type.\n")); key_freesp(newsp); *error = EINVAL; return NULL; @@ -1632,25 +1736,19 @@ if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_X_EXT_POLICY] == NULL) { -#if IPSEC_DEBUG - printf("key_spdadd: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { -#if IPSEC_DEBUG - printf("key_spdadd: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(struct sadb_lifetime)) { -#if IPSEC_DEBUG - printf("key_spdadd: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD]; @@ -1676,9 +1774,7 @@ case IPSEC_DIR_OUTBOUND: break; default: -#if IPSEC_DEBUG - 
printf("key_spdadd: Invalid SP direction.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdadd: Invalid SP direction.\n")); mhp->msg->sadb_msg_errno = EINVAL; return 0; } @@ -1687,9 +1783,7 @@ /* key_spdadd() accepts DISCARD, NONE and IPSEC. */ if (xpl0->sadb_x_policy_type == IPSEC_POLICY_ENTRUST || xpl0->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { -#if IPSEC_DEBUG - printf("key_spdadd: Invalid policy type.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdadd: Invalid policy type.\n")); return key_senderror(so, m, EINVAL); } @@ -1697,34 +1791,26 @@ if (mhp->msg->sadb_msg_type != SADB_X_SPDSETIDX && xpl0->sadb_x_policy_type == IPSEC_POLICY_IPSEC && mhp->extlen[SADB_X_EXT_POLICY] <= sizeof(*xpl0)) { -#if IPSEC_DEBUG - printf("key_spdadd: some policy requests part required.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdadd: some policy requests part required.\n")); return key_senderror(so, m, EINVAL); } /* * checking there is SP already or not. - * If type is SPDUPDATE and no SP found, then error. - * If type is either SPDADD or SPDSETIDX and SP found, then error. + * SPDUPDATE doesn't depend on whether there is a SP or not. + * If the type is either SPDADD or SPDSETIDX AND a SP is found, + * then error. */ newsp = key_getsp(&spidx); if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) { - if (newsp == NULL) { -#if IPSEC_DEBUG - printf("key_spdadd: no SP found.\n"); -#endif - return key_senderror(so, m, ENOENT); + if (newsp) { + newsp->state = IPSEC_SPSTATE_DEAD; + key_freesp(newsp); } - - newsp->state = IPSEC_SPSTATE_DEAD; - key_freesp(newsp); } else { if (newsp != NULL) { key_freesp(newsp); -#if IPSEC_DEBUG - printf("key_spdadd: a SP entry exists already.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdadd: a SP entry exists already.\n")); return key_senderror(so, m, EEXIST); } } @@ -1865,7 +1951,7 @@ /* when requesting to allocate spi ranged */ while (count--) { - newid = (policy_id = (policy_id == ~0 ? 1 : ++policy_id)); + newid = (policy_id = (policy_id == ~0 ? 
1 : policy_id + 1)); if ((sp = key_getspbyid(newid)) == NULL) break; @@ -1874,9 +1960,7 @@ } if (count == 0 || newid == 0) { -#if IPSEC_DEBUG - printf("key_getnewspid: to allocate policy id is failed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_getnewspid: to allocate policy id is failed.\n")); return 0; } @@ -1913,17 +1997,13 @@ if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_X_EXT_POLICY] == NULL) { -#if IPSEC_DEBUG - printf("key_spddelete: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spddelete: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { -#if IPSEC_DEBUG - printf("key_spddelete: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spddelete: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -1947,17 +2027,13 @@ case IPSEC_DIR_OUTBOUND: break; default: -#if IPSEC_DEBUG - printf("key_spddelete: Invalid SP direction.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spddelete: Invalid SP direction.\n")); return key_senderror(so, m, EINVAL); } /* Is there SP in SPD ? */ if ((sp = key_getsp(&spidx)) == NULL) { -#if IPSEC_DEBUG - printf("key_spddelete: no SP found.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spddelete: no SP found.\n")); return key_senderror(so, m, EINVAL); } @@ -2014,9 +2090,7 @@ if (mhp->ext[SADB_X_EXT_POLICY] == NULL || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { -#if IPSEC_DEBUG - printf("key_spddelete2: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spddelete2: invalid message is passed.\n")); key_senderror(so, m, EINVAL); return 0; } @@ -2025,9 +2099,7 @@ /* Is there SP in SPD ? 
*/ if ((sp = key_getspbyid(id)) == NULL) { -#if IPSEC_DEBUG - printf("key_spddelete2: no SP found id:%u.\n", id); -#endif + ipseclog((LOG_DEBUG, "key_spddelete2: no SP found id:%u.\n", id)); key_senderror(so, m, EINVAL); } @@ -2115,9 +2187,7 @@ if (mhp->ext[SADB_X_EXT_POLICY] == NULL || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { -#if IPSEC_DEBUG - printf("key_spdget: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdget: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -2125,9 +2195,7 @@ /* Is there SP in SPD ? */ if ((sp = key_getspbyid(id)) == NULL) { -#if IPSEC_DEBUG - printf("key_spdget: no SP found id:%u.\n", id); -#endif + ipseclog((LOG_DEBUG, "key_spdget: no SP found id:%u.\n", id)); return key_senderror(so, m, ENOENT); } @@ -2248,9 +2316,7 @@ } if (sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) { -#if IPSEC_DEBUG - printf("key_spdflush: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_spdflush: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } @@ -2646,9 +2712,7 @@ KMALLOC(newsav, struct secasvar *, sizeof(struct secasvar)); if (newsav == NULL) { -#if IPSEC_DEBUG - printf("key_newsa: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_newsa: No more memory.\n")); *errp = ENOBUFS; return NULL; } @@ -2672,9 +2736,7 @@ /* sanity check */ if (mhp->ext[SADB_EXT_SA] == NULL) { KFREE(newsav); -#if IPSEC_DEBUG - printf("key_newsa: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_newsa: invalid message is passed.\n")); *errp = EINVAL; return NULL; } @@ -2790,7 +2852,7 @@ LIST_FOREACH(sah, &sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; - if (key_cmpsaidx_withoutmode2(&sah->saidx, saidx)) + if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID)) return sah; } @@ -2814,9 +2876,7 @@ /* check address family */ if (saidx->src.ss_family != saidx->dst.ss_family) { -#if IPSEC_DEBUG - printf("key_checkspidup: address family mismatched.\n"); 
-#endif + ipseclog((LOG_DEBUG, "key_checkspidup: address family mismatched.\n")); return NULL; } @@ -2856,12 +2916,9 @@ /* sanity check */ if (sav->state != state) { -#if IPSEC_DEBUG - printf("key_getsavbyspi: " - "invalid sav->state " - "(queue: %d SA: %d)\n", - state, sav->state); -#endif + ipseclog((LOG_DEBUG, "key_getsavbyspi: " + "invalid sav->state (queue: %d SA: %d)\n", + state, sav->state)); continue; } @@ -2907,6 +2964,8 @@ sav->lft_c = NULL; sav->lft_h = NULL; sav->lft_s = NULL; + sav->remote_ike_port = 0; + sav->natt_last_activity = natt_now; /* SA */ if (mhp->ext[SADB_EXT_SA] != NULL) { @@ -2921,14 +2980,25 @@ sav->alg_auth = sa0->sadb_sa_auth; sav->alg_enc = sa0->sadb_sa_encrypt; sav->flags = sa0->sadb_sa_flags; + + /* + * Verify that a nat-traversal port was specified if + * the nat-traversal flag is set. + */ + if ((sav->flags & SADB_X_EXT_NATT) != 0) { + if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa_2) || + ((struct sadb_sa_2*)(sa0))->sadb_sa_natt_port == 0) { + error = EINVAL; + goto fail; + } + sav->remote_ike_port = ((struct sadb_sa_2*)(sa0))->sadb_sa_natt_port; + } /* replay window */ if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0) { sav->replay = keydb_newsecreplay(sa0->sadb_sa_replay); if (sav->replay == NULL) { -#if IPSEC_DEBUG - printf("key_setsaval: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); error = ENOBUFS; goto fail; } @@ -2961,17 +3031,13 @@ break; } if (error) { -#if IPSEC_DEBUG - printf("key_setsaval: invalid key_auth values.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: invalid key_auth values.\n")); goto fail; } sav->key_auth = (struct sadb_key *)key_newbuf(key0, len); if (sav->key_auth == NULL) { -#if IPSEC_DEBUG - printf("key_setsaval: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); error = ENOBUFS; goto fail; } @@ -2999,9 +3065,7 @@ } sav->key_enc = (struct sadb_key *)key_newbuf(key0, len); if (sav->key_enc == NULL) { -#if 
IPSEC_DEBUG - printf("key_setsaval: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); error = ENOBUFS; goto fail; } @@ -3017,9 +3081,7 @@ break; } if (error) { -#if IPSEC_DEBUG - printf("key_setsatval: invalid key_enc value.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsatval: invalid key_enc value.\n")); goto fail; } } @@ -3037,9 +3099,7 @@ break; KMALLOC(sav->iv, caddr_t, sav->ivlen); if (sav->iv == 0) { -#if IPSEC_DEBUG - printf("key_setsaval: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); error = ENOBUFS; goto fail; } @@ -3052,9 +3112,7 @@ case SADB_X_SATYPE_IPCOMP: break; default: -#if IPSEC_DEBUG - printf("key_setsaval: invalid SA type.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: invalid SA type.\n")); error = EINVAL; goto fail; } @@ -3067,9 +3125,7 @@ KMALLOC(sav->lft_c, struct sadb_lifetime *, sizeof(struct sadb_lifetime)); if (sav->lft_c == NULL) { -#if IPSEC_DEBUG - printf("key_setsaval: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); error = ENOBUFS; goto fail; } @@ -3097,9 +3153,7 @@ sav->lft_h = (struct sadb_lifetime *)key_newbuf(lft0, sizeof(*lft0)); if (sav->lft_h == NULL) { -#if IPSEC_DEBUG - printf("key_setsaval: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); error = ENOBUFS; goto fail; } @@ -3115,9 +3169,7 @@ sav->lft_s = (struct sadb_lifetime *)key_newbuf(lft0, sizeof(*lft0)); if (sav->lft_s == NULL) { -#if IPSEC_DEBUG - printf("key_setsaval: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); error = ENOBUFS; goto fail; } @@ -3134,14 +3186,17 @@ sav->replay = NULL; } if (sav->key_auth != NULL) { + bzero(_KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); KFREE(sav->key_auth); sav->key_auth = NULL; } if (sav->key_enc != NULL) { + bzero(_KEYBUF(sav->key_enc), _KEYLEN(sav->key_enc)); KFREE(sav->key_enc); sav->key_enc = NULL; } if (sav->sched) { + 
bzero(sav->sched, sav->schedlen); KFREE(sav->sched); sav->sched = NULL; } @@ -3185,10 +3240,9 @@ case IPPROTO_ESP: case IPPROTO_AH: if (ntohl(sav->spi) >= 0 && ntohl(sav->spi) <= 255) { -#if IPSEC_DEBUG - printf("key_mature: illegal range of SPI %u.\n", - (u_int32_t)ntohl(sav->spi)); -#endif + ipseclog((LOG_DEBUG, + "key_mature: illegal range of SPI %u.\n", + (u_int32_t)ntohl(sav->spi))); return EINVAL; } break; @@ -3200,10 +3254,8 @@ /* check flags */ if ((sav->flags & SADB_X_EXT_OLD) && (sav->flags & SADB_X_EXT_DERIV)) { -#if IPSEC_DEBUG - printf("key_mature: " - "invalid flag (derived) given to old-esp.\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: " + "invalid flag (derived) given to old-esp.\n")); return EINVAL; } if (sav->alg_auth == SADB_AALG_NONE) @@ -3215,17 +3267,13 @@ case IPPROTO_AH: /* check flags */ if (sav->flags & SADB_X_EXT_DERIV) { -#if IPSEC_DEBUG - printf("key_mature: " - "invalid flag (derived) given to AH SA.\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: " + "invalid flag (derived) given to AH SA.\n")); return EINVAL; } if (sav->alg_enc != SADB_EALG_NONE) { -#if IPSEC_DEBUG - printf("key_mature: " - "protocol and algorithm mismated.\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: " + "protocol and algorithm mismated.\n")); return(EINVAL); } checkmask = 2; @@ -3233,26 +3281,20 @@ break; case IPPROTO_IPCOMP: if (sav->alg_auth != SADB_AALG_NONE) { -#if IPSEC_DEBUG - printf("key_mature: " - "protocol and algorithm mismated.\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: " + "protocol and algorithm mismated.\n")); return(EINVAL); } if ((sav->flags & SADB_X_EXT_RAWCPI) == 0 && ntohl(sav->spi) >= 0x10000) { -#if IPSEC_DEBUG - printf("key_mature: invalid cpi for IPComp.\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: invalid cpi for IPComp.\n")); return(EINVAL); } checkmask = 4; mustmask = 4; break; default: -#if IPSEC_DEBUG - printf("key_mature: Invalid satype.\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: Invalid satype.\n")); return 
EPROTONOSUPPORT; } @@ -3263,10 +3305,8 @@ algo = ah_algorithm_lookup(sav->alg_auth); if (!algo) { -#if IPSEC_DEBUG - printf("key_mature: " - "unknown authentication algorithm.\n"); -#endif + ipseclog((LOG_DEBUG,"key_mature: " + "unknown authentication algorithm.\n")); return EINVAL; } @@ -3276,11 +3316,10 @@ else keylen = 0; if (keylen < algo->keymin || algo->keymax < keylen) { -#if IPSEC_DEBUG - printf("key_mature: invalid AH key length %d " - "(%d-%d allowed)\n", keylen, - algo->keymin, algo->keymax); -#endif + ipseclog((LOG_DEBUG, + "key_mature: invalid AH key length %d " + "(%d-%d allowed)\n", + keylen, algo->keymin, algo->keymax)); return EINVAL; } @@ -3293,9 +3332,7 @@ } if ((mustmask & 2) != 0 && mature != SADB_SATYPE_AH) { -#if IPSEC_DEBUG - printf("key_mature: no satisfy algorithm for AH\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: no satisfy algorithm for AH\n")); return EINVAL; } } @@ -3308,9 +3345,7 @@ algo = esp_algorithm_lookup(sav->alg_enc); if (!algo) { -#if IPSEC_DEBUG - printf("key_mature: unknown encryption algorithm.\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: unknown encryption algorithm.\n")); return EINVAL; } @@ -3320,11 +3355,10 @@ else keylen = 0; if (keylen < algo->keymin || algo->keymax < keylen) { -#if IPSEC_DEBUG - printf("key_mature: invalid ESP key length %d " - "(%d-%d allowed)\n", keylen, - algo->keymin, algo->keymax); -#endif + ipseclog((LOG_DEBUG, + "key_mature: invalid ESP key length %d " + "(%d-%d allowed)\n", + keylen, algo->keymin, algo->keymax)); return EINVAL; } @@ -3337,15 +3371,11 @@ } if ((mustmask & 1) != 0 && mature != SADB_SATYPE_ESP) { -#if IPSEC_DEBUG - printf("key_mature: no satisfy algorithm for ESP\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: no satisfy algorithm for ESP\n")); return EINVAL; } #else /*IPSEC_ESP*/ -#if IPSEC_DEBUG - printf("key_mature: ESP not supported in this configuration\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: ESP not supported in this configuration\n")); return EINVAL; 
#endif } @@ -3357,9 +3387,7 @@ /* algorithm-dependent check */ algo = ipcomp_algorithm_lookup(sav->alg_enc); if (!algo) { -#if IPSEC_DEBUG - printf("key_mature: unknown compression algorithm.\n"); -#endif + ipseclog((LOG_DEBUG, "key_mature: unknown compression algorithm.\n")); return EINVAL; } } @@ -3408,6 +3436,7 @@ case SADB_X_EXT_SA2: m = key_setsadbxsa2(sav->sah->saidx.mode, + sav->replay ? sav->replay->count : 0, sav->sah->saidx.reqid); if (!m) goto fail; @@ -3416,7 +3445,7 @@ case SADB_EXT_ADDRESS_SRC: m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, (struct sockaddr *)&sav->sah->saidx.src, - sav->sah->saidx.src.ss_len << 3, IPSEC_ULPROTO_ANY); + FULLMASK, IPSEC_ULPROTO_ANY); if (!m) goto fail; break; @@ -3424,7 +3453,7 @@ case SADB_EXT_ADDRESS_DST: m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, (struct sockaddr *)&sav->sah->saidx.dst, - sav->sah->saidx.dst.ss_len << 3, IPSEC_ULPROTO_ANY); + FULLMASK, IPSEC_ULPROTO_ANY); if (!m) goto fail; break; @@ -3626,6 +3655,18 @@ p->sadb_address_len = PFKEY_UNIT64(len); p->sadb_address_exttype = exttype; p->sadb_address_proto = ul_proto; + if (prefixlen == FULLMASK) { + switch (saddr->sa_family) { + case AF_INET: + prefixlen = sizeof(struct in_addr) << 3; + break; + case AF_INET6: + prefixlen = sizeof(struct in6_addr) << 3; + break; + default: + ; /*XXX*/ + } + } p->sadb_address_prefixlen = prefixlen; p->sadb_address_reserved = 0; @@ -3680,9 +3721,9 @@ * set data into sadb_x_sa2. 
*/ static struct mbuf * -key_setsadbxsa2(mode, reqid) +key_setsadbxsa2(mode, seq, reqid) u_int8_t mode; - u_int32_t reqid; + u_int32_t seq, reqid; { struct mbuf *m; struct sadb_x_sa2 *p; @@ -3704,7 +3745,7 @@ p->sadb_x_sa2_mode = mode; p->sadb_x_sa2_reserved1 = 0; p->sadb_x_sa2_reserved2 = 0; - p->sadb_x_sa2_reserved3 = 0; + p->sadb_x_sa2_sequence = seq; p->sadb_x_sa2_reqid = reqid; return m; @@ -3756,9 +3797,7 @@ KMALLOC(new, caddr_t, len); if (new == NULL) { -#if IPSEC_DEBUG - printf("key_newbuf: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_newbuf: No more memory.\n")); return NULL; } bcopy(src, new, len); @@ -3850,96 +3889,21 @@ #endif /*INET6*/ /* - * compare two secasindex structure exactly. - * IN: - * saidx0: source, it can be in SAD. - * saidx1: object. - * OUT: - * 1 : equal - * 0 : not equal - */ -static int -key_cmpsaidx_exactly(saidx0, saidx1) - struct secasindex *saidx0, *saidx1; -{ - /* sanity */ - if (saidx0 == NULL && saidx1 == NULL) - return 1; - - if (saidx0 == NULL || saidx1 == NULL) - return 0; - - if (saidx0->proto != saidx1->proto - || saidx0->mode != saidx1->mode - || saidx0->reqid != saidx1->reqid) - return 0; - - if (bcmp(&saidx0->src, &saidx1->src, saidx0->src.ss_len) != 0 || - bcmp(&saidx0->dst, &saidx1->dst, saidx0->dst.ss_len) != 0) - return 0; - - return 1; -} - -/* - * compare two secasindex structure with consideration mode. - * don't compare port. - * IN: - * saidx0: source, it is often in SAD. - * saidx1: object, it is often from SPD. - * OUT: - * 1 : equal - * 0 : not equal - */ -static int -key_cmpsaidx_withmode(saidx0, saidx1) - struct secasindex *saidx0, *saidx1; -{ - /* sanity */ - if (saidx0 == NULL && saidx1 == NULL) - return 1; - - if (saidx0 == NULL || saidx1 == NULL) - return 0; - - if (saidx0->proto != saidx1->proto) - return 0; - - /* - * If reqid of SPD is non-zero, unique SA is required. - * The result must be of same reqid in this case. 
- */ - if (saidx1->reqid != 0 && saidx0->reqid != saidx1->reqid) - return 0; - - if (saidx0->mode != IPSEC_MODE_ANY && saidx0->mode != saidx1->mode) - return 0; - - if (key_sockaddrcmp((struct sockaddr *)&saidx0->src, - (struct sockaddr *)&saidx1->src, 0) != 0) { - return 0; - } - if (key_sockaddrcmp((struct sockaddr *)&saidx0->dst, - (struct sockaddr *)&saidx1->dst, 0) != 0) { - return 0; - } - - return 1; -} - -/* - * compare two secasindex structure without mode, but think reqid. + * compare two secasindex structure. + * flag can specify to compare 2 saidxes. + * compare two secasindex structure without both mode and reqid. * don't compare port. - * IN: - * saidx0: source, it is often in SAD. - * saidx1: object, it is often from user. - * OUT: - * 1 : equal - * 0 : not equal + * IN: + * saidx0: source, it can be in SAD. + * saidx1: object. + * OUT: + * 1 : equal + * 0 : not equal */ static int -key_cmpsaidx_withoutmode2(saidx0, saidx1) +key_cmpsaidx(saidx0, saidx1, flag) struct secasindex *saidx0, *saidx1; + int flag; { /* sanity */ if (saidx0 == NULL && saidx1 == NULL) @@ -3951,56 +3915,41 @@ if (saidx0->proto != saidx1->proto) return 0; - /* - * If reqid of SPD is non-zero, unique SA is required. - * The result must be of same reqid in this case. - */ - if (saidx1->reqid != 0 && saidx0->reqid != saidx1->reqid) - return 0; - - if (key_sockaddrcmp((struct sockaddr *)&saidx0->src, - (struct sockaddr *)&saidx1->src, 0) != 0) { - return 0; - } - if (key_sockaddrcmp((struct sockaddr *)&saidx0->dst, - (struct sockaddr *)&saidx1->dst, 0) != 0) { - return 0; - } - - return 1; -} - -/* - * compare two secasindex structure without both mode and reqid. - * don't compare port. - * IN: - * saidx0: source, it is often in SAD. - * saidx1: object, it is often from user. 
- * OUT: - * 1 : equal - * 0 : not equal - */ -static int -key_cmpsaidx_withoutmode(saidx0, saidx1) - struct secasindex *saidx0, *saidx1; -{ - /* sanity */ - if (saidx0 == NULL && saidx1 == NULL) - return 1; + if (flag == CMP_EXACTLY) { + if (saidx0->mode != saidx1->mode) + return 0; + if (saidx0->reqid != saidx1->reqid) + return 0; + if (bcmp(&saidx0->src, &saidx1->src, saidx0->src.ss_len) != 0 || + bcmp(&saidx0->dst, &saidx1->dst, saidx0->dst.ss_len) != 0) + return 0; + } else { - if (saidx0 == NULL || saidx1 == NULL) - return 0; + /* CMP_MODE_REQID, CMP_REQID, CMP_HEAD */ + if (flag == CMP_MODE_REQID + ||flag == CMP_REQID) { + /* + * If reqid of SPD is non-zero, unique SA is required. + * The result must be of same reqid in this case. + */ + if (saidx1->reqid != 0 && saidx0->reqid != saidx1->reqid) + return 0; + } - if (saidx0->proto != saidx1->proto) - return 0; + if (flag == CMP_MODE_REQID) { + if (saidx0->mode != IPSEC_MODE_ANY + && saidx0->mode != saidx1->mode) + return 0; + } - if (key_sockaddrcmp((struct sockaddr *)&saidx0->src, - (struct sockaddr *)&saidx1->src, 0) != 0) { - return 0; - } - if (key_sockaddrcmp((struct sockaddr *)&saidx0->dst, - (struct sockaddr *)&saidx1->dst, 0) != 0) { - return 0; + if (key_sockaddrcmp((struct sockaddr *)&saidx0->src, + (struct sockaddr *)&saidx1->src, 0) != 0) { + return 0; + } + if (key_sockaddrcmp((struct sockaddr *)&saidx0->dst, + (struct sockaddr *)&saidx1->dst, 0) != 0) { + return 0; + } } return 1; @@ -4322,7 +4271,23 @@ key_freesav(sav); } } - + + /* + * If this is a NAT traversal SA with no activity, + * we need to send a keep alive. + * + * Performed outside of the loop before so we will + * only ever send one keepalive. The first SA on + * the list is the one that will be used for sending + * traffic, so this is the one we use for determining + * when to send the keepalive. 
+ */ + sav = LIST_FIRST(&sah->savtree[SADB_SASTATE_MATURE]); + if (natt_keepalive_interval && sav && (sav->flags & SADB_X_EXT_NATT_KEEPALIVE) != 0 && + (natt_now - sav->natt_last_activity) >= natt_keepalive_interval) { + ipsec_send_natt_keepalive(sav); + } + /* * check MATURE entry to start to send expire message * whether or not. @@ -4339,10 +4304,8 @@ /* sanity check */ if (sav->lft_c == NULL) { -#if IPSEC_DEBUG - printf("key_timehandler: " - "There is no CURRENT time, why?\n"); -#endif + ipseclog((LOG_DEBUG,"key_timehandler: " + "There is no CURRENT time, why?\n")); continue; } @@ -4350,8 +4313,9 @@ if (sav->lft_s->sadb_lifetime_addtime != 0 && tv.tv_sec - sav->created > sav->lft_s->sadb_lifetime_addtime) { /* - * check SA to be used whether or not. - * when SA hasn't been used, delete it. + * check the SA if it has been used. + * when it hasn't been used, delete it. + * i don't think such SA will be used. */ if (sav->lft_c->sadb_lifetime_usetime == 0) { key_sa_chgstate(sav, SADB_SASTATE_DEAD); @@ -4367,6 +4331,7 @@ key_expire(sav); } } + /* check SOFT lifetime by bytes */ /* * XXX I don't know the way to delete this SA @@ -4399,10 +4364,8 @@ /* sanity check */ if (sav->lft_c == NULL) { -#if IPSEC_DEBUG - printf("key_timehandler: " - "There is no CURRENT time, why?\n"); -#endif + ipseclog((LOG_DEBUG, "key_timehandler: " + "There is no CURRENT time, why?\n")); continue; } @@ -4446,13 +4409,11 @@ /* sanity check */ if (sav->state != SADB_SASTATE_DEAD) { -#if IPSEC_DEBUG - printf("key_timehandler: " + ipseclog((LOG_DEBUG, "key_timehandler: " "invalid sav->state " "(queue: %d SA: %d): " "kill it anyway\n", - SADB_SASTATE_DEAD, sav->state); -#endif + SADB_SASTATE_DEAD, sav->state)); } /* @@ -4508,6 +4469,8 @@ key_tick_init_random = 0; key_srandom(); } + + natt_now++; #ifndef IPSEC_DEBUG2 /* do exchange to tick time !! 
*/ @@ -4661,16 +4624,12 @@ if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { -#if IPSEC_DEBUG - printf("key_getspi: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_getspi: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { -#if IPSEC_DEBUG - printf("key_getspi: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_getspi: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { @@ -4686,9 +4645,7 @@ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { -#if IPSEC_DEBUG - printf("key_getspi: invalid satype is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_getspi: invalid satype is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -4739,9 +4696,7 @@ if ((newsah = key_getsah(&saidx)) == NULL) { /* create a new SA index */ if ((newsah = key_newsah(&saidx)) == NULL) { -#if IPSEC_DEBUG - printf("key_getspi: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_getspi: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } } @@ -4878,9 +4833,7 @@ if (min == max) { if (key_checkspidup(saidx, min) != NULL) { -#if IPSEC_DEBUG - printf("key_do_getnewspi: SPI %u exists already.\n", min); -#endif + ipseclog((LOG_DEBUG, "key_do_getnewspi: SPI %u exists already.\n", min)); return 0; } @@ -4902,9 +4855,7 @@ } if (count == 0 || newspi == 0) { -#if IPSEC_DEBUG - printf("key_do_getnewspi: to allocate spi is failed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_do_getnewspi: to allocate spi is failed.\n")); return 0; } } @@ -4951,9 +4902,7 @@ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { -#if IPSEC_DEBUG - printf("key_update: invalid satype is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_update: invalid 
satype is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -4968,17 +4917,13 @@ mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) { -#if IPSEC_DEBUG - printf("key_update: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_update: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { -#if IPSEC_DEBUG - printf("key_update: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_update: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { @@ -4999,9 +4944,7 @@ /* get a SA header */ if ((sah = key_getsah(&saidx)) == NULL) { -#if IPSEC_DEBUG - printf("key_update: no SA index found.\n"); -#endif + ipseclog((LOG_DEBUG, "key_update: no SA index found.\n")); return key_senderror(so, m, ENOENT); } @@ -5015,45 +4958,40 @@ #if IPSEC_DOSEQCHECK if (mhp->msg->sadb_msg_seq != 0 && (sav = key_getsavbyseq(sah, mhp->msg->sadb_msg_seq)) == NULL) { -#if IPSEC_DEBUG - printf("key_update: no larval SA with sequence %u exists.\n", - mhp->msg->sadb_msg_seq); -#endif + ipseclog((LOG_DEBUG, + "key_update: no larval SA with sequence %u exists.\n", + mhp->msg->sadb_msg_seq)); return key_senderror(so, m, ENOENT); } #else if ((sav = key_getsavbyspi(sah, sa0->sadb_sa_spi)) == NULL) { -#if IPSEC_DEBUG - printf("key_update: no such a SA found (spi:%u)\n", - (u_int32_t)ntohl(sa0->sadb_sa_spi)); -#endif + ipseclog((LOG_DEBUG, + "key_update: no such a SA found (spi:%u)\n", + (u_int32_t)ntohl(sa0->sadb_sa_spi))); return key_senderror(so, m, EINVAL); } #endif /* validity check */ if (sav->sah->saidx.proto != proto) { -#if IPSEC_DEBUG - printf("key_update: protocol mismatched (DB=%u param=%u)\n", - sav->sah->saidx.proto, 
proto); -#endif + ipseclog((LOG_DEBUG, + "key_update: protocol mismatched (DB=%u param=%u)\n", + sav->sah->saidx.proto, proto)); return key_senderror(so, m, EINVAL); } #if IPSEC_DOSEQCHECK if (sav->spi != sa0->sadb_sa_spi) { -#if IPSEC_DEBUG - printf("key_update: SPI mismatched (DB:%u param:%u)\n", - (u_int32_t)ntohl(sav->spi), - (u_int32_t)ntohl(sa0->sadb_sa_spi)); -#endif + ipseclog((LOG_DEBUG, + "key_update: SPI mismatched (DB:%u param:%u)\n", + (u_int32_t)ntohl(sav->spi), + (u_int32_t)ntohl(sa0->sadb_sa_spi))); return key_senderror(so, m, EINVAL); } #endif if (sav->pid != mhp->msg->sadb_msg_pid) { -#if IPSEC_DEBUG - printf("key_update: pid mismatched (DB:%u param:%u)\n", - sav->pid, mhp->msg->sadb_msg_pid); -#endif + ipseclog((LOG_DEBUG, + "key_update: pid mismatched (DB:%u param:%u)\n", + sav->pid, mhp->msg->sadb_msg_pid)); return key_senderror(so, m, EINVAL); } @@ -5076,9 +5014,7 @@ /* set msg buf from mhp */ n = key_getmsgbuf_x1(m, mhp); if (n == NULL) { -#if IPSEC_DEBUG - printf("key_update: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_update: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } @@ -5161,9 +5097,7 @@ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { -#if IPSEC_DEBUG - printf("key_add: invalid satype is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_add: invalid satype is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -5178,18 +5112,14 @@ mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) { -#if IPSEC_DEBUG - printf("key_add: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_add: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { /* XXX need more */ -#if 
IPSEC_DEBUG - printf("key_add: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_add: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { @@ -5211,9 +5141,7 @@ if ((newsah = key_getsah(&saidx)) == NULL) { /* create a new SA header */ if ((newsah = key_newsah(&saidx)) == NULL) { -#if IPSEC_DEBUG - printf("key_add: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_add: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } } @@ -5228,9 +5156,7 @@ /* create new SA entry. */ /* We can create new SA only if SPI is differenct. */ if (key_getsavbyspi(newsah, sa0->sadb_sa_spi)) { -#if IPSEC_DEBUG - printf("key_add: SA already exists.\n"); -#endif + ipseclog((LOG_DEBUG, "key_add: SA already exists.\n")); return key_senderror(so, m, EEXIST); } newsav = key_newsav(m, mhp, newsah, &error); @@ -5255,9 +5181,7 @@ /* set msg buf from mhp */ n = key_getmsgbuf_x1(m, mhp); if (n == NULL) { -#if IPSEC_DEBUG - printf("key_update: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_update: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } @@ -5290,9 +5214,7 @@ if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL || mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) { -#if IPSEC_DEBUG - printf("key_setident: invalid identity.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setident: invalid identity.\n")); return EINVAL; } @@ -5303,9 +5225,7 @@ /* validity check */ if (idsrc->sadb_ident_type != iddst->sadb_ident_type) { -#if IPSEC_DEBUG - printf("key_setident: ident type mismatch.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setident: ident type mismatch.\n")); return EINVAL; } @@ -5323,18 +5243,14 @@ /* make structure */ KMALLOC(sah->idents, struct sadb_ident *, idsrclen); if (sah->idents == NULL) { -#if IPSEC_DEBUG - printf("key_setident: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setident: No more memory.\n")); return ENOBUFS; } KMALLOC(sah->identd, struct sadb_ident *, iddstlen); if 
(sah->identd == NULL) { KFREE(sah->idents); sah->idents = NULL; -#if IPSEC_DEBUG - printf("key_setident: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_setident: No more memory.\n")); return ENOBUFS; } bcopy(idsrc, sah->idents, idsrclen); @@ -5413,25 +5329,19 @@ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { -#if IPSEC_DEBUG - printf("key_delete: invalid satype is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_delete: invalid satype is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { -#if IPSEC_DEBUG - printf("key_delete: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_delete: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { -#if IPSEC_DEBUG - printf("key_delete: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_delete: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -5441,14 +5351,10 @@ * that match the src/dst. This is used during * IKE INITIAL-CONTACT. */ -#if IPSEC_DEBUG - printf("key_delete: doing delete all.\n"); -#endif + ipseclog((LOG_DEBUG, "key_delete: doing delete all.\n")); return key_delete_all(so, m, mhp, proto); } else if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa)) { -#if IPSEC_DEBUG - printf("key_delete: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_delete: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -5463,7 +5369,7 @@ LIST_FOREACH(sah, &sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; - if (key_cmpsaidx_withoutmode(&sah->saidx, &saidx) == 0) + if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* get a SA with SPI. 
*/ @@ -5472,9 +5378,7 @@ break; } if (sah == NULL) { -#if IPSEC_DEBUG - printf("key_delete: no SA found.\n"); -#endif + ipseclog((LOG_DEBUG, "key_delete: no SA found.\n")); return key_senderror(so, m, ENOENT); } @@ -5532,7 +5436,7 @@ LIST_FOREACH(sah, &sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; - if (key_cmpsaidx_withoutmode(&sah->saidx, &saidx) == 0) + if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* Delete all non-LARVAL SAs. */ @@ -5547,12 +5451,10 @@ nextsav = LIST_NEXT(sav, chain); /* sanity check */ if (sav->state != state) { -#if IPSEC_DEBUG - printf("key_delete_all: " + ipseclog((LOG_DEBUG, "key_delete_all: " "invalid sav->state " "(queue: %d SA: %d)\n", - state, sav->state); -#endif + state, sav->state)); continue; } @@ -5617,26 +5519,20 @@ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { -#if IPSEC_DEBUG - printf("key_get: invalid satype is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_get: invalid satype is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { -#if IPSEC_DEBUG - printf("key_get: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_get: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { -#if IPSEC_DEBUG - printf("key_get: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_get: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -5651,7 +5547,7 @@ LIST_FOREACH(sah, &sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; - if (key_cmpsaidx_withoutmode(&sah->saidx, &saidx) == 0) + if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* get a SA with SPI. 
*/ @@ -5660,9 +5556,7 @@ break; } if (sah == NULL) { -#if IPSEC_DEBUG - printf("key_get: no SA found.\n"); -#endif + ipseclog((LOG_DEBUG, "key_get: no SA found.\n")); return key_senderror(so, m, ENOENT); } @@ -5672,9 +5566,7 @@ /* map proto to satype */ if ((satype = key_proto2satype(sah->saidx.proto)) == 0) { -#if IPSEC_DEBUG - printf("key_get: there was invalid proto in SAD.\n"); -#endif + ipseclog((LOG_DEBUG, "key_get: there was invalid proto in SAD.\n")); return key_senderror(so, m, EINVAL); } @@ -6022,8 +5914,7 @@ /* set sadb_address for saidx's. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, - (struct sockaddr *)&saidx->src, saidx->src.ss_len << 3, - IPSEC_ULPROTO_ANY); + (struct sockaddr *)&saidx->src, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; @@ -6031,8 +5922,7 @@ m_cat(result, m); m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, - (struct sockaddr *)&saidx->dst, saidx->dst.ss_len << 3, - IPSEC_ULPROTO_ANY); + (struct sockaddr *)&saidx->dst, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; @@ -6154,9 +6044,7 @@ /* get new entry */ KMALLOC(newacq, struct secacq *, sizeof(struct secacq)); if (newacq == NULL) { -#if IPSEC_DEBUG - printf("key_newacq: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_newacq: No more memory.\n")); return NULL; } bzero(newacq, sizeof(*newacq)); @@ -6178,7 +6066,7 @@ struct secacq *acq; LIST_FOREACH(acq, &acqtree, chain) { - if (key_cmpsaidx_exactly(saidx, &acq->saidx)) + if (key_cmpsaidx(saidx, &acq->saidx, CMP_EXACTLY)) return acq; } @@ -6210,9 +6098,7 @@ /* get new entry */ KMALLOC(acq, struct secspacq *, sizeof(struct secspacq)); if (acq == NULL) { -#if IPSEC_DEBUG - printf("key_newspacq: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_newspacq: No more memory.\n")); return NULL; } bzero(acq, sizeof(*acq)); @@ -6283,9 +6169,7 @@ /* check sequence number */ if (mhp->msg->sadb_msg_seq == 0) { -#if IPSEC_DEBUG - printf("key_acquire2: must specify sequence number.\n"); -#endif + 
ipseclog((LOG_DEBUG, "key_acquire2: must specify sequence number.\n")); m_freem(m); return 0; } @@ -6314,9 +6198,7 @@ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { -#if IPSEC_DEBUG - printf("key_acquire2: invalid satype is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_acquire2: invalid satype is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -6324,18 +6206,14 @@ mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_EXT_PROPOSAL] == NULL) { /* error */ -#if IPSEC_DEBUG - printf("key_acquire2: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_acquire2: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_PROPOSAL] < sizeof(struct sadb_prop)) { /* error */ -#if IPSEC_DEBUG - printf("key_acquire2: invalid message is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_acquire2: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -6349,22 +6227,18 @@ LIST_FOREACH(sah, &sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; - if (key_cmpsaidx_withmode(&sah->saidx, &saidx)) + if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID)) break; } if (sah != NULL) { -#if IPSEC_DEBUG - printf("key_acquire2: a SA exists already.\n"); -#endif + ipseclog((LOG_DEBUG, "key_acquire2: a SA exists already.\n")); return key_senderror(so, m, EEXIST); } error = key_acquire(&saidx, NULL); if (error != 0) { -#if IPSEC_DEBUG - printf("key_acquire2: error %d returned " - "from key_acquire.\n", mhp->msg->sadb_msg_errno); -#endif + ipseclog((LOG_DEBUG, "key_acquire2: error %d returned " + "from key_acquire.\n", mhp->msg->sadb_msg_errno)); return key_senderror(so, m, error); } @@ -6407,9 +6281,7 @@ /* check whether existing or not */ LIST_FOREACH(reg, ®tree[mhp->msg->sadb_msg_satype], chain) { if (reg->so == so) { -#if 
IPSEC_DEBUG - printf("key_register: socket exists already.\n"); -#endif + ipseclog((LOG_DEBUG, "key_register: socket exists already.\n")); return key_senderror(so, m, EEXIST); } } @@ -6417,9 +6289,7 @@ /* create regnode */ KMALLOC(newreg, struct secreg *, sizeof(*newreg)); if (newreg == NULL) { -#if IPSEC_DEBUG - printf("key_register: No more memory.\n"); -#endif + ipseclog((LOG_DEBUG, "key_register: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } bzero((caddr_t)newreg, sizeof(*newreg)); @@ -6631,7 +6501,9 @@ m_cat(result, m); /* create SA extension */ - m = key_setsadbxsa2(sav->sah->saidx.mode, sav->sah->saidx.reqid); + m = key_setsadbxsa2(sav->sah->saidx.mode, + sav->replay ? sav->replay->count : 0, + sav->sah->saidx.reqid); if (!m) { error = ENOBUFS; goto fail; @@ -6662,7 +6534,7 @@ /* set sadb_address for source */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, (struct sockaddr *)&sav->sah->saidx.src, - sav->sah->saidx.src.ss_len << 3, IPSEC_ULPROTO_ANY); + FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; @@ -6672,7 +6544,7 @@ /* set sadb_address for destination */ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, (struct sockaddr *)&sav->sah->saidx.dst, - sav->sah->saidx.dst.ss_len << 3, IPSEC_ULPROTO_ANY); + FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; @@ -6699,6 +6571,7 @@ mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); + splx(s); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: @@ -6739,9 +6612,7 @@ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { -#if IPSEC_DEBUG - printf("key_flush: invalid satype is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_flush: invalid satype is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -6775,9 +6646,7 @@ if (m->m_len < sizeof(struct sadb_msg) || sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) { -#if IPSEC_DEBUG - printf("key_flush: No more memory.\n"); -#endif + 
ipseclog((LOG_DEBUG, "key_flush: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } @@ -6826,9 +6695,7 @@ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { -#if IPSEC_DEBUG - printf("key_dump: invalid satype is passed.\n"); -#endif + ipseclog((LOG_DEBUG, "key_dump: invalid satype is passed.\n")); return key_senderror(so, m, EINVAL); } @@ -6861,9 +6728,7 @@ /* map proto to satype */ if ((satype = key_proto2satype(sah->saidx.proto)) == 0) { -#if IPSEC_DEBUG - printf("key_dump: there was invalid proto in SAD.\n"); -#endif + ipseclog((LOG_DEBUG, "key_dump: there was invalid proto in SAD.\n")); return key_senderror(so, m, EINVAL); } @@ -6996,7 +6861,7 @@ #if 0 /*kdebug_sadb assumes msg in linear buffer*/ KEYDEBUG(KEYDEBUG_KEY_DUMP, - printf("key_parse: passed sadb_msg\n"); + ipseclog((LOG_DEBUG, "key_parse: passed sadb_msg\n")); kdebug_sadb(msg)); #endif @@ -7011,29 +6876,24 @@ if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len != m->m_pkthdr.len) { -#if IPSEC_DEBUG - printf("key_parse: invalid message length.\n"); -#endif + ipseclog((LOG_DEBUG, "key_parse: invalid message length.\n")); pfkeystat.out_invlen++; error = EINVAL; goto senderror; } if (msg->sadb_msg_version != PF_KEY_V2) { -#if IPSEC_DEBUG - printf("key_parse: PF_KEY version %u is mismatched.\n", - msg->sadb_msg_version); -#endif + ipseclog((LOG_DEBUG, + "key_parse: PF_KEY version %u is mismatched.\n", + msg->sadb_msg_version)); pfkeystat.out_invver++; error = EINVAL; goto senderror; } if (msg->sadb_msg_type > SADB_MAX) { -#if IPSEC_DEBUG - printf("key_parse: invalid type %u is passed.\n", - msg->sadb_msg_type); -#endif + ipseclog((LOG_DEBUG, "key_parse: invalid type %u is passed.\n", + msg->sadb_msg_type)); pfkeystat.out_invmsgtype++; error = EINVAL; goto senderror; @@ -7089,11 +6949,8 @@ case SADB_GET: case SADB_ACQUIRE: case SADB_EXPIRE: -#if IPSEC_DEBUG - printf("key_parse: must specify satype " - "when msg type=%u.\n", - msg->sadb_msg_type); -#endif 
+ ipseclog((LOG_DEBUG, "key_parse: must specify satype " + "when msg type=%u.\n", msg->sadb_msg_type)); pfkeystat.out_invsatype++; error = EINVAL; goto senderror; @@ -7111,10 +6968,8 @@ case SADB_X_SPDSETIDX: case SADB_X_SPDUPDATE: case SADB_X_SPDDELETE2: -#if IPSEC_DEBUG - printf("key_parse: illegal satype=%u\n", - msg->sadb_msg_type); -#endif + ipseclog((LOG_DEBUG, "key_parse: illegal satype=%u\n", + msg->sadb_msg_type)); pfkeystat.out_invsatype++; error = EINVAL; goto senderror; @@ -7124,10 +6979,8 @@ case SADB_SATYPE_OSPFV2: case SADB_SATYPE_RIPV2: case SADB_SATYPE_MIP: -#if IPSEC_DEBUG - printf("key_parse: type %u isn't supported.\n", - msg->sadb_msg_satype); -#endif + ipseclog((LOG_DEBUG, "key_parse: type %u isn't supported.\n", + msg->sadb_msg_satype)); pfkeystat.out_invsatype++; error = EOPNOTSUPP; goto senderror; @@ -7136,10 +6989,8 @@ break; /*FALLTHROUGH*/ default: -#if IPSEC_DEBUG - printf("key_parse: invalid type %u is passed.\n", - msg->sadb_msg_satype); -#endif + ipseclog((LOG_DEBUG, "key_parse: invalid type %u is passed.\n", + msg->sadb_msg_satype)); pfkeystat.out_invsatype++; error = EINVAL; goto senderror; @@ -7156,9 +7007,7 @@ /* check upper layer protocol */ if (src0->sadb_address_proto != dst0->sadb_address_proto) { -#if IPSEC_DEBUG - printf("key_parse: upper layer protocol mismatched.\n"); -#endif + ipseclog((LOG_DEBUG, "key_parse: upper layer protocol mismatched.\n")); pfkeystat.out_invaddr++; error = EINVAL; goto senderror; @@ -7167,18 +7016,15 @@ /* check family */ if (PFKEY_ADDR_SADDR(src0)->sa_family != PFKEY_ADDR_SADDR(dst0)->sa_family) { -#if IPSEC_DEBUG - printf("key_parse: address family mismatched.\n"); -#endif + ipseclog((LOG_DEBUG, "key_parse: address family mismatched.\n")); pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } if (PFKEY_ADDR_SADDR(src0)->sa_len != PFKEY_ADDR_SADDR(dst0)->sa_len) { -#if IPSEC_DEBUG - printf("key_parse: address struct size mismatched.\n"); -#endif + ipseclog((LOG_DEBUG, + "key_parse: address 
struct size mismatched.\n")); pfkeystat.out_invaddr++; error = EINVAL; goto senderror; @@ -7202,9 +7048,8 @@ } break; default: -#if IPSEC_DEBUG - printf("key_parse: unsupported address family.\n"); -#endif + ipseclog((LOG_DEBUG, + "key_parse: unsupported address family.\n")); pfkeystat.out_invaddr++; error = EAFNOSUPPORT; goto senderror; @@ -7225,9 +7070,8 @@ /* check max prefix length */ if (src0->sadb_address_prefixlen > plen || dst0->sadb_address_prefixlen > plen) { -#if IPSEC_DEBUG - printf("key_parse: illegal prefixlen.\n"); -#endif + ipseclog((LOG_DEBUG, + "key_parse: illegal prefixlen.\n")); pfkeystat.out_invaddr++; error = EINVAL; goto senderror; @@ -7333,21 +7177,18 @@ * KEY_AUTH or KEY_ENCRYPT ? */ if (mhp->ext[ext->sadb_ext_type] != NULL) { -#if IPSEC_DEBUG - printf("key_align: duplicate ext_type %u " - "is passed.\n", - ext->sadb_ext_type); -#endif + ipseclog((LOG_DEBUG, + "key_align: duplicate ext_type %u " + "is passed.\n", ext->sadb_ext_type)); m_freem(m); pfkeystat.out_dupext++; return EINVAL; } break; default: -#if IPSEC_DEBUG - printf("key_align: invalid ext_type %u is passed.\n", - ext->sadb_ext_type); -#endif + ipseclog((LOG_DEBUG, + "key_align: invalid ext_type %u is passed.\n", + ext->sadb_ext_type)); m_freem(m); pfkeystat.out_invexttype++; return EINVAL; diff -urN xnu-344.49/bsd/netkey/key_debug.c xnu-517/bsd/netkey/key_debug.c --- xnu-344.49/bsd/netkey/key_debug.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netkey/key_debug.c Sat Oct 25 00:25:55 2003 @@ -1,3 +1,6 @@ +/* $FreeBSD: src/sys/netkey/key_debug.c,v 1.10.2.5 2002/04/28 05:40:28 suz Exp $ */ +/* $KAME: key_debug.c,v 1.26 2001/06/27 10:46:50 sakane Exp $ */ + /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. 
@@ -379,9 +382,9 @@ printf("sadb_x_sa2{ mode=%u reqid=%u\n", sa2->sadb_x_sa2_mode, sa2->sadb_x_sa2_reqid); - printf(" reserved1=%u reserved2=%u reserved3=%u }\n", - sa2->sadb_x_sa2_reserved1, sa2->sadb_x_sa2_reserved1, - sa2->sadb_x_sa2_reserved1); + printf(" reserved1=%u reserved2=%u sequence=%u }\n", + sa2->sadb_x_sa2_reserved1, sa2->sadb_x_sa2_reserved2, + sa2->sadb_x_sa2_sequence); return; } @@ -671,7 +674,7 @@ kdebug_sockaddr(addr) struct sockaddr *addr; { - struct sockaddr_in *sin; + struct sockaddr_in *sin4; #ifdef INET6 struct sockaddr_in6 *sin6; #endif @@ -685,9 +688,9 @@ switch (addr->sa_family) { case AF_INET: - sin = (struct sockaddr_in *)addr; - printf(" port=%u\n", ntohs(sin->sin_port)); - ipsec_hexdump((caddr_t)&sin->sin_addr, sizeof(sin->sin_addr)); + sin4 = (struct sockaddr_in *)addr; + printf(" port=%u\n", ntohs(sin4->sin_port)); + ipsec_hexdump((caddr_t)&sin4->sin_addr, sizeof(sin4->sin_addr)); break; #ifdef INET6 case AF_INET6: diff -urN xnu-344.49/bsd/netkey/key_debug.h xnu-517/bsd/netkey/key_debug.h --- xnu-344.49/bsd/netkey/key_debug.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netkey/key_debug.h Sat Oct 25 00:25:55 2003 @@ -54,7 +54,8 @@ #define KEYDEBUG_IPSEC_DATA (KEYDEBUG_IPSEC | KEYDEBUG_DATA) #define KEYDEBUG_IPSEC_DUMP (KEYDEBUG_IPSEC | KEYDEBUG_DUMP) -#define KEYDEBUG(lev,arg) if ((key_debug_level & (lev)) == (lev)) { arg; } +#define KEYDEBUG(lev,arg) \ + do { if ((key_debug_level & (lev)) == (lev)) { arg; } } while (0) struct sadb_msg; struct sadb_ext; diff -urN xnu-344.49/bsd/netkey/key_var.h xnu-517/bsd/netkey/key_var.h --- xnu-344.49/bsd/netkey/key_var.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netkey/key_var.h Sat Oct 25 00:25:55 2003 @@ -46,7 +46,9 @@ #define KEYCTL_ESP_KEYMIN 9 #define KEYCTL_ESP_AUTH 10 #define KEYCTL_AH_KEYMIN 11 -#define KEYCTL_MAXID 12 +#define KEYCTL_PREFERED_OLDSA 12 +#define KEYCTL_NATT_KEEPALIVE_INTERVAL 13 +#define KEYCTL_MAXID 14 #define KEYCTL_NAMES { \ { 0, 0 }, \ @@ -58,9 +60,13 @@ { 
"larval_lifetime", CTLTYPE_INT }, \ { "blockacq_count", CTLTYPE_INT }, \ { "blockacq_lifetime", CTLTYPE_INT }, \ + { "esp_keymin", CTLTYPE_INT }, \ + { "esp_auth", CTLTYPE_INT }, \ + { "ah_keymin", CTLTYPE_INT }, \ + { "prefered_oldsa", CTLTYPE_INT }, \ + { "natt_keepalive_interval", CTLTYPE_INT }, \ } -//#if IPSEC_DEBUG #define KEYCTL_VARS { \ 0, \ &key_debug_level, \ @@ -73,22 +79,9 @@ &key_blockacq_lifetime, \ &ipsec_esp_keymin, \ &ipsec_ah_keymin, \ + &ipsec_prefered_oldsa, \ + &natt_keepalive_interval, \ } -//#else -//#define KEYCTL_VARS { \ -// 0, \ -// 0, \ -// &key_spi_trycnt, \ -// &key_spi_minval, \ -// &key_spi_maxval, \ -// &key_int_random, \ -// &key_larval_lifetime, \ -// &key_blockacq_count, \ -// &key_blockacq_lifetime, \ -// &ipsec_esp_keymin, \ -// &ipsec_ah_keymin, \ -//} -//#endif #ifdef KERNEL #define _ARRAYLEN(p) (sizeof(p)/sizeof(p[0])) diff -urN xnu-344.49/bsd/netkey/keydb.h xnu-517/bsd/netkey/keydb.h --- xnu-344.49/bsd/netkey/keydb.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/netkey/keydb.h Sat Oct 25 00:25:55 2003 @@ -97,6 +97,10 @@ pid_t pid; /* message's pid */ struct secashead *sah; /* back pointer to the secashead */ + + /* Nat Traversal related bits */ + u_int32_t natt_last_activity; + u_int16_t remote_ike_port; }; /* replay prevention */ diff -urN xnu-344.49/bsd/nfs/Makefile xnu-517/bsd/nfs/Makefile --- xnu-344.49/bsd/nfs/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/nfs/Makefile Tue Oct 21 21:24:55 2003 @@ -21,6 +21,7 @@ DATAFILES = \ krpc.h nfs.h nfsdiskless.h nfsm_subs.h nfsmount.h nfsnode.h \ + nlminfo.h nfs_lock.h \ nfsproto.h nfsrtt.h nfsrvcache.h nqnfs.h rpcv2.h xdr_subs.h diff -urN xnu-344.49/bsd/nfs/krpc_subr.c xnu-517/bsd/nfs/krpc_subr.c --- xnu-344.49/bsd/nfs/krpc_subr.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/krpc_subr.c Sat Oct 25 00:25:55 2003 @@ -240,6 +240,7 @@ tv.tv_sec = 1; tv.tv_usec = 0; bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_RCVTIMEO; 
sopt.sopt_val = &tv; @@ -357,6 +358,12 @@ else printf("RPC timeout for server " IP_FORMAT "\n", IP_LIST(&(sin->sin_addr.s_addr))); + + /* + * soreceive is now conditionally using this pointer + * if present, it updates per-proc stats + */ + auio.uio_procp = NULL; /* * Wait for up to timo seconds for a reply. diff -urN xnu-344.49/bsd/nfs/nfs.h xnu-517/bsd/nfs/nfs.h --- xnu-344.49/bsd/nfs/nfs.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs.h Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -89,18 +89,24 @@ #ifndef NFS_MAXATTRTIMO #define NFS_MAXATTRTIMO 60 #endif -#define NFS_WSIZE 8192 /* Def. write data size <= 8192 */ -#define NFS_RSIZE 8192 /* Def. read data size <= 8192 */ +#define NFS_WSIZE 16384 /* Def. write data size <= 16K */ +#define NFS_RSIZE 16384 /* Def. read data size <= 16K */ +#define NFS_DGRAM_WSIZE 8192 /* UDP Def. write data size <= 8K */ +#define NFS_DGRAM_RSIZE 8192 /* UDP Def. read data size <= 8K */ #define NFS_READDIRSIZE 8192 /* Def. readdir size */ -#define NFS_DEFRAHEAD 1 /* Def. read ahead # blocks */ -#define NFS_MAXRAHEAD 4 /* Max. read ahead # blocks */ +#define NFS_DEFRAHEAD 4 /* Def. read ahead # blocks */ +#define NFS_MAXRAHEAD 16 /* Max. read ahead # blocks */ #define NFS_MAXUIDHASH 64 /* Max. # of hashed uid entries/mp */ -#define NFS_MAXASYNCDAEMON 20 /* Max. number async_daemons runnable */ +#define NFS_MAXASYNCDAEMON 32 /* Max. number async_daemons runnable */ #define NFS_MAXGATHERDELAY 100 /* Max. 
write gather delay (msec) */ #ifndef NFS_GATHERDELAY #define NFS_GATHERDELAY 10 /* Default write gather delay (msec) */ #endif #define NFS_DIRBLKSIZ 4096 /* Must be a multiple of DIRBLKSIZ */ +#if defined(KERNEL) && !defined(DIRBLKSIZ) +#define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ + /* can't be larger than NFS_FABLKSIZE */ +#endif /* * Oddballs @@ -115,13 +121,13 @@ /* * XXX - * The B_INVAFTERWRITE flag should be set to whatever is required by the + * The NB_INVAFTERWRITE flag should be set to whatever is required by the * buffer cache code to say "Invalidate the block after it is written back". */ #ifdef __FreeBSD__ -#define B_INVAFTERWRITE B_NOCACHE +#define NB_INVAFTERWRITE NB_NOCACHE #else -#define B_INVAFTERWRITE B_INVAL +#define NB_INVAFTERWRITE NB_INVAL #endif /* @@ -133,15 +139,6 @@ #endif /* - * Set the attribute timeout based on how recently the file has been modified. - */ -#define NFS_ATTRTIMEO(np) \ - ((((np)->n_flag & NMODIFIED) || \ - (time.tv_sec - (np)->n_mtime) / 10 < NFS_MINATTRTIMO) ? NFS_MINATTRTIMO : \ - ((time.tv_sec - (np)->n_mtime) / 10 > NFS_MAXATTRTIMO ? NFS_MAXATTRTIMO : \ - (time.tv_sec - (np)->n_mtime) / 10)) - -/* * Expected allocation sizes for major data structures. If the actual size * of the structure exceeds these sizes, then malloc() will be allocating * almost twice the memory required. 
This is used in nfs_init() to warn @@ -202,21 +199,24 @@ #define NFSMNT_RESVPORT 0x00008000 /* Allocate a reserved port */ #define NFSMNT_RDIRPLUS 0x00010000 /* Use Readdirplus for V3 */ #define NFSMNT_READDIRSIZE 0x00020000 /* Set readdir size */ -#define NFSMNT_INTERNAL 0xfffc0000 /* Bits set internally */ -#define NFSMNT_HASWRITEVERF 0x00040000 /* Has write verifier for V3 */ -#define NFSMNT_GOTPATHCONF 0x00080000 /* Got the V3 pathconf info */ -#define NFSMNT_GOTFSINFO 0x00100000 /* Got the V3 fsinfo */ -#define NFSMNT_MNTD 0x00200000 /* Mnt server for mnt point */ -#define NFSMNT_DISMINPROG 0x00400000 /* Dismount in progress */ -#define NFSMNT_DISMNT 0x00800000 /* Dismounted */ -#define NFSMNT_SNDLOCK 0x01000000 /* Send socket lock */ -#define NFSMNT_WANTSND 0x02000000 /* Want above */ -#define NFSMNT_RCVLOCK 0x04000000 /* Rcv socket lock */ -#define NFSMNT_WANTRCV 0x08000000 /* Want above */ -#define NFSMNT_WAITAUTH 0x10000000 /* Wait for authentication */ -#define NFSMNT_HASAUTH 0x20000000 /* Has authenticator */ -#define NFSMNT_WANTAUTH 0x40000000 /* Wants an authenticator */ -#define NFSMNT_AUTHERR 0x80000000 /* Authentication error */ +#define NFSMNT_NOLOCKS 0x00040000 /* don't support file locking */ + +#define NFSSTA_TIMEO 0x00010000 /* experienced a timeout. */ +#define NFSSTA_FORCE 0x00020000 /* doing a forced unmount. 
*/ +#define NFSSTA_HASWRITEVERF 0x00040000 /* Has write verifier for V3 */ +#define NFSSTA_GOTPATHCONF 0x00080000 /* Got the V3 pathconf info */ +#define NFSSTA_GOTFSINFO 0x00100000 /* Got the V3 fsinfo */ +#define NFSSTA_MNTD 0x00200000 /* Mnt server for mnt point */ +#define NFSSTA_DISMINPROG 0x00400000 /* Dismount in progress */ +#define NFSSTA_DISMNT 0x00800000 /* Dismounted */ +#define NFSSTA_SNDLOCK 0x01000000 /* Send socket lock */ +#define NFSSTA_WANTSND 0x02000000 /* Want above */ +#define NFSSTA_RCVLOCK 0x04000000 /* Rcv socket lock */ +#define NFSSTA_WANTRCV 0x08000000 /* Want above */ +#define NFSSTA_WAITAUTH 0x10000000 /* Wait for authentication */ +#define NFSSTA_HASAUTH 0x20000000 /* Has authenticator */ +#define NFSSTA_WANTAUTH 0x40000000 /* Wants an authenticator */ +#define NFSSTA_AUTHERR 0x80000000 /* Authentication error */ /* * Structures for the nfssvc(2) syscall. Not that anyone but nfsd and mount_nfs @@ -311,6 +311,13 @@ #define NFSSVC_MNTD 0x100 /* + * Flags for nfsclnt() system call. 
+ */ +#define NFSCLNT_LOCKDANS 0x200 +#define NFSCLNT_LOCKDFD 0x400 +#define NFSCLNT_LOCKDWAIT 0x800 + +/* * fs.nfs sysctl(3) identifiers */ #define NFS_NFSSTATS 1 /* struct: struct nfsstats */ @@ -350,7 +357,8 @@ MALLOC_DECLARE(M_NFSBIGFH); #endif -struct uio; struct buf; struct vattr; struct nameidata; /* XXX */ +struct uio; struct vattr; struct nameidata; /* XXX */ +struct nfsbuf; #define NFSINT_SIGMASK (sigmask(SIGINT)|sigmask(SIGTERM)|sigmask(SIGKILL)| \ sigmask(SIGHUP)|sigmask(SIGQUIT)) @@ -361,7 +369,7 @@ */ #define NFSIGNORE_SOERROR(s, e) \ ((e) != EINTR && (e) != ERESTART && (e) != EWOULDBLOCK && \ - ((s) & PR_CONNREQUIRED) == 0) + (e) != EIO && ((s) & PR_CONNREQUIRED) == 0) /* * Nfs outstanding request list element @@ -382,6 +390,7 @@ u_int32_t r_procnum; /* NFS procedure number */ int r_rtt; /* RTT for rpc */ struct proc *r_procp; /* Proc that did I/O system call */ + long r_lastmsg; /* time of last tprintf */ }; /* @@ -390,14 +399,17 @@ extern TAILQ_HEAD(nfs_reqq, nfsreq) nfs_reqq; /* Flag values for r_flags */ -#define R_TIMING 0x01 /* timing request (in mntp) */ -#define R_SENT 0x02 /* request has been sent */ -#define R_SOFTTERM 0x04 /* soft mnt, too many retries */ -#define R_INTR 0x08 /* intr mnt, signal pending */ -#define R_SOCKERR 0x10 /* Fatal error on socket */ -#define R_TPRINTFMSG 0x20 /* Did a tprintf msg. */ -#define R_MUSTRESEND 0x40 /* Must resend request */ -#define R_GETONEREP 0x80 /* Probe for one reply only */ +#define R_TIMING 0x0001 /* timing request (in mntp) */ +#define R_SENT 0x0002 /* request has been sent */ +#define R_SOFTTERM 0x0004 /* soft mnt, too many retries */ +#define R_INTR 0x0008 /* intr mnt, signal pending */ +#define R_SOCKERR 0x0010 /* Fatal error on socket */ +#define R_TPRINTFMSG 0x0020 /* Did a tprintf msg. */ +#define R_MUSTRESEND 0x0040 /* Must resend request */ +#define R_GETONEREP 0x0080 /* Probe for one reply only */ +#define R_BUSY 0x0100 /* Locked. 
*/ +#define R_WAITING 0x0200 /* Someone waiting for lock. */ +#define R_RESENDERR 0x0400 /* resend failed. */ /* * A list of nfssvc_sock structures is maintained with all the sockets @@ -464,7 +476,8 @@ struct mbuf *ns_rec; struct mbuf *ns_recend; struct mbuf *ns_frag; - int ns_flag; + short ns_flag; /* modified under kernel funnel */ + short ns_nflag; /* modified under network funnel */ int ns_solock; int ns_cc; int ns_reclen; @@ -475,14 +488,14 @@ LIST_HEAD(nfsrvw_delayhash, nfsrv_descript) ns_wdelayhashtbl[NFS_WDELAYHASHSIZ]; }; -/* Bits for "ns_flag" */ -#define SLP_VALID 0x01 -#define SLP_DOREC 0x02 -#define SLP_NEEDQ 0x04 -#define SLP_DISCONN 0x08 -#define SLP_GETSTREAM 0x10 -#define SLP_LASTFRAG 0x20 -#define SLP_ALLFLAGS 0xff +/* Bits for "ns_*flag" */ +#define SLP_VALID 0x01 /* ns_flag */ +#define SLP_DOREC 0x02 /* ns_flag */ +#define SLPN_NEEDQ 0x04 /* ns_nflag */ +#define SLPN_DISCONN 0x08 /* ns_nflag */ +#define SLPN_GETSTREAM 0x10 /* ns_nflag */ +#define SLPN_LASTFRAG 0x20 /* ns_nflag */ +#define SLP_ALLFLAGS 0xff /* ns_flag && ns_nflag */ extern TAILQ_HEAD(nfssvc_sockhead, nfssvc_sock) nfssvc_sockhead; extern int nfssvc_sockhead_flag; @@ -620,8 +633,10 @@ int nfs_rephead __P((int, struct nfsrv_descript *, struct nfssvc_sock *, int, int, u_quad_t *, struct mbuf **, struct mbuf **, caddr_t *)); -int nfs_sndlock __P((int *, struct nfsreq *)); -void nfs_sndunlock __P((int *flagp)); +int nfs_sndlock __P((struct nfsreq *)); +void nfs_sndunlock __P((struct nfsreq *)); +int nfs_slplock __P((struct nfssvc_sock *, int)); +void nfs_slpunlock __P((struct nfssvc_sock *)); int nfs_disct __P((struct mbuf **, caddr_t *, int, int, caddr_t *)); int nfs_vinvalbuf __P((struct vnode *, int, struct ucred *, struct proc *, int)); @@ -629,8 +644,8 @@ int nfs_writerpc __P((struct vnode *, struct uio *, struct ucred *, int *, int *)); int nfs_readdirrpc __P((struct vnode *, struct uio *, struct ucred *)); -int nfs_asyncio __P((struct buf *, struct ucred *)); -int nfs_doio 
__P((struct buf *, struct ucred *, struct proc *)); +int nfs_asyncio __P((struct nfsbuf *, struct ucred *)); +int nfs_doio __P((struct nfsbuf *, struct ucred *, struct proc *)); int nfs_readlinkrpc __P((struct vnode *, struct uio *, struct ucred *)); int nfs_sigintr __P((struct nfsmount *, struct nfsreq *, struct proc *)); int nfs_readdirplusrpc __P((struct vnode *, struct uio *, struct ucred *)); @@ -678,11 +693,14 @@ int)); int nfsm_uiotombuf __P((struct uio *, struct mbuf **, int, caddr_t *)); void nfsrv_init __P((int)); +int nfs_commit __P((struct vnode *vp, u_quad_t offset, int cnt, + struct ucred *cred, struct proc *procp)); +int nfs_flushcommits(struct vnode *, struct proc *); void nfs_clearcommit __P((struct mount *)); int nfsrv_errmap __P((struct nfsrv_descript *, int)); void nfsrvw_sort __P((gid_t *, int)); void nfsrv_setcred __P((struct ucred *, struct ucred *)); -int nfs_writebp __P((struct buf *, int)); +int nfs_buf_write __P((struct nfsbuf *)); int nfsrv_object_create __P((struct vnode *)); void nfsrv_wakenfsd __P((struct nfssvc_sock *slp)); int nfsrv_writegather __P((struct nfsrv_descript **, struct nfssvc_sock *, @@ -842,8 +860,8 @@ #else /* NFSDIAG */ - #define NFSTRACE(cnst, fptr) - #define NFSTRACE4(cnst, fptr, a2, a3, a4) +# define NFSTRACE(cnst, fptr) +# define NFSTRACE4(cnst, fptr, a2, a3, a4) #endif /* NFSDIAG */ diff -urN xnu-344.49/bsd/nfs/nfs_bio.c xnu-517/bsd/nfs/nfs_bio.c --- xnu-344.49/bsd/nfs/nfs_bio.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_bio.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -66,8 +66,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -98,12 +99,863 @@ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \ (int)(B), (int)(C), (int)(D), (int)(E), 0) -static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, - struct proc *p, int operation)); - extern int nfs_numasync; +extern int nfs_ioddelwri; extern struct nfsstats nfsstats; -extern int nbdwrite; + +#define NFSBUFHASH(dvp, lbn) \ + (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash]) +LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl; +struct nfsbuffreehead nfsbuffree, nfsbufdelwri; +u_long nfsbufhash; +int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax; +int nfsbuffreecnt, nfsbufdelwricnt, nfsneedbuffer; +int nfs_nbdwrite; + +#define NFSBUFWRITE_THROTTLE 9 + +/* + * Initialize nfsbuf lists + */ +void +nfs_nbinit(void) +{ + nfsbufhashlock = 0; + nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash); + TAILQ_INIT(&nfsbuffree); + TAILQ_INIT(&nfsbufdelwri); + nfsbufcnt = nfsbuffreecnt = nfsbufdelwricnt = 0; + nfsbufmin = 128; // XXX tune me! + nfsbufmax = 8192; // XXX tune me! 
+ nfsneedbuffer = 0; + nfs_nbdwrite = 0; +} + +/* + * try to free up some excess, unused nfsbufs + */ +static void +nfs_buf_freeup(void) +{ + struct nfsbuf *fbp; + int cnt; + +#define NFS_BUF_FREEUP() \ + do { \ + /* only call nfs_buf_freeup() if it has work to do */ \ + if ((nfsbuffreecnt > nfsbufcnt/4) && \ + (nfsbufcnt-nfsbuffreecnt/8 > nfsbufmin)) \ + nfs_buf_freeup(); \ + } while (0) + + if (nfsbuffreecnt < nfsbufcnt/4) + return; + cnt = nfsbuffreecnt/8; + if (nfsbufcnt-cnt < nfsbufmin) + return; + + FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt); + while (cnt-- > 0) { + fbp = TAILQ_FIRST(&nfsbuffree); + if (!fbp) + break; + nfs_buf_remfree(fbp); + /* disassociate buffer from any vnode */ + if (fbp->nb_vp) { + struct vnode *oldvp; + if (fbp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(fbp, nb_vnbufs); + fbp->nb_vnbufs.le_next = NFSNOLIST; + } + oldvp = fbp->nb_vp; + fbp->nb_vp = NULL; + HOLDRELE(oldvp); + } + LIST_REMOVE(fbp, nb_hash); + /* nuke any creds */ + if (fbp->nb_rcred != NOCRED) + crfree(fbp->nb_rcred); + if (fbp->nb_wcred != NOCRED) + crfree(fbp->nb_wcred); + /* if buf was NB_META, dump buffer */ + if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) { + FREE(fbp->nb_data, M_TEMP); + } + FREE(fbp, M_TEMP); + nfsbufcnt--; + } + FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt); +} + +void +nfs_buf_remfree(struct nfsbuf *bp) +{ + if (bp->nb_free.tqe_next == NFSNOLIST) + panic("nfsbuf not on free list"); + if (ISSET(bp->nb_flags, NB_DELWRI)) { + nfsbufdelwricnt--; + TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free); + } else { + nfsbuffreecnt--; + TAILQ_REMOVE(&nfsbuffree, bp, nb_free); + } + bp->nb_free.tqe_next = NFSNOLIST; + NFSBUFCNTCHK(); +} + +/* + * check for existence of nfsbuf in cache + */ +struct nfsbuf * +nfs_buf_incore(struct vnode *vp, daddr_t blkno) +{ + /* Search hash chain */ + struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first; + for (; bp != NULL; bp = bp->nb_hash.le_next) + if (bp->nb_lblkno == blkno && bp->nb_vp == vp && + 
!ISSET(bp->nb_flags, NB_INVAL)) { + FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp); + return (bp); + } + return (NULL); +} + +/* + * Check if it's OK to drop a page. + * + * Called by vnode_pager() on pageout request of non-dirty page. + * We need to make sure that it's not part of a delayed write. + * If it is, we can't let the VM drop it because we may need it + * later when/if we need to write the data (again). + */ +int +nfs_buf_page_inval(struct vnode *vp, off_t offset) +{ + struct nfsbuf *bp; + bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset)); + if (!bp) + return (0); + FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend); + if (ISSET(bp->nb_flags, NB_BUSY)) + return (EBUSY); + /* + * If there's a dirty range in the buffer, check to + * see if this page intersects with the dirty range. + * If it does, we can't let the pager drop the page. + */ + if (bp->nb_dirtyend > 0) { + int start = offset - NBOFF(bp); + if (bp->nb_dirtyend <= start || + bp->nb_dirtyoff >= (start + PAGE_SIZE)) + return (0); + return (EBUSY); + } + return (0); +} + +int +nfs_buf_upl_setup(struct nfsbuf *bp) +{ + kern_return_t kret; + upl_t upl; + int s; + + if (ISSET(bp->nb_flags, NB_PAGELIST)) + return (0); + + kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize, + &upl, NULL, UPL_PRECIOUS); + if (kret == KERN_INVALID_ARGUMENT) { + /* vm object probably doesn't exist any more */ + bp->nb_pagelist = NULL; + return (EINVAL); + } + if (kret != KERN_SUCCESS) { + printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret); + bp->nb_pagelist = NULL; + return (EIO); + } + + FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp); + + s = splbio(); + bp->nb_pagelist = upl; + SET(bp->nb_flags, NB_PAGELIST); + splx(s); + return (0); +} + +void +nfs_buf_upl_check(struct nfsbuf *bp) +{ + upl_page_info_t *pl; + off_t filesize, fileoffset; + int i, npages; + + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return; + + npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE; + filesize = 
ubc_getsize(bp->nb_vp); + fileoffset = NBOFF(bp); + if (fileoffset < filesize) + SET(bp->nb_flags, NB_CACHE); + else + CLR(bp->nb_flags, NB_CACHE); + + pl = ubc_upl_pageinfo(bp->nb_pagelist); + bp->nb_valid = bp->nb_dirty = 0; + + for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) { + /* anything beyond the end of the file is not valid or dirty */ + if (fileoffset >= filesize) + break; + if (!upl_valid_page(pl, i)) { + CLR(bp->nb_flags, NB_CACHE); + continue; + } + NBPGVALID_SET(bp,i); + if (upl_dirty_page(pl, i)) { + NBPGDIRTY_SET(bp, i); + if (!ISSET(bp->nb_flags, NB_WASDIRTY)) + SET(bp->nb_flags, NB_WASDIRTY); + } + } + fileoffset = NBOFF(bp); + if (ISSET(bp->nb_flags, NB_CACHE)) { + bp->nb_validoff = 0; + bp->nb_validend = bp->nb_bufsize; + if (fileoffset + bp->nb_validend > filesize) + bp->nb_validend = filesize - fileoffset; + } else { + bp->nb_validoff = bp->nb_validend = -1; + } + FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty); + FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); +} + +static int +nfs_buf_map(struct nfsbuf *bp) +{ + kern_return_t kret; + + if (bp->nb_data) + return (0); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return (EINVAL); + + kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data)); + if (kret != KERN_SUCCESS) + panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret); + if (bp->nb_data == 0) + panic("ubc_upl_map mapped 0"); + FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data); + return (0); +} + +/* + * check range of pages in nfsbuf's UPL for validity + */ +static int +nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size) +{ + off_t fileoffset, filesize; + int pg, lastpg; + upl_page_info_t *pl; + + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + return (0); + pl = ubc_upl_pageinfo(bp->nb_pagelist); + + size += off & PAGE_MASK; + off &= ~PAGE_MASK; + fileoffset = NBOFF(bp); + filesize = VTONFS(bp->nb_vp)->n_size; + if ((fileoffset + off + size) > filesize) + size = filesize - 
(fileoffset + off); + + pg = off/PAGE_SIZE; + lastpg = (off + size - 1)/PAGE_SIZE; + while (pg <= lastpg) { + if (!upl_valid_page(pl, pg)) + return (0); + pg++; + } + return (1); +} + +/* + * normalize an nfsbuf's valid range + * + * the read/write code guarantees that we'll always have a valid + * region that is an integral number of pages. If either end + * of the valid range isn't page-aligned, it gets corrected + * here as we extend the valid range through all of the + * contiguous valid pages. + */ +static void +nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp) +{ + int pg, npg; + /* pull validoff back to start of contiguous valid page range */ + pg = bp->nb_validoff/PAGE_SIZE; + while (pg >= 0 && NBPGVALID(bp,pg)) + pg--; + bp->nb_validoff = (pg+1) * PAGE_SIZE; + /* push validend forward to end of contiguous valid page range */ + npg = bp->nb_bufsize/PAGE_SIZE; + pg = bp->nb_validend/PAGE_SIZE; + while (pg < npg && NBPGVALID(bp,pg)) + pg++; + bp->nb_validend = pg * PAGE_SIZE; + /* clip to EOF */ + if (NBOFF(bp) + bp->nb_validend > np->n_size) + bp->nb_validend = np->n_size % bp->nb_bufsize; +} + +/* + * try to push out some delayed/uncommitted writes + */ +static void +nfs_buf_delwri_push(void) +{ + struct nfsbuf *bp; + int i; + + if (TAILQ_EMPTY(&nfsbufdelwri)) + return; + + /* first try to tell the nfsiods to do it */ + if (nfs_asyncio(NULL, NULL) == 0) + return; + + /* otherwise, try to do some of the work ourselves */ + i = 0; + while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { + struct nfsnode *np = VTONFS(bp->nb_vp); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + /* put buffer at end of delwri list */ + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_flushcommits(np->n_vnode, (struct proc *)0); + } else { + SET(bp->nb_flags, (NB_BUSY | NB_ASYNC)); + nfs_buf_write(bp); + } + i++; + } +} + +/* + * Get an nfs cache block. 
+ * Allocate a new one if the block isn't currently in the cache + * and return the block marked busy. If the calling process is + * interrupted by a signal for an interruptible mount point, return + * NULL. + */ +struct nfsbuf * +nfs_buf_get( + struct vnode *vp, + daddr_t blkno, + int size, + struct proc *p, + int operation) +{ + struct nfsnode *np = VTONFS(vp); + struct nfsbuf *bp; + int i, biosize, bufsize, rv; + struct ucred *cred; + int slpflag = PCATCH; + + FSDBG_TOP(541, vp, blkno, size, operation); + + bufsize = size; + if (bufsize > MAXBSIZE) + panic("nfs_buf_get: buffer larger than MAXBSIZE requested"); + + biosize = vp->v_mount->mnt_stat.f_iosize; + + if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) + operation = BLK_META; + else if (bufsize < biosize) + /* reg files should always have biosize blocks */ + bufsize = biosize; + + /* if BLK_WRITE, check for too many delayed/uncommitted writes */ + if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) { + FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + + /* poke the delwri list */ + nfs_buf_delwri_push(); + + /* sleep to let other threads run... */ + tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1); + FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4)); + } + +loop: + /* + * Obtain a lock to prevent a race condition if the + * MALLOC() below happens to block. 
+ */ + if (nfsbufhashlock) { + while (nfsbufhashlock) { + nfsbufhashlock = -1; + tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) + return (NULL); + } + goto loop; + } + nfsbufhashlock = 1; + + /* check for existence of nfsbuf in cache */ + if (bp = nfs_buf_incore(vp, blkno)) { + /* if busy, set wanted and wait */ + if (ISSET(bp->nb_flags, NB_BUSY)) { + FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags); + SET(bp->nb_flags, NB_WANTED); + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 0 : 2*hz); + slpflag = 0; + FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { + FSDBG_BOT(541, vp, blkno, 0, EINTR); + return (NULL); + } + goto loop; + } + if (bp->nb_bufsize != bufsize) + panic("nfsbuf size mismatch"); + SET(bp->nb_flags, (NB_BUSY | NB_CACHE)); + nfs_buf_remfree(bp); + /* additional paranoia: */ + if (ISSET(bp->nb_flags, NB_PAGELIST)) + panic("pagelist buffer was not busy"); + goto buffer_setup; + } + + /* + * where to get a free buffer: + * - alloc new if we haven't reached min bufs + * - free list + * - alloc new if we haven't reached max allowed + * - start clearing out delwri list and try again + */ + + if ((nfsbufcnt > nfsbufmin) && !TAILQ_EMPTY(&nfsbuffree)) { + /* pull an nfsbuf off the free list */ + bp = TAILQ_FIRST(&nfsbuffree); + FSDBG(544, vp, blkno, bp, bp->nb_flags); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_DELWRI)) + panic("nfs_buf_get: delwri"); + SET(bp->nb_flags, NB_BUSY); + /* disassociate buffer from previous vnode */ + if (bp->nb_vp) { + struct vnode *oldvp; + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + } + oldvp = bp->nb_vp; + bp->nb_vp = NULL; + HOLDRELE(oldvp); + } + LIST_REMOVE(bp, nb_hash); + /* nuke any creds we're holding */ + 
cred = bp->nb_rcred; + if (cred != NOCRED) { + bp->nb_rcred = NOCRED; + crfree(cred); + } + cred = bp->nb_wcred; + if (cred != NOCRED) { + bp->nb_wcred = NOCRED; + crfree(cred); + } + /* if buf will no longer be NB_META, dump old buffer */ + if ((operation != BLK_META) && + ISSET(bp->nb_flags, NB_META) && bp->nb_data) { + FREE(bp->nb_data, M_TEMP); + bp->nb_data = NULL; + } + /* re-init buf fields */ + bp->nb_error = 0; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + } else if (nfsbufcnt < nfsbufmax) { + /* just alloc a new one */ + MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK); + nfsbufcnt++; + NFSBUFCNTCHK(); + /* init nfsbuf */ + bzero(bp, sizeof(*bp)); + bp->nb_free.tqe_next = NFSNOLIST; + bp->nb_validoff = bp->nb_validend = -1; + FSDBG(545, vp, blkno, bp, 0); + } else { + /* too many bufs... wait for buffers to free up */ + FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax); + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + + /* poke the delwri list */ + nfs_buf_delwri_push(); + + nfsneedbuffer = 1; + tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0); + FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax); + if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) { + FSDBG_BOT(541, vp, blkno, 0, EINTR); + return (NULL); + } + goto loop; + } + +setup_nfsbuf: + + /* setup nfsbuf */ + bp->nb_flags = NB_BUSY; + bp->nb_lblkno = blkno; + /* insert buf in hash */ + LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash); + /* associate buffer with new vnode */ + VHOLD(vp); + bp->nb_vp = vp; + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + +buffer_setup: + + switch (operation) { + case BLK_META: + SET(bp->nb_flags, NB_META); + if ((bp->nb_bufsize != bufsize) && bp->nb_data) { + FREE(bp->nb_data, M_TEMP); + bp->nb_data = NULL; + bp->nb_validoff = bp->nb_validend = -1; + bp->nb_dirtyoff = bp->nb_dirtyend = 
0; + bp->nb_valid = 0; + bp->nb_dirty = 0; + CLR(bp->nb_flags, NB_CACHE); + } + if (!bp->nb_data) + MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK); + if (!bp->nb_data) + panic("nfs_buf_get: null nb_data"); + bp->nb_bufsize = bufsize; + break; + + case BLK_READ: + case BLK_WRITE: + if (bufsize < PAGE_SIZE) + bufsize = PAGE_SIZE; + bp->nb_bufsize = bufsize; + bp->nb_validoff = bp->nb_validend = -1; + + if (UBCISVALID(vp)) { + /* setup upl */ + if (nfs_buf_upl_setup(bp)) { + /* unable to create upl */ + /* vm object must no longer exist */ + /* cleanup buffer and return NULL */ + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + bp->nb_vp = NULL; + HOLDRELE(vp); + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + /* unlock hash: this early return must not leave nfsbufhashlock held, or later nfs_buf_get() callers sleep on it with nobody left to wake them */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + FSDBG_BOT(541, vp, blkno, 0x2bc, EIO); + return (NULL); + } + nfs_buf_upl_check(bp); + } + break; + + default: + panic("nfs_buf_get: %d unknown operation", operation); + } + + /* unlock hash */ + if (nfsbufhashlock < 0) { + nfsbufhashlock = 0; + wakeup(&nfsbufhashlock); + } else + nfsbufhashlock = 0; + + FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags); + + return (bp); +} + +void +nfs_buf_release(struct nfsbuf *bp) +{ + struct vnode *vp = bp->nb_vp; + + FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); + FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend); + FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0); + + if (UBCINFOEXISTS(vp) && bp->nb_bufsize) { + int upl_flags; + upl_t upl; + int i, rv; + + if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) { + rv = nfs_buf_upl_setup(bp); + if (rv) + printf("nfs_buf_release: upl create failed %d\n", rv); + else + nfs_buf_upl_check(bp); + } + upl = bp->nb_pagelist; + if (!upl) + goto pagelist_cleanup_done; + if (bp->nb_data) { + if (ubc_upl_unmap(upl) != KERN_SUCCESS) + panic("ubc_upl_unmap failed"); + bp->nb_data = NULL; + } + if (bp->nb_flags & (NB_ERROR |
NB_INVAL)) { + if (bp->nb_flags & (NB_READ | NB_INVAL)) + upl_flags = UPL_ABORT_DUMP_PAGES; + else + upl_flags = 0; + ubc_upl_abort(upl, upl_flags); + goto pagelist_cleanup_done; + } + for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) { + if (!NBPGVALID(bp,i)) + ubc_upl_abort_range(upl, + i*PAGE_SIZE, PAGE_SIZE, + UPL_ABORT_DUMP_PAGES | + UPL_ABORT_FREE_ON_EMPTY); + else { + if (NBPGDIRTY(bp,i)) + upl_flags = UPL_COMMIT_SET_DIRTY; + else + upl_flags = UPL_COMMIT_CLEAR_DIRTY; + ubc_upl_commit_range(upl, + i*PAGE_SIZE, PAGE_SIZE, + upl_flags | + UPL_COMMIT_INACTIVATE | + UPL_COMMIT_FREE_ON_EMPTY); + } + } +pagelist_cleanup_done: + /* was this the last buffer in the file? */ + if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) { + /* if so, invalidate all pages of last buffer past EOF */ + int biosize = vp->v_mount->mnt_stat.f_iosize; + off_t off, size; + off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64; + size = trunc_page_64(NBOFF(bp) + biosize) - off; + if (size) + ubc_invalidate(vp, off, size); + } + CLR(bp->nb_flags, NB_PAGELIST); + bp->nb_pagelist = NULL; + } + + /* Wake up any processes waiting for any buffer to become free. */ + if (nfsneedbuffer) { + nfsneedbuffer = 0; + wakeup(&nfsneedbuffer); + } + /* Wake up any processes waiting for _this_ buffer to become free. */ + if (ISSET(bp->nb_flags, NB_WANTED)) { + CLR(bp->nb_flags, NB_WANTED); + wakeup(bp); + } + + /* If it's not cacheable, or an error, mark it invalid. 
*/ + if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR))) + SET(bp->nb_flags, NB_INVAL); + + if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) { + /* If it's invalid or empty, dissociate it from its vnode */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) { + LIST_REMOVE(bp, nb_vnbufs); + bp->nb_vnbufs.le_next = NFSNOLIST; + } + bp->nb_vp = NULL; + HOLDRELE(vp); + /* if this was a delayed write, wakeup anyone */ + /* waiting for delayed writes to complete */ + if (ISSET(bp->nb_flags, NB_DELWRI)) { + CLR(bp->nb_flags, NB_DELWRI); + nfs_nbdwrite--; + NFSBUFCNTCHK(); + wakeup((caddr_t)&nfs_nbdwrite); + } + /* put buffer at head of free list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + NFS_BUF_FREEUP(); + } else if (ISSET(bp->nb_flags, NB_DELWRI)) { + /* put buffer at end of delwri list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + } else { + /* put buffer at end of free list */ + if (bp->nb_free.tqe_next != NFSNOLIST) + panic("nfsbuf on freelist"); + TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free); + nfsbuffreecnt++; + NFS_BUF_FREEUP(); + } + + NFSBUFCNTCHK(); + + /* Unlock the buffer. */ + CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD)); + + FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); +} + +/* + * Wait for operations on the buffer to complete. + * When they do, extract and return the I/O's error value. + */ +int +nfs_buf_iowait(struct nfsbuf *bp) +{ + FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + while (!ISSET(bp->nb_flags, NB_DONE)) + tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0); + + FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + /* check for interruption of I/O, then errors. 
*/ + if (ISSET(bp->nb_flags, NB_EINTR)) { + CLR(bp->nb_flags, NB_EINTR); + return (EINTR); + } else if (ISSET(bp->nb_flags, NB_ERROR)) + return (bp->nb_error ? bp->nb_error : EIO); + return (0); +} + +/* + * Mark I/O complete on a buffer. + */ +void +nfs_buf_iodone(struct nfsbuf *bp) +{ + struct vnode *vp; + + FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + + if (ISSET(bp->nb_flags, NB_DONE)) + panic("nfs_buf_iodone already"); + SET(bp->nb_flags, NB_DONE); /* note that it's done */ + /* + * I/O was done, so don't believe + * the DIRTY state from VM anymore + */ + CLR(bp->nb_flags, NB_WASDIRTY); + + if (!ISSET(bp->nb_flags, NB_READ)) { + CLR(bp->nb_flags, NB_WRITEINPROG); + vpwakeup(bp->nb_vp); + } + + /* Wakeup the throttled write operations as needed */ + vp = bp->nb_vp; + if (vp && (vp->v_flag & VTHROTTLED) + && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) { + vp->v_flag &= ~VTHROTTLED; + wakeup((caddr_t)&vp->v_numoutput); + } + + if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */ + nfs_buf_release(bp); + else { /* or just wakeup the buffer */ + CLR(bp->nb_flags, NB_WANTED); + wakeup(bp); + } + + FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); +} + +void +nfs_buf_write_delayed(struct nfsbuf *bp) +{ + struct proc *p = current_proc(); + struct vnode *vp = bp->nb_vp; + + FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0); + FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty); + + /* + * If the block hasn't been seen before: + * (1) Mark it as having been seen, + * (2) Charge for the write. 
+ * (3) Make sure it's on its vnode's correct block list, + */ + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + if (p && p->p_stats) + p->p_stats->p_ru.ru_oublock++; /* XXX */ + nfs_nbdwrite++; + NFSBUFCNTCHK(); + /* move to dirty list */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs); + } + + /* + * If the vnode has "too many" write operations in progress + * wait for them to finish the IO + */ + while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) { + vp->v_flag |= VTHROTTLED; + tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0); + } + + /* + * If we have too many delayed write buffers, + * more than we can "safely" handle, just fall back to + * doing the async write + */ + if (nfs_nbdwrite < 0) + panic("nfs_buf_write_delayed: Negative nfs_nbdwrite"); + + if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) { + /* issue async write */ + SET(bp->nb_flags, NB_ASYNC); + nfs_buf_write(bp); + FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); + return; + } + + /* Otherwise, the "write" is done, so mark and release the buffer. */ + SET(bp->nb_flags, NB_DONE); + nfs_buf_release(bp); + FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0); + return; +} + /* * Vnode op for read using bio @@ -115,33 +967,41 @@ register struct uio *uio; int ioflag; struct ucred *cred; - int getpages; + int getpages; // XXX unused! { - register struct nfsnode *np = VTONFS(vp); - register int biosize, i; + struct nfsnode *np = VTONFS(vp); + int biosize, i; off_t diff; - struct buf *bp = 0, *rabp; + struct nfsbuf *bp = 0, *rabp; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, rabn; + daddr_t lbn, rabn, lastrabn = -1; int bufsize; - int nra, error = 0, n = 0, on = 0, not_readin; + int nra, error = 0, n = 0, on = 0; int operation = (getpages? 
BLK_PAGEIN : BLK_READ); + caddr_t dp; + struct dirent *direntp; + + FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag); #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("nfs_read mode"); #endif - if (uio->uio_resid == 0) + if (uio->uio_resid == 0) { + FSDBG_BOT(514, vp, 0xd1e0001, 0, 0); return (0); - if (uio->uio_offset < 0) + } + if (uio->uio_offset < 0) { + FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL); return (EINVAL); + } p = uio->uio_procp; - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + if ((nmp->nm_flag & NFSMNT_NFSV3) && + !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); - /*due to getblk/vm interractions, use vm page size or less values */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); + biosize = vp->v_mount->mnt_stat.f_iosize; /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is @@ -155,7 +1015,7 @@ * Then force a getattr rpc to ensure that you have up to date * attributes. * NB: This implies that cache data can be read when up to - * NFS_ATTRTIMEO seconds out of date. If you find that you need current + * NFS_MAXATTRTIMEO seconds out of date. If you find that you need current * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. 
*/ @@ -166,24 +1026,35 @@ panic("nfs: bioread, not dir"); nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0003, 0, error); return (error); + } } np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0004, 0, error); return (error); + } np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0005, 0, error); return (error); + } if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) + if (vp->v_type == VDIR) { nfs_invaldir(vp); + /* purge name cache entries */ + cache_purge(vp); + } error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0006, 0, error); return (error); + } np->n_mtime = vattr.va_mtime.tv_sec; } } @@ -198,70 +1069,126 @@ do { error = nqnfs_getlease(vp, ND_READ, cred, p); } while (error == NQNFS_EXPIRED); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0007, 0, error); return (error); + } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE) || ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { if (vp->v_type == VDIR) nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0008, 0, error); return (error); + } np->n_brev = np->n_lrev; } } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { nfs_invaldir(vp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0009, 0, error); return (error); + } } } - if (np->n_flag & NQNFSNONCACHE) { + if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) { + if ((vp->v_flag & VNOCACHE_DATA) && + (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); + if (error) { + FSDBG_BOT(514, vp, 0xd1e000a, 0, error); + return (error); + } + } switch 
(vp->v_type) { case VREG: - return (nfs_readrpc(vp, uio, cred)); + error = nfs_readrpc(vp, uio, cred); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + return (error); case VLNK: - return (nfs_readlinkrpc(vp, uio, cred)); + error = nfs_readlinkrpc(vp, uio, cred); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); + return (error); case VDIR: break; default: - printf(" NQNFSNONCACHE: type %x unexpected\n", - vp->v_type); + printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type); }; } switch (vp->v_type) { case VREG: - nfsstats.biocache_reads++; lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize - 1); - not_readin = 1; + + /* + * Copy directly from any cached pages without grabbing the bufs. + */ + if (uio->uio_segflg == UIO_USERSPACE) { + int io_resid = uio->uio_resid; + diff = np->n_size - uio->uio_offset; + if (diff < io_resid) + io_resid = diff; + if (io_resid > 0) { + error = cluster_copy_ubc_data(vp, uio, &io_resid, 0); + if (error) { + FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error); + return (error); + } + } + /* count any biocache reads that we just copied directly */ + if (lbn != uio->uio_offset / biosize) { + nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn; + FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error); + } + } + + lbn = uio->uio_offset / biosize; + on = uio->uio_offset % biosize; /* * Start the read ahead(s), as required. 
*/ if (nfs_numasync > 0 && nmp->nm_readahead > 0) { - for (nra = 0; nra < nmp->nm_readahead && - (off_t)(lbn + 1 + nra) * biosize < np->n_size; - nra++) { + for (nra = 0; nra < nmp->nm_readahead; nra++) { rabn = lbn + 1 + nra; - if (!incore(vp, rabn)) { - rabp = nfs_getcacheblk(vp, rabn, biosize, p, operation); - if (!rabp) - return (EINTR); - if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) { - SET(rabp->b_flags, (B_READ | B_ASYNC)); - if (nfs_asyncio(rabp, cred)) { - SET(rabp->b_flags, (B_INVAL|B_ERROR)); - rabp->b_error = EIO; - brelse(rabp); - } - } else - brelse(rabp); + if (rabn <= lastrabn) { + /* we've already (tried to) read this block */ + /* no need to try it again... */ + continue; } - } + lastrabn = rabn; + if ((off_t)rabn * biosize >= np->n_size) + break; + /* check if block exists and is valid. */ + rabp = nfs_buf_incore(vp, rabn); + if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) + continue; + rabp = nfs_buf_get(vp, rabn, biosize, p, operation); + if (!rabp) { + FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR); + return (EINTR); + } + if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) { + SET(rabp->nb_flags, (NB_READ|NB_ASYNC)); + if (nfs_asyncio(rabp, cred)) { + SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); + rabp->nb_error = EIO; + nfs_buf_release(rabp); + } + } else + nfs_buf_release(rabp); + } + } + + if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) { + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa); + return (0); } + nfsstats.biocache_reads++; + /* * If the block is in the cache and has the required data * in a valid region, just copy it out. 
@@ -270,84 +1197,162 @@ */ again: bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size && - (off_t)(lbn + 1) * biosize - np->n_size < biosize) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } - bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation); - if (!bp) - return (EINTR); - - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); - CLR(bp->b_flags, (B_DONE | B_ERROR | B_INVAL)); - not_readin = 0; - error = nfs_doio(bp, cred, p); - if (error) { - brelse(bp); - return (error); - } - } - if (bufsize > on) { - n = min((unsigned)(bufsize - on), uio->uio_resid); - } else { - n = 0; - } + n = min((unsigned)(bufsize - on), uio->uio_resid); diff = np->n_size - uio->uio_offset; if (diff < n) n = diff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - SET(bp->b_flags, (B_NOCACHE|B_INVAFTERWRITE)); - if (bp->b_dirtyend > 0) { - if (!ISSET(bp->b_flags, B_DELWRI)) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); + + bp = nfs_buf_get(vp, lbn, bufsize, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR); + return (EINTR); + } + + /* if any pages are valid... 
*/ + if (bp->nb_valid) { + /* ...check for any invalid pages in the read range */ + int pg, firstpg, lastpg, dirtypg; + dirtypg = firstpg = lastpg = -1; + pg = on/PAGE_SIZE; + while (pg <= (on + n - 1)/PAGE_SIZE) { + if (!NBPGVALID(bp,pg)) { + if (firstpg < 0) + firstpg = pg; + lastpg = pg; + } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg)) + dirtypg = pg; + pg++; + } + + /* if there are no invalid pages, we're all set */ + if (firstpg < 0) { + if (bp->nb_validoff < 0) { + /* valid range isn't set up, so */ + /* set it to what we know is valid */ + bp->nb_validoff = trunc_page_32(on); + bp->nb_validend = round_page_32(on+n); + nfs_buf_normalize_valid_range(np, bp); + } + goto buffer_ready; + } + + /* there are invalid pages in the read range */ + if ((dirtypg > firstpg) && (dirtypg < lastpg)) { + /* there are also dirty page(s) in the range, */ + /* so write the buffer out and try again */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, NB_ASYNC); + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(514, vp, 0xd1e000d, 0, error); + return (error); + } goto again; } + if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && + (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) { + /* we need to read in more than half the buffer and the */ + /* buffer's not dirty, so just fetch the whole buffer */ + bp->nb_valid = 0; + } else { + /* read the page range in */ + struct iovec iov; + struct uio auio; + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64; + auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + NFS_BUF_MAP(bp); + iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE; + iov.iov_len = auio.uio_resid; + error = nfs_readrpc(vp, &auio, cred); + if (error) { + nfs_buf_release(bp); + 
FSDBG_BOT(514, vp, 0xd1e000e, 0, error); + return (error); + } + /* Make sure that the valid range is set to cover this read. */ + bp->nb_validoff = trunc_page_32(on); + bp->nb_validend = round_page_32(on+n); + nfs_buf_normalize_valid_range(np, bp); + if (auio.uio_resid > 0) { + /* if short read, must have hit EOF, */ + /* so zero the rest of the range */ + bzero(iov.iov_base, auio.uio_resid); + } + /* mark the pages (successfully read) as valid */ + for (pg=firstpg; pg <= lastpg; pg++) + NBPGVALID_SET(bp,pg); + } } + /* if no pages are valid, read the whole block */ + if (!bp->nb_valid) { + SET(bp->nb_flags, NB_READ); + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + error = nfs_doio(bp, cred, p); + if (error) { + nfs_buf_release(bp); + FSDBG_BOT(514, vp, 0xd1e000f, 0, error); + return (error); + } + } +buffer_ready: vp->v_lastr = lbn; - diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); - if (diff < n) - n = diff; + /* validate read range against valid range and clip */ + if (bp->nb_validend > 0) { + diff = (on >= bp->nb_validend) ? 
0 : (bp->nb_validend - on); + if (diff < n) + n = diff; + } + if (n > 0) + NFS_BUF_MAP(bp); break; case VLNK: nfsstats.biocache_readlinks++; - bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); - if (!bp) + bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR); return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); if (error) { - SET(bp->b_flags, B_ERROR); - brelse(bp); + SET(bp->nb_flags, NB_ERROR); + nfs_buf_release(bp); + FSDBG_BOT(514, vp, 0xd1e0011, 0, error); return (error); } } - n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); + n = min(uio->uio_resid, bp->nb_validend); on = 0; break; case VDIR: nfsstats.biocache_readdirs++; - if (np->n_direofoffset - && uio->uio_offset >= np->n_direofoffset) { - return (0); + if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { + FSDBG_BOT(514, vp, 0xde0f0001, 0, 0); + return (0); } lbn = uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); - bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p, operation); - if (!bp) - return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR); + return (EINTR); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); if (error) { - brelse(bp); + nfs_buf_release(bp); } while (error == NFSERR_BAD_COOKIE) { nfs_invaldir(vp); @@ -360,20 +1365,23 @@ */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset - && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) + && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) { + FSDBG_BOT(514, vp, 0xde0f0002, 0, 0); return (0); - bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p, - operation); - if (!bp) + } + bp = nfs_buf_get(vp, i, 
NFS_DIRBLKSIZ, p, operation); + if (!bp) { + FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR); return (EINTR); - if (!ISSET(bp->b_flags, B_CACHE)) { - SET(bp->b_flags, B_READ); + } + if (!ISSET(bp->nb_flags, NB_CACHE)) { + SET(bp->nb_flags, NB_READ); error = nfs_doio(bp, cred, p); /* - * no error + B_INVAL == directory EOF, + * no error + NB_INVAL == directory EOF, * use the block. */ - if (error == 0 && (bp->b_flags & B_INVAL)) + if (error == 0 && (bp->nb_flags & NB_INVAL)) break; } /* @@ -383,7 +1391,7 @@ * block and go for the next one via the for loop. */ if (error || i < lbn) - brelse(bp); + nfs_buf_release(bp); } } /* @@ -391,8 +1399,10 @@ * error. If we hit an error and it wasn't a cookie error, * we give up. */ - if (error) + if (error) { + FSDBG_BOT(514, vp, 0xd1e0014, 0, error); return (error); + } } /* @@ -404,19 +1414,19 @@ (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && !(np->n_flag & NQNFSNONCACHE) && - !incore(vp, lbn + 1)) { - rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p, + !nfs_buf_incore(vp, lbn + 1)) { + rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, operation); if (rabp) { - if (!ISSET(rabp->b_flags, (B_CACHE|B_DELWRI))) { - SET(rabp->b_flags, (B_READ | B_ASYNC)); + if (!ISSET(rabp->nb_flags, (NB_CACHE))) { + SET(rabp->nb_flags, (NB_READ | NB_ASYNC)); if (nfs_asyncio(rabp, cred)) { - SET(rabp->b_flags, (B_INVAL|B_ERROR)); - rabp->b_error = EIO; - brelse(rabp); + SET(rabp->nb_flags, (NB_INVAL|NB_ERROR)); + rabp->nb_error = EIO; + nfs_buf_release(rabp); } } else { - brelse(rabp); + nfs_buf_release(rabp); } } } @@ -424,30 +1434,41 @@ * Make sure we use a signed variant of min() since * the second term may be negative. */ - n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); + n = lmin(uio->uio_resid, bp->nb_validend - on); /* - * Unlike VREG files, whos buffer size ( bp->b_bcount ) is - * chopped for the EOF condition, we cannot tell how large - * NFS directories are going to be until we hit EOF. 
So - * an NFS directory buffer is *not* chopped to its EOF. Now, - * it just so happens that b_resid will effectively chop it - * to EOF. *BUT* this information is lost if the buffer goes - * away and is reconstituted into a B_CACHE state (recovered - * from VM) later. So we keep track of the directory eof - * in np->n_direofoffset and chop it off as an extra step - * right here. + * We keep track of the directory eof in + * np->n_direofoffset and chop it off as an + * extra step right here. */ if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) n = np->n_direofoffset - uio->uio_offset; + /* + * Make sure that we return an integral number of entries so + * that any subsequent calls will start copying from the start + * of the next entry. + * + * If the current value of n has the last entry cut short, + * set n to copy everything up to the last entry instead. + */ + if (n > 0) { + dp = bp->nb_data + on; + while (dp < (bp->nb_data + on + n)) { + direntp = (struct dirent *)dp; + dp += direntp->d_reclen; + } + if (dp > (bp->nb_data + on + n)) + n = (dp - direntp->d_reclen) - (bp->nb_data + on); + } break; default: - printf(" nfs_bioread: type %x unexpected\n",vp->v_type); - break; + printf("nfs_bioread: type %x unexpected\n",vp->v_type); + FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL); + return (EINVAL); }; if (n > 0) { - error = uiomove(bp->b_data + on, (int)n, uio); + error = uiomove(bp->nb_data + on, (int)n, uio); } switch (vp->v_type) { case VREG: @@ -457,13 +1478,12 @@ break; case VDIR: if (np->n_flag & NQNFSNONCACHE) - SET(bp->b_flags, B_INVAL); + SET(bp->nb_flags, NB_INVAL); break; - default: - printf(" nfs_bioread: type %x unexpected\n",vp->v_type); } - brelse(bp); + nfs_buf_release(bp); } while (error == 0 && uio->uio_resid > 0 && n > 0); + FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error); return (error); } @@ -480,23 +1500,24 @@ struct ucred *a_cred; } */ *ap; { - register int biosize; - register struct uio *uio = ap->a_uio; + struct uio 
*uio = ap->a_uio; struct proc *p = uio->uio_procp; - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - register struct ucred *cred = ap->a_cred; + struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; - struct buf *bp; + struct nfsbuf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; - int bufsize; + int biosize, bufsize, writeop; int n, on, error = 0, iomode, must_commit; - off_t boff; + off_t boff, start, end; struct iovec iov; struct uio auio; + FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag); + #if DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("nfs_write mode"); @@ -507,29 +1528,39 @@ return (EIO); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error); return (np->n_error); } - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + if ((nmp->nm_flag & NFSMNT_NFSV3) && + !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { np->n_attrstamp = 0; error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error); return (error); + } } if (ioflag & IO_APPEND) { np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, p); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error); return (error); + } uio->uio_offset = np->n_size; } } - if (uio->uio_offset < 0) + if (uio->uio_offset < 0) { + FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL); return (EINVAL); - if (uio->uio_resid == 0) + } + if (uio->uio_resid == 0) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); return (0); + } /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters @@ -537,15 +1568,11 @@ if (p && uio->uio_offset + uio->uio_resid > 
p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); + FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG); return (EFBIG); } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. - */ - /*due to getblk/vm interractions, use vm page size or less values */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); + + biosize = vp->v_mount->mnt_stat.f_iosize; do { /* @@ -556,210 +1583,376 @@ do { error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error); return (error); + } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error); return (error); + } np->n_brev = np->n_lrev; } } - if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { + if (ISSET(vp->v_flag, VNOCACHE_DATA) && + (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) { + error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); + if (error) { + FSDBG_BOT(515, vp, 0, 0, error); + return (error); + } + } + if (((np->n_flag & NQNFSNONCACHE) || + ISSET(vp->v_flag, VNOCACHE_DATA)) && + uio->uio_iovcnt == 1) { iomode = NFSV3WRITE_FILESYNC; error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); if (must_commit) nfs_clearcommit(vp->v_mount); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); + on = uio->uio_offset % biosize; n = min((unsigned)(biosize - on), uio->uio_resid); again: bufsize = biosize; -#if 0 -/* (removed for UBC) */ - if ((lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } -#endif /* * Get a cache block for 
writing. The range to be written is - * (off..off+len) within the block. We ensure that the block + * (off..off+n) within the block. We ensure that the block * either has no dirty region or that the given range is * contiguous with the existing dirty region. */ - bp = nfs_getcacheblk(vp, lbn, bufsize, p, BLK_WRITE); - if (!bp) + bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE); + if (!bp) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR); return (EINTR); + } + /* map the block because we know we're going to write to it */ + NFS_BUF_MAP(bp); + + if (ISSET(vp->v_flag, VNOCACHE_DATA)) + SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL)); + + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + + /* + * If there's already a dirty range AND dirty pages in this block we + * need to send a commit AND write the dirty pages before continuing. + * + * If there's already a dirty range OR dirty pages in this block + * and the new write range is not contiguous with the existing range, + * then force the buffer to be written out now. + * (We used to just extend the dirty range to cover the valid, + * but unwritten, data in between also. But writing ranges + * of data that weren't actually written by an application + * risks overwriting some other client's data with stale data + * that's just masquerading as new written data.) 
+ */ + if (bp->nb_dirtyend > 0) { + if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001); + /* write/commit buffer "synchronously" */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + return (error); + } + goto again; + } + } else if (bp->nb_dirty) { + int firstpg, lastpg; + u_int32_t pagemask; + /* calculate write range pagemask */ + firstpg = on/PAGE_SIZE; + lastpg = (on+n-1)/PAGE_SIZE; + pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); + /* check if there are dirty pages outside the write range */ + if (bp->nb_dirty & ~pagemask) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002); + /* write/commit buffer "synchronously" */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); + SET(bp->nb_flags, (NB_ASYNC | NB_STABLE)); + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); + return (error); + } + goto again; + } + /* if the first or last pages are already dirty */ + /* make sure that the dirty range encompasses those pages */ + if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) { + FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003); + bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE); + if (NBPGDIRTY(bp,lastpg)) { + bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE; + /* clip to EOF */ + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); + } else + bp->nb_dirtyend = on+n; + } + } + /* - * Resize nfsnode *after* we busy the buffer to prevent - * readers from reading garbage. + * Are we extending the size of the file with this write? + * If so, update file size now that we have the block. 
* If there was a partial buf at the old eof, validate * and zero the new bytes. */ if (uio->uio_offset + n > np->n_size) { - struct buf *bp0 = NULL; - daddr_t bn = np->n_size / biosize; - int off = np->n_size & (biosize - 1); - - if (off && bn < lbn && incore(vp, bn)) - bp0 = nfs_getcacheblk(vp, bn, biosize, p, - BLK_WRITE); + struct nfsbuf *eofbp = NULL; + daddr_t eofbn = np->n_size / biosize; + int eofoff = np->n_size % biosize; + int neweofoff = (uio->uio_offset + n) % biosize; + + FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff); + + if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn)) + eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE); + + /* if we're extending within the same last block */ + /* and the block is flagged as being cached... */ + if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) { + /* ...check that all pages in buffer are valid */ + int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE; + u_int32_t pagemask; + /* pagemask only has to extend to last page being written to */ + pagemask = (1 << (endpg+1)) - 1; + FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0); + if ((bp->nb_valid & pagemask) != pagemask) { + /* zerofill any hole */ + if (on > bp->nb_validend) { + int i; + for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++) + NBPGVALID_SET(bp, i); + NFS_BUF_MAP(bp); + FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e); + bzero((char *)bp->nb_data + bp->nb_validend, + on - bp->nb_validend); + } + /* zerofill any trailing data in the last page */ + if (neweofoff) { + NFS_BUF_MAP(bp); + FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f); + bzero((char *)bp->nb_data + neweofoff, + PAGE_SIZE - (neweofoff & PAGE_MASK)); + } + } + } np->n_flag |= NMODIFIED; np->n_size = uio->uio_offset + n; ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */ - if (bp0) { - bzero((char *)bp0->b_data + off, biosize - off); - bp0->b_validend = biosize; - brelse(bp0); + if (eofbp) { + /* + * We may 
need to zero any previously invalid data + * after the old EOF in the previous EOF buffer. + * + * For the old last page, don't zero bytes if there + * are invalid bytes in that page (i.e. the page isn't + * currently valid). + * For pages after the old last page, zero them and + * mark them as valid. + */ + char *d; + int i; + if (ISSET(vp->v_flag, VNOCACHE_DATA)) + SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL)); + NFS_BUF_MAP(eofbp); + FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e); + d = eofbp->nb_data; + i = eofoff/PAGE_SIZE; + while (eofoff < biosize) { + int poff = eofoff & PAGE_MASK; + if (!poff || NBPGVALID(eofbp,i)) { + bzero(d + eofoff, PAGE_SIZE - poff); + NBPGVALID_SET(eofbp, i); + } + if (bp->nb_validend == eofoff) + bp->nb_validend += PAGE_SIZE - poff; + eofoff += PAGE_SIZE - poff; + i++; + } + nfs_buf_release(eofbp); } } /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->b_wcred == NOCRED) - bp->b_wcred = crdup(cred); - /* * If dirtyend exceeds file size, chop it down. This should * not occur unless there is a race. */ - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > - np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * - DEV_BSIZE; + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); /* - * UBC doesn't (yet) handle partial pages so nfs_biowrite was - * hacked to never bdwrite, to start every little write right - * away. Running IE Avie noticed the performance problem, thus - * this code, which permits those delayed writes by ensuring an - * initial read of the entire page. The read may hit eof - * ("short read") but that we will handle. + * UBC doesn't handle partial pages, so we need to make sure + * that any pages left in the page cache are completely valid. * - * We are quite dependant on the correctness of B_CACHE so check - * that first in case of problems. 
- */ - if (!ISSET(bp->b_flags, B_CACHE) && n < PAGE_SIZE) { - boff = (off_t)bp->b_blkno * DEV_BSIZE; - auio.uio_iov = &iov; - auio.uio_iovcnt = 1; - auio.uio_offset = boff; - auio.uio_resid = PAGE_SIZE; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_procp = p; - iov.iov_base = bp->b_data; - iov.iov_len = PAGE_SIZE; - error = nfs_readrpc(vp, &auio, cred); - if (error) { - bp->b_error = error; - SET(bp->b_flags, B_ERROR); - printf("nfs_write: readrpc %d", error); - } - if (auio.uio_resid > 0) - bzero(iov.iov_base, auio.uio_resid); - bp->b_validoff = 0; - bp->b_validend = PAGE_SIZE - auio.uio_resid; - if (np->n_size > boff + bp->b_validend) - bp->b_validend = min(np->n_size - boff, - PAGE_SIZE); - bp->b_dirtyoff = 0; - bp->b_dirtyend = 0; - } - - /* - * If the new write will leave a contiguous dirty - * area, just update the b_dirtyoff and b_dirtyend, - * otherwise try to extend the dirty region. + * Writes that are smaller than a block are delayed if they + * don't extend to the end of the block. + * + * If the block isn't (completely) cached, we may need to read + * in some parts of pages that aren't covered by the write. + * If the write offset (on) isn't page aligned, we'll need to + * read the start of the first page being written to. Likewise, + * if the offset of the end of the write (on+n) isn't page aligned, + * we'll need to read the end of the last page being written to. + * + * Notes: + * We don't want to read anything we're just going to write over. + * We don't want to issue multiple I/Os if we don't have to + * (because they're synchronous rpcs). + * We don't want to read anything we already have modified in the + * page cache. 
*/ - if (bp->b_dirtyend > 0 && - (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { - off_t start, end; - - boff = (off_t)bp->b_blkno * DEV_BSIZE; - if (on > bp->b_dirtyend) { - start = boff + bp->b_validend; - end = boff + on; - } else { - start = boff + on + n; - end = boff + bp->b_validoff; + if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) { + int firstpg, lastpg, dirtypg; + int firstpgoff, lastpgoff; + start = end = -1; + firstpg = on/PAGE_SIZE; + firstpgoff = on & PAGE_MASK; + lastpg = (on+n-1)/PAGE_SIZE; + lastpgoff = (on+n) & PAGE_MASK; + if (firstpgoff && !NBPGVALID(bp,firstpg)) { + /* need to read start of first page */ + start = firstpg * PAGE_SIZE; + end = start + firstpgoff; + } + if (lastpgoff && !NBPGVALID(bp,lastpg)) { + /* need to read end of last page */ + if (start < 0) + start = (lastpg * PAGE_SIZE) + lastpgoff; + end = (lastpg + 1) * PAGE_SIZE; } - - /* - * It may be that the valid region in the buffer - * covers the region we want, in which case just - * extend the dirty region. Otherwise we try to - * extend the valid region. - */ if (end > start) { + /* need to read the data in range: start...end-1 */ + + /* + * XXX: If we know any of these reads are beyond the + * current EOF (what np->n_size was before we possibly + * just modified it above), we could short-circuit the + * reads and just zero buffer. No need to make a trip + * across the network to read nothing. + */ + + /* first, check for dirty pages in between */ + /* if there are, we'll have to do two reads because */ + /* we don't want to overwrite the dirty pages. */ + for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++) + if (NBPGDIRTY(bp,dirtypg)) + break; + + /* if start is at beginning of page, try */ + /* to get any preceding pages as well.
*/ + if (!(start & PAGE_MASK)) { + /* stop at next dirty/valid page or start of block */ + for (; start > 0; start-=PAGE_SIZE) + if (NBPGVALID(bp,((start-1)/PAGE_SIZE))) + break; + } + + NFS_BUF_MAP(bp); + /* setup uio for read(s) */ + boff = NBOFF(bp); auio.uio_iov = &iov; auio.uio_iovcnt = 1; - auio.uio_offset = start; - auio.uio_resid = end - start; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_procp = p; - iov.iov_base = bp->b_data + (start - boff); - iov.iov_len = end - start; + + if (dirtypg <= (end-1)/PAGE_SIZE) { + /* there's a dirty page in the way, so just do two reads */ + /* we'll read the preceding data here */ + auio.uio_offset = boff + start; + auio.uio_resid = iov.iov_len = on - start; + iov.iov_base = bp->nb_data + start; + error = nfs_readrpc(vp, &auio, cred); + if (error) { + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + printf("nfs_write: readrpc %d", error); + } + if (auio.uio_resid > 0) { + FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01); + bzero(iov.iov_base, auio.uio_resid); + } + /* update validoff/validend if necessary */ + if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) + bp->nb_validoff = start; + if ((bp->nb_validend < 0) || (bp->nb_validend < on)) + bp->nb_validend = on; + if (np->n_size > boff + bp->nb_validend) + bp->nb_validend = min(np->n_size - (boff + start), biosize); + /* validate any pages before the write offset */ + for (; start < on/PAGE_SIZE; start+=PAGE_SIZE) + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* adjust start to read any trailing data */ + start = on+n; + } + + /* if end is at end of page, try to */ + /* get any following pages as well. 
*/ + if (!(end & PAGE_MASK)) { + /* stop at next valid page or end of block */ + for (; end < bufsize; end+=PAGE_SIZE) + if (NBPGVALID(bp,end/PAGE_SIZE)) + break; + } + + /* now we'll read the (rest of the) data */ + auio.uio_offset = boff + start; + auio.uio_resid = iov.iov_len = end - start; + iov.iov_base = bp->nb_data + start; error = nfs_readrpc(vp, &auio, cred); - /* - * If we couldn't read, do not do a VOP_BWRITE - * as originally coded. That could also error - * and looping back to "again" as it was doing - * could have us stuck trying to write same buf - * again. nfs_write, will get the entire region - * if nfs_readrpc succeeded. If unsuccessful - * we should just error out. Errors like ESTALE - * would keep us looping rather than transient - * errors justifying a retry. We can return here - * instead of altering dirty region later. We - * did not write old dirty region at this point. - */ if (error) { - bp->b_error = error; - SET(bp->b_flags, B_ERROR); - printf("nfs_write: readrpc2 %d", error); - brelse(bp); - return (error); + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + printf("nfs_write: readrpc %d", error); } - /* - * The read worked. - * If there was a short read, just zero fill. 
- */ - if (auio.uio_resid > 0) + if (auio.uio_resid > 0) { + FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02); bzero(iov.iov_base, auio.uio_resid); - if (on > bp->b_dirtyend) - bp->b_validend = on; - else - bp->b_validoff = on + n; + } + /* update validoff/validend if necessary */ + if ((bp->nb_validoff < 0) || (bp->nb_validoff > start)) + bp->nb_validoff = start; + if ((bp->nb_validend < 0) || (bp->nb_validend < end)) + bp->nb_validend = end; + if (np->n_size > boff + bp->nb_validend) + bp->nb_validend = min(np->n_size - (boff + start), biosize); + /* validate any pages before the write offset's page */ + for (; start < trunc_page_32(on); start+=PAGE_SIZE) + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* validate any pages after the range of pages being written to */ + for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE) + NBPGVALID_SET(bp, (end-1)/PAGE_SIZE); + /* Note: pages being written to will be validated when written */ } - /* - * We now have a valid region which extends up to the - * dirty region which we want. - */ - if (on > bp->b_dirtyend) - bp->b_dirtyend = on; - else - bp->b_dirtyoff = on + n; } - if (ISSET(bp->b_flags, B_ERROR)) { - error = bp->b_error; - brelse(bp); + + if (ISSET(bp->nb_flags, NB_ERROR)) { + error = bp->nb_error; + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } - /* - * NFS has embedded ucred so crhold() risks zone corruption - */ - if (bp->b_wcred == NOCRED) - bp->b_wcred = crdup(cred); + np->n_flag |= NMODIFIED; /* * Check for valid write lease and get one as required. - * In case getblk() and/or bwrite() delayed us. + * In case nfs_buf_get() and/or nfs_buf_write() delayed us. 
*/ if ((nmp->nm_flag & NFSMNT_NQNFS) && NQNFS_CKINVALID(vp, np, ND_WRITE)) { @@ -767,124 +1960,222 @@ error = nqnfs_getlease(vp, ND_WRITE, cred, p); } while (error == NQNFS_EXPIRED); if (error) { - brelse(bp); + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error); return (error); } if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { - brelse(bp); + nfs_buf_release(bp); error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error); return (error); + } np->n_brev = np->n_lrev; goto again; } } - error = uiomove((char *)bp->b_data + on, n, uio); + NFS_BUF_MAP(bp); + error = uiomove((char *)bp->nb_data + on, n, uio); if (error) { - SET(bp->b_flags, B_ERROR); - brelse(bp); + SET(bp->nb_flags, NB_ERROR); + nfs_buf_release(bp); + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error); return (error); } - if (bp->b_dirtyend > 0) { - bp->b_dirtyoff = min(on, bp->b_dirtyoff); - bp->b_dirtyend = max((on + n), bp->b_dirtyend); + + /* validate any pages written to */ + start = on & ~PAGE_MASK; + for (; start < on+n; start += PAGE_SIZE) { + NBPGVALID_SET(bp, start/PAGE_SIZE); + /* + * This may seem a little weird, but we don't actually set the + * dirty bits for writes. This is because we keep the dirty range + * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for + * delayed writes, when we give the pages back to the VM we don't + * want to keep them marked dirty, because when we later write the + * buffer we won't be able to tell which pages were written dirty + * and which pages were mmapped and dirtied. 
+ */ + } + if (bp->nb_dirtyend > 0) { + bp->nb_dirtyoff = min(on, bp->nb_dirtyoff); + bp->nb_dirtyend = max((on + n), bp->nb_dirtyend); } else { - bp->b_dirtyoff = on; - bp->b_dirtyend = on + n; + bp->nb_dirtyoff = on; + bp->nb_dirtyend = on + n; } - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; + if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff || + bp->nb_validoff > bp->nb_dirtyend) { + bp->nb_validoff = bp->nb_dirtyoff; + bp->nb_validend = bp->nb_dirtyend; } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff); + bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend); } + if (!ISSET(bp->nb_flags, NB_CACHE)) + nfs_buf_normalize_valid_range(np, bp); /* * Since this block is being modified, it must be written * again and not just committed. */ - CLR(bp->b_flags, B_NEEDCOMMIT); - - /* - * If the lease is non-cachable or IO_SYNC do bwrite(). 
- */ - if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { - bp->b_proc = p; - error = VOP_BWRITE(bp); - if (error) + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); + + if ((np->n_flag & NQNFSNONCACHE) || + (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) { + bp->nb_proc = p; + error = nfs_buf_write(bp); + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, + uio->uio_resid, error); return (error); + } if (np->n_flag & NQNFSNONCACHE) { error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) + if (error) { + FSDBG_BOT(515, vp, uio->uio_offset, + uio->uio_resid, error); return (error); + } } - } else if ((n + on) == biosize && - (nmp->nm_flag & NFSMNT_NQNFS) == 0) { - bp->b_proc = (struct proc *)0; - SET(bp->b_flags, B_ASYNC); - (void)nfs_writebp(bp, 0); + } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) { + bp->nb_proc = (struct proc *)0; + SET(bp->nb_flags, NB_ASYNC); + nfs_buf_write(bp); } else - bdwrite(bp); + nfs_buf_write_delayed(bp); + + if (np->n_needcommitcnt > (nbuf/16)) + nfs_flushcommits(vp, p); + } while (uio->uio_resid > 0 && n > 0); + + FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0); return (0); } - /* - * Get an nfs cache block. - * Allocate a new one if the block isn't currently in the cache - * and return the block marked busy. If the calling process is - * interrupted by a signal for an interruptible mount point, return - * NULL. + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. 
*/ -static struct buf * -nfs_getcacheblk(vp, bn, size, p, operation) - struct vnode *vp; - daddr_t bn; - int size; +static int +nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; struct proc *p; - int operation; /* defined in sys/buf.h */ + int slpflag, slptimeo; { - register struct buf *bp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - /*due to getblk/vm interractions, use vm page size or less values */ - int biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); + struct nfsbuf *bp; + struct nfsbuf *nbp, *blist; + int s, error = 0; + struct nfsnode *np = VTONFS(vp); - if (nbdwrite > ((nbuf/4)*3) && operation == BLK_WRITE) { -#define __BUFFERS_RECLAIMED 2 - struct buf *tbp[__BUFFERS_RECLAIMED]; - int i; - - /* too many delayed writes, try to free up some buffers */ - for (i = 0; i < __BUFFERS_RECLAIMED; i++) - tbp[i] = geteblk(512); - - /* Yield to IO thread */ - (void)tsleep((caddr_t)&nbdwrite, PCATCH, "nbdwrite", 1); - - for (i = (__BUFFERS_RECLAIMED - 1); i >= 0; i--) - brelse(tbp[i]); - } - - if (nmp->nm_flag & NFSMNT_INT) { - bp = getblk(vp, bn, size, PCATCH, 0, operation); - while (bp == (struct buf *)0) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return ((struct buf *)0); - bp = getblk(vp, bn, size, 0, 2 * hz, operation); - } - } else - bp = getblk(vp, bn, size, 0, 0, operation); + if (flags & V_SAVE) { + if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + return (error); + if (np->n_dirtyblkhd.lh_first) + panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)", + vp, np->n_dirtyblkhd.lh_first); + } - if( vp->v_type == VREG) - bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE; + for (;;) { + blist = np->n_cleanblkhd.lh_first; + if (!blist) + blist = np->n_dirtyblkhd.lh_first; + if (!blist) + break; - return (bp); + for (bp = blist; bp; bp = nbp) { + nbp = bp->nb_vnbufs.le_next; + s = splbio(); + if (ISSET(bp->nb_flags, NB_BUSY)) { + SET(bp->nb_flags, NB_WANTED); + FSDBG_TOP(556, 
vp, bp, NBOFF(bp), bp->nb_flags); + error = tsleep((caddr_t)bp, + slpflag | (PRIBIO + 1), "nfs_vinvalbuf", + slptimeo); + FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags); + splx(s); + if (error) { + FSDBG(554, vp, bp, -1, error); + return (error); + } + break; + } + FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags); + nfs_buf_remfree(bp); + SET(bp->nb_flags, NB_BUSY); + splx(s); + if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) { + /* XXX extra paranoia: make sure we're not */ + /* somehow leaving any dirty data around */ + int mustwrite = 0; + int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ? + bp->nb_bufsize : (np->n_size - NBOFF(bp)); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error == EINVAL) { + /* vm object must no longer exist */ + /* hopefully we don't need to do */ + /* anything for this buffer */ + } else if (error) + printf("nfs_vinvalbuf: upl setup failed %d\n", + error); + bp->nb_valid = bp->nb_dirty = 0; + } + nfs_buf_upl_check(bp); + /* check for any dirty data before the EOF */ + if (bp->nb_dirtyend && bp->nb_dirtyoff < end) { + /* clip dirty range to EOF */ + if (bp->nb_dirtyend > end) + bp->nb_dirtyend = end; + mustwrite++; + } + bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; + if (bp->nb_dirty) + mustwrite++; + if (mustwrite) { + FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags); + if (!ISSET(bp->nb_flags, NB_PAGELIST)) + panic("nfs_vinvalbuf: dirty buffer without upl"); + /* gotta write out dirty data before invalidating */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + /* (NB_NOCACHE indicates buffer should be discarded) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC)); + SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(cred); + error = nfs_buf_write(bp); + // Note: bp has been released + if (error) { + FSDBG(554, bp, 
0xd00dee, 0xbad, error); + np->n_error = error; + np->n_flag |= NWRITEERR; + error = 0; + } + break; + } + } + SET(bp->nb_flags, NB_INVAL); + nfs_buf_release(bp); + } + } + if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first) + panic("nfs_vinvalbuf: flush failed"); + return (0); } + /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. @@ -902,7 +2193,9 @@ int error = 0, slpflag, slptimeo; int didhold = 0; - if ((nmp->nm_flag & NFSMNT_INT) == 0) + FSDBG_TOP(554, vp, flags, intrflg, 0); + + if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0)) intrflg = 0; if (intrflg) { slpflag = PCATCH; @@ -916,36 +2209,33 @@ */ while (np->n_flag & NFLUSHINPROG) { np->n_flag |= NFLUSHWANT; - error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", - slptimeo); - if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return (EINTR); + FSDBG_TOP(555, vp, flags, intrflg, np->n_flag); + error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); + FSDBG_BOT(555, vp, flags, intrflg, np->n_flag); + if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) { + FSDBG_BOT(554, vp, flags, intrflg, error); + return (error); + } } /* * Now, flush as required. */ np->n_flag |= NFLUSHINPROG; - error = vinvalbuf(vp, flags, cred, p, slpflag, 0); + error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0); while (error) { - /* we seem to be stuck in a loop here if the thread got aborted. - * nfs_flush will return EINTR. Not sure if that will cause - * other consequences due to EINTR having other meanings in NFS - * To handle, no dirty pages, it seems safe to just return from - * here. But if we did have dirty pages, how would we get them - * written out if thread was aborted? Some other strategy is - * necessary. 
-- EKN - */ - if ((intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) || - (error == EINTR && current_thread_aborted())) { + FSDBG(554, vp, 0, 0, error); + error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p); + if (error) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup((caddr_t)&np->n_flag); } - return (EINTR); + FSDBG_BOT(554, vp, flags, intrflg, error); + return (error); } - error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); + error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo); } np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); if (np->n_flag & NFLUSHWANT) { @@ -954,9 +2244,12 @@ } didhold = ubc_hold(vp); if (didhold) { - (void) ubc_clean(vp, 1); /* get the pages out of vm also */ + int rv = ubc_clean(vp, 1); /* get the pages out of vm also */ + if (!rv) + panic("nfs_vinvalbuf(): ubc_clean failed!"); ubc_rele(vp); } + FSDBG_BOT(554, vp, flags, intrflg, 0); return (0); } @@ -967,7 +2260,7 @@ */ int nfs_asyncio(bp, cred) - register struct buf *bp; + struct nfsbuf *bp; struct ucred *cred; { struct nfsmount *nmp; @@ -975,17 +2268,23 @@ int gotiod; int slpflag = 0; int slptimeo = 0; - int error; + int error, error2; if (nfs_numasync == 0) return (EIO); - - nmp = VFSTONFS(bp->b_vp->v_mount); + + FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0); + + nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL); again: - if (nmp->nm_flag & NFSMNT_INT) + if (nmp && nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; gotiod = FALSE; + /* no nfsbuf means tell nfsiod to process delwri list */ + if (!bp) + nfs_ioddelwri = 1; + /* * Find a free iod to process this request. 
*/ @@ -1000,12 +2299,17 @@ i, nmp)); nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = nmp; - nmp->nm_bufqiods++; + if (nmp) + nmp->nm_bufqiods++; wakeup((caddr_t)&nfs_iodwant[i]); gotiod = TRUE; break; } + /* if we're just poking the delwri list, we're done */ + if (!bp) + return (0); + /* * If none are free, we may already have an iod working on this mount * point. If so, it will process our request. @@ -1023,19 +2327,31 @@ * If we have an iod which can process the request, then queue * the buffer. */ + FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods); if (gotiod) { /* * Ensure that the queue never grows too large. */ while (nmp->nm_bufqlen >= 2*nfs_numasync) { + if (ISSET(bp->nb_flags, NB_IOD)) { + /* An nfsiod is attempting this async operation so */ + /* we must not fall asleep on the bufq because we */ + /* could be waiting on ourself. Just return error */ + /* and we'll do this operation synchronously. */ + goto out; + } + FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1); NFS_DPF(ASYNCIO, ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); nmp->nm_bufqwant = TRUE; error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, "nfsaio", slptimeo); if (error) { - if (nfs_sigintr(nmp, NULL, bp->b_proc)) - return (EINTR); + error2 = nfs_sigintr(nmp, NULL, bp->nb_proc); + if (error2) { + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2); + return (error2); + } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; @@ -1052,35 +2368,38 @@ } } - if (ISSET(bp->b_flags, B_READ)) { - if (bp->b_rcred == NOCRED && cred != NOCRED) { + if (ISSET(bp->nb_flags, NB_READ)) { + if (bp->nb_rcred == NOCRED && cred != NOCRED) { /* * NFS has embedded ucred.
* Can not crhold() here as that causes zone corruption */ - bp->b_wcred = crdup(cred); + bp->nb_wcred = crdup(cred); } } - TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); + TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free); nmp->nm_bufqlen++; + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0); return (0); } +out: /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. */ NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); + FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO); return (EIO); } @@ -1090,7 +2409,7 @@ */ int nfs_doio(bp, cr, p) - register struct buf *bp; + struct nfsbuf *bp; struct ucred *cr; struct proc *p; { @@ -1102,7 +2421,7 @@ struct uio uio; struct iovec io; - vp = bp->b_vp; + vp = bp->nb_vp; np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); uiop = &uio; @@ -1111,66 +2430,34 @@ uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = p; - /* - * With UBC, getblk() can return a buf with B_DONE set. - * This indicates that the VM has valid data for that page. - * NFS being stateless, this case poses a problem. - * By definition, the NFS server should always be consulted - * for the data in that page. - * So we choose to clear the B_DONE and to do the IO. - * - * XXX revisit this if there is a performance issue. - * XXX In that case, we could play the attribute cache games ... + /* + * we've decided to perform I/O for this block, + * so we couldn't possibly NB_DONE. So, clear it. */ - if (ISSET(bp->b_flags, B_DONE)) { - if (!ISSET(bp->b_flags, B_ASYNC)) + if (ISSET(bp->nb_flags, NB_DONE)) { + if (!ISSET(bp->nb_flags, NB_ASYNC)) panic("nfs_doio: done and not async"); - CLR(bp->b_flags, B_DONE); + CLR(bp->nb_flags, NB_DONE); } - FSDBG_TOP(256, np->n_size, bp->b_blkno * DEV_BSIZE, bp->b_bcount, - bp->b_flags); - FSDBG(257, bp->b_validoff, bp->b_validend, bp->b_dirtyoff, - bp->b_dirtyend); - /* - * Historically, paging was done with physio, but no more. 
- */ - if (ISSET(bp->b_flags, B_PHYS)) { - /* - * ...though reading /dev/drum still gets us here. - */ - io.iov_len = uiop->uio_resid = bp->b_bcount; - /* mapping was done by vmapbuf() */ - io.iov_base = bp->b_data; - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE; - if (ISSET(bp->b_flags, B_READ)) { - uiop->uio_rw = UIO_READ; - nfsstats.read_physios++; - error = nfs_readrpc(vp, uiop, cr); - } else { - int com; - - iomode = NFSV3WRITE_DATASYNC; - uiop->uio_rw = UIO_WRITE; - nfsstats.write_physios++; - error = nfs_writerpc(vp, uiop, cr, &iomode, &com); - } - if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = error; - } - } else if (ISSET(bp->b_flags, B_READ)) { - io.iov_len = uiop->uio_resid = bp->b_bcount; - io.iov_base = bp->b_data; + FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags); + FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, + bp->nb_dirtyend); + + if (ISSET(bp->nb_flags, NB_READ)) { + if (vp->v_type == VREG) + NFS_BUF_MAP(bp); + io.iov_len = uiop->uio_resid = bp->nb_bufsize; + io.iov_base = bp->nb_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE; + uiop->uio_offset = NBOFF(bp); nfsstats.read_bios++; error = nfs_readrpc(vp, uiop, cr); - FSDBG(262, np->n_size, bp->b_blkno * DEV_BSIZE, - uiop->uio_resid, error); + FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error); if (!error) { - bp->b_validoff = 0; + /* update valid range */ + bp->nb_validoff = 0; if (uiop->uio_resid) { /* * If len > 0, there is a hole in the file and @@ -1178,33 +2465,26 @@ * the server yet. * Just zero fill the rest of the valid area. 
*/ - diff = bp->b_bcount - uiop->uio_resid; - len = np->n_size - ((u_quad_t)bp->b_blkno * DEV_BSIZE + - diff); + diff = bp->nb_bufsize - uiop->uio_resid; + len = np->n_size - (NBOFF(bp) + diff); if (len > 0) { len = min(len, uiop->uio_resid); - bzero((char *)bp->b_data + diff, len); - bp->b_validend = diff + len; + bzero((char *)bp->nb_data + diff, len); + bp->nb_validend = diff + len; FSDBG(258, diff, len, 0, 1); } else - bp->b_validend = diff; + bp->nb_validend = diff; } else - bp->b_validend = bp->b_bcount; - - if (bp->b_validend < bp->b_bufsize) { - /* - * we're about to release a partial buffer after a - * read... the only way we should get here is if - * this buffer contains the EOF before releasing it, - * we'll zero out to the end of the buffer so that - * if a mmap of this page occurs, we'll see zero's - * even if a ftruncate extends the file in the - * meantime - */ - bzero((caddr_t)(bp->b_data + bp->b_validend), - bp->b_bufsize - bp->b_validend); - FSDBG(258, bp->b_validend, - bp->b_bufsize - bp->b_validend, 0, 2); + bp->nb_validend = bp->nb_bufsize; + bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; + if (bp->nb_validend & PAGE_MASK) { + /* valid range ends in the middle of a page so we */ + /* need to zero-fill any invalid data at the end */ + /* of the last page */ + bzero((caddr_t)(bp->nb_data + bp->nb_validend), + bp->nb_bufsize - bp->nb_validend); + FSDBG(258, bp->nb_validend, + bp->nb_bufsize - bp->nb_validend, 0, 2); } } if (p && (vp->v_flag & VTEXT) && @@ -1222,10 +2502,14 @@ uiop->uio_offset = (off_t)0; nfsstats.readlink_bios++; error = nfs_readlinkrpc(vp, uiop, cr); + if (!error) { + bp->nb_validoff = 0; + bp->nb_validend = uiop->uio_offset; + } break; case VDIR: nfsstats.readdir_bios++; - uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; + uiop->uio_offset = NBOFF(bp); if (!(nmp->nm_flag & NFSMNT_NFSV3)) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */ if (nmp->nm_flag & NFSMNT_RDIRPLUS) { @@ -1235,151 
+2519,276 @@ } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = nfs_readdirrpc(vp, uiop, cr); + if (!error) { + bp->nb_validoff = 0; + bp->nb_validend = uiop->uio_offset - NBOFF(bp); + bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1; + } break; default: printf("nfs_doio: type %x unexpected\n", vp->v_type); break; }; if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = error; + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; } + } else { + /* we're doing a write */ + int doff, dend = 0; + + /* We need to make sure the pages are locked before doing I/O. */ + if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) { + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + error = nfs_buf_upl_setup(bp); + if (error) { + printf("nfs_doio: upl create failed %d\n", error); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = EIO; + return (EIO); + } + nfs_buf_upl_check(bp); + } + } + + if (ISSET(bp->nb_flags, NB_WASDIRTY)) { + FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee); + /* + * There are pages marked dirty that need to be written out. + * + * We don't want to just combine the write range with the + * range of pages that are dirty because that could cause us + * to write data that wasn't actually written to. + * We also don't want to write data more than once. + * + * If the dirty range just needs to be committed, we do that. + * Otherwise, we write the dirty range and clear the dirty bits + * for any COMPLETE pages covered by that range. + * If there are dirty pages left after that, we write out the + * parts that we haven't written yet. + */ + } + /* - * mapped I/O may have altered any bytes, so we extend - * the dirty zone to the valid zone. For best performance - * a better solution would be to save & restore page dirty bits - * around the uiomove which brings write-data into the buffer. 
- * Then here we'd check if the page is dirty rather than WASMAPPED - * Also vnode_pager would change - if a page is clean it might - * still need to be written due to DELWRI. + * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not + * an actual write will have to be done. + * If NB_WRITEINPROG is already set, then push it with a write anyhow. */ - if (UBCINFOEXISTS(vp) && ubc_issetflags(vp, UI_WASMAPPED)) { - bp->b_dirtyoff = min(bp->b_dirtyoff, bp->b_validoff); - bp->b_dirtyend = max(bp->b_dirtyend, bp->b_validend); + if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) { + doff = NBOFF(bp) + bp->nb_dirtyoff; + SET(bp->nb_flags, NB_WRITEINPROG); + error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff, + bp->nb_wcred, bp->nb_proc); + CLR(bp->nb_flags, NB_WRITEINPROG); + if (!error) { + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + CLR(bp->nb_flags, NB_NEEDCOMMIT); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } else if (error == NFSERR_STALEWRITEVERF) + nfs_clearcommit(vp->v_mount); } - if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) - bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; - - if (bp->b_dirtyend > bp->b_dirtyoff) { - io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; - uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + - bp->b_dirtyoff; - io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; - uiop->uio_rw = UIO_WRITE; - nfsstats.write_bios++; - if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == - B_ASYNC) + if (!error && bp->nb_dirtyend > 0) { + /* there's a dirty range that needs to be written out */ + u_int32_t pagemask; + int firstpg, lastpg; + + if (NBOFF(bp) + bp->nb_dirtyend > np->n_size) + bp->nb_dirtyend = np->n_size - NBOFF(bp); + + NFS_BUF_MAP(bp); + + doff = bp->nb_dirtyoff; + dend = bp->nb_dirtyend; + + /* if doff page is dirty, move doff to start of page */ + if (NBPGDIRTY(bp,doff/PAGE_SIZE)) + doff -= doff & PAGE_MASK; + /* try to expand write 
range to include preceding dirty pages */ + if (!(doff & PAGE_MASK)) + while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE)) + doff -= PAGE_SIZE; + /* if dend page is dirty, move dend to start of next page */ + if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE)) + dend = round_page_32(dend); + /* try to expand write range to include trailing dirty pages */ + if (!(dend & PAGE_MASK)) + while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE)) + dend += PAGE_SIZE; + /* make sure to keep dend clipped to EOF */ + if (NBOFF(bp) + dend > np->n_size) + dend = np->n_size - NBOFF(bp); + /* calculate range of complete pages being written */ + firstpg = round_page_32(doff) / PAGE_SIZE; + lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE; + /* calculate mask for that page range */ + pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1); + + /* compare page mask to nb_dirty; if there are other dirty pages */ + /* then write FILESYNC; otherwise, write UNSTABLE if async and */ + /* not needcommit/nocache/call; otherwise write FILESYNC */ + if (bp->nb_dirty & ~pagemask) + iomode = NFSV3WRITE_FILESYNC; + else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; - SET(bp->b_flags, B_WRITEINPROG); + + /* write the dirty range */ + io.iov_len = uiop->uio_resid = dend - doff; + uiop->uio_offset = NBOFF(bp) + doff; + io.iov_base = (char *)bp->nb_data + doff; + uiop->uio_rw = UIO_WRITE; + + nfsstats.write_bios++; + + SET(bp->nb_flags, NB_WRITEINPROG); error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); - if (!error && iomode == NFSV3WRITE_UNSTABLE) - SET(bp->b_flags, B_NEEDCOMMIT); - else - CLR(bp->b_flags, B_NEEDCOMMIT); - CLR(bp->b_flags, B_WRITEINPROG); + if (must_commit) + nfs_clearcommit(vp->v_mount); + /* clear dirty bits for pages we've written */ + if (!error) + bp->nb_dirty &= ~pagemask; + /* set/clear needcommit flag */ + if (!error && iomode == NFSV3WRITE_UNSTABLE) 
{ + if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + np->n_needcommitcnt++; + SET(bp->nb_flags, NB_NEEDCOMMIT); + /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ + bp->nb_dirtyoff = doff; + bp->nb_dirtyend = dend; + } else { + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); + } + CLR(bp->nb_flags, NB_WRITEINPROG); /* - * For an interrupted write, the buffer is still valid - * and the write hasn't been pushed to the server yet, - * so we can't set B_ERROR and report the interruption - * by setting B_EINTR. For the B_ASYNC case, B_EINTR - * is not relevant, so the rpc attempt is essentially - * a noop. For the case of a V3 write rpc not being - * committed to stable storage, the block is still - * dirty and requires either a commit rpc or another - * write rpc with iomode == NFSV3WRITE_FILESYNC before - * the block is reused. This is indicated by setting - * the B_DELWRI and B_NEEDCOMMIT flags. + * For an interrupted write, the buffer is still valid and the write + * hasn't been pushed to the server yet, so we can't set NB_ERROR and + * report the interruption by setting NB_EINTR. For the NB_ASYNC case, + * NB_EINTR is not relevant. + * + * For the case of a V3 write rpc not being committed to stable + * storage, the block is still dirty and requires either a commit rpc + * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the + * block is reused. This is indicated by setting the NB_DELWRI and + * NB_NEEDCOMMIT flags. */ - if (error == EINTR || (!error && bp->b_flags & B_NEEDCOMMIT)) { - int s; - - CLR(bp->b_flags, B_INVAL | B_NOCACHE); - if (!ISSET(bp->b_flags, B_DELWRI)) { - SET(bp->b_flags, B_DELWRI); - nbdwrite++; - } - FSDBG(261, bp->b_validoff, bp->b_validend, - bp->b_bufsize, bp->b_bcount); - /* - * Since for the B_ASYNC case, nfs_bwrite() has - * reassigned the buffer to the clean list, we have to - * reassign it back to the dirty one. Ugh. 
- */ - if (ISSET(bp->b_flags, B_ASYNC)) { - s = splbio(); - reassignbuf(bp, vp); - splx(s); - } else { - SET(bp->b_flags, B_EINTR); - } + if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) { + CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE); + if (!ISSET(bp->nb_flags, NB_DELWRI)) { + SET(bp->nb_flags, NB_DELWRI); + nfs_nbdwrite++; + NFSBUFCNTCHK(); + } + FSDBG(261, bp->nb_validoff, bp->nb_validend, + bp->nb_bufsize, 0); + /* + * Since for the NB_ASYNC case, nfs_bwrite() has + * reassigned the buffer to the clean list, we have to + * reassign it back to the dirty one. Ugh. + */ + if (ISSET(bp->nb_flags, NB_ASYNC)) { + /* move to dirty list */ + int s = splbio(); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + splx(s); + } else { + SET(bp->nb_flags, NB_EINTR); + } } else { + /* either there's an error or we don't need to commit */ if (error) { - SET(bp->b_flags, B_ERROR); - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - bp->b_dirtyoff = bp->b_dirtyend = 0; - - /* - * validoff and validend represent the real data present - * in this buffer if validoff is non-zero, than we have - * to invalidate the buffer and kill the page when - * biodone is called... the same is also true when - * validend doesn't extend all the way to the end of the - * buffer and validend doesn't equate to the current - * EOF... 
eventually we need to deal with this in a more - * humane way (like keeping the partial buffer without - * making it immediately available to the VM page cache) - */ - if (bp->b_validoff) - SET(bp->b_flags, B_INVAL); - else - if (bp->b_validend < bp->b_bufsize) { - if ((off_t)bp->b_blkno * DEV_BSIZE + - bp->b_validend == np->n_size) { - bzero((caddr_t)(bp->b_data + - bp->b_validend), - bp->b_bufsize - bp->b_validend); - FSDBG(259, bp->b_validend, - bp->b_bufsize - bp->b_validend, 0, - 0); - } else - SET(bp->b_flags, B_INVAL); + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = np->n_error = error; + np->n_flag |= NWRITEERR; } + /* clear the dirty range */ + bp->nb_dirtyoff = bp->nb_dirtyend = 0; } + } + + if (!error && bp->nb_dirty) { + /* there are pages marked dirty that need to be written out */ + int pg, cnt, npages, off, len; + + nfsstats.write_bios++; + + NFS_BUF_MAP(bp); + + /* + * we do these writes synchronously because we can't really + * support the unstable/needcommit method. We could write + * them unstable, clear the dirty bits, and then commit the + * whole block later, but if we need to rewrite the data, we + * won't have any idea which pages were written because that + * info can't be stored in the nb_dirtyoff/nb_dirtyend. We + * also can't leave the dirty bits set because then we wouldn't + * be able to tell if the pages were re-dirtied between the end + * of the write and the commit.
+ */ + iomode = NFSV3WRITE_FILESYNC; + uiop->uio_rw = UIO_WRITE; - } else { - if (bp->b_validoff || - (bp->b_validend < bp->b_bufsize && - (off_t)bp->b_blkno * DEV_BSIZE + bp->b_validend != - np->n_size)) { - SET(bp->b_flags, B_INVAL); + SET(bp->nb_flags, NB_WRITEINPROG); + npages = bp->nb_bufsize/PAGE_SIZE; + for (pg=0; pg < npages; pg++) { + if (!NBPGDIRTY(bp,pg)) + continue; + cnt = 1; + while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt)) + cnt++; + /* write cnt pages starting with page pg */ + off = pg * PAGE_SIZE; + len = cnt * PAGE_SIZE; + + /* clip writes to EOF */ + if (NBOFF(bp) + off + len > np->n_size) + len -= (NBOFF(bp) + off + len) - np->n_size; + if (len > 0) { + io.iov_len = uiop->uio_resid = len; + uiop->uio_offset = NBOFF(bp) + off; + io.iov_base = (char *)bp->nb_data + off; + error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); + if (must_commit) + nfs_clearcommit(vp->v_mount); + if (error) + break; + } + /* clear dirty bits */ + while (cnt--) { + bp->nb_dirty &= ~(1 << pg); + /* leave pg on last page */ + if (cnt) pg++; + } } - if (bp->b_flags & B_INVAL) { - FSDBG(260, bp->b_validoff, bp->b_validend, - bp->b_bufsize, bp->b_bcount); + if (!error) { + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } + CLR(bp->nb_flags, NB_NEEDCOMMIT); } - bp->b_resid = 0; - biodone(bp); - FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bufsize, + CLR(bp->nb_flags, NB_WRITEINPROG); + FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, np->n_size); - return (0); } - } - bp->b_resid = uiop->uio_resid; - if (must_commit) - nfs_clearcommit(vp->v_mount); - if (bp->b_flags & B_INVAL) { - FSDBG(260, bp->b_validoff, bp->b_validend, bp->b_bufsize, - bp->b_bcount); + if (error) { + SET(bp->nb_flags, NB_ERROR); + bp->nb_error = error; + } } - FSDBG_BOT(256, bp->b_validoff, bp->b_validend, bp->b_bcount, error); - biodone(bp); + FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error); + + 
nfs_buf_iodone(bp); return (error); } diff -urN xnu-344.49/bsd/nfs/nfs_boot.c xnu-517/bsd/nfs/nfs_boot.c --- xnu-344.49/bsd/nfs/nfs_boot.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_boot.c Sat Oct 25 00:25:55 2003 @@ -208,7 +208,7 @@ if (netboot_iaddr(&my_ip) == FALSE) { printf("nfs_boot: networking is not initialized\n"); error = ENXIO; - goto failed; + goto failed_noswitch; } /* get the root path information */ @@ -289,9 +289,10 @@ else { error = 0; } -#endif NO_MOUNT_PRIVATE - failed: +#endif /* NO_MOUNT_PRIVATE */ +failed: thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); +failed_noswitch: return (error); } @@ -328,7 +329,7 @@ goto failed; } } -#endif NO_MOUNT_PRIVATE +#endif /* NO_MOUNT_PRIVATE */ failed: thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); return (error); diff -urN xnu-344.49/bsd/nfs/nfs_lock.c xnu-517/bsd/nfs/nfs_lock.c --- xnu-344.49/bsd/nfs/nfs_lock.c Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/nfs/nfs_lock.c Sat Oct 25 00:25:55 2003 @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp + */ + +#include +#include +#include +#include +#include /* for hz */ +#include +#include +#include +#include /* for hz */ /* Must come after sys/malloc.h */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define OFF_MAX QUAD_MAX + +uint64_t nfsadvlocks = 0; +struct timeval nfsadvlock_longest = {0, 0}; +struct timeval nfsadvlocks_time = {0, 0}; + +pid_t nfslockdpid = 0; +struct file *nfslockdfp = 0; +int nfslockdwaiting = 0; +int nfslockdfifowritten = 0; +int nfslockdfifolock = 0; +#define NFSLOCKDFIFOLOCK_LOCKED 1 +#define NFSLOCKDFIFOLOCK_WANT 2 + +/* + * XXX + * We have to let the process know if the call succeeded. I'm using an extra + * field in the uu_nlminfo field in the uthread structure, as it is already used for + * lockd stuff. + */ + +/* + * nfs_dolock -- + * NFS advisory byte-level locks. + */ +int +nfs_dolock(struct vop_advlock_args *ap) +/* struct vop_advlock_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; +}; */ +{ + LOCKD_MSG msg; + struct nameidata nd; + struct vnode *vp, *wvp; + struct nfsnode *np; + int error, error1; + struct flock *fl; + int fmode, ioflg; + struct proc *p; + struct uthread *ut; + struct timeval elapsed; + struct nfsmount *nmp; + struct vattr vattr; + off_t start, end; + + ut = get_bsdthread_info(current_act()); + p = current_proc(); + + vp = ap->a_vp; + fl = ap->a_fl; + np = VTONFS(vp); + + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + if (nmp->nm_flag & NFSMNT_NOLOCKS) + return (EOPNOTSUPP); + + /* + * The NLM protocol doesn't allow the server to return an error + * on ranges, so we do it. Pre LFS (Large File Summit) + * standards required EINVAL for the range errors.
More recent + * standards use EOVERFLOW, but their EINVAL wording still + * encompasses these errors. + * Any code sensitive to this is either: + * 1) written pre-LFS and so can handle only EINVAL, or + * 2) written post-LFS and thus ought to be tolerant of pre-LFS + * implementations. + * Since returning EOVERFLOW certainly breaks 1), we return EINVAL. + */ + if (fl->l_whence != SEEK_END) { + if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) || + fl->l_start < 0 || + (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) || + (fl->l_len < 0 && fl->l_start + fl->l_len < 0)) + return (EINVAL); + } + /* + * If daemon is running take a ref on its fifo + */ + if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) { + if (!nfslockdwaiting) + return (EOPNOTSUPP); + /* + * Don't wake lock daemon if it hasn't been started yet and + * this is an unlock request (since we couldn't possibly + * actually have a lock on the file). This could be an + * uninformed unlock request due to closef()'s behavior of doing + * unlocks on all files if a process has had a lock on ANY file. + */ + if (!nfslockdfp && (fl->l_type == F_UNLCK)) + return (EINVAL); + /* wake up lock daemon */ + (void)wakeup((void *)&nfslockdwaiting); + /* wait on nfslockdfp for a while to allow daemon to start */ + tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz); + /* check for nfslockdfp and f_data */ + if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) + return (EOPNOTSUPP); + } + VREF(wvp); + /* + * if there is no nfsowner table yet, allocate one. + */ + if (ut->uu_nlminfo == NULL) { + if (ap->a_op == F_UNLCK) { + vrele(wvp); + return (0); + } + MALLOC(ut->uu_nlminfo, struct nlminfo *, + sizeof(struct nlminfo), M_LOCKF, M_WAITOK | M_ZERO); + ut->uu_nlminfo->pid_start = p->p_stats->p_start; + } + /* + * Fill in the information structure. 
+ */ + msg.lm_version = LOCKD_MSG_VERSION; + msg.lm_msg_ident.pid = p->p_pid; + msg.lm_msg_ident.ut = ut; + msg.lm_msg_ident.pid_start = ut->uu_nlminfo->pid_start; + msg.lm_msg_ident.msg_seq = ++(ut->uu_nlminfo->msg_seq); + + /* + * The NFS Lock Manager protocol doesn't directly handle + * negative lengths or SEEK_END, so we need to normalize + * things here where we have all the info. + * (Note: SEEK_CUR is already adjusted for at this point) + */ + /* Convert the flock structure into a start and end. */ + switch (fl->l_whence) { + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * to fl->l_start when SEEK_CUR is used. + */ + start = fl->l_start; + break; + case SEEK_END: + /* need to flush, and refetch attributes to make */ + /* sure we have the correct end of file offset */ + if (np->n_flag & NMODIFIED) { + np->n_attrstamp = 0; + error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1); + if (error) { + vrele(wvp); + return (error); + } + } + np->n_attrstamp = 0; + error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); + if (error) { + vrele(wvp); + return (error); + } + start = np->n_size + fl->l_start; + break; + default: + vrele(wvp); + return (EINVAL); + } + if (fl->l_len == 0) + end = -1; + else if (fl->l_len > 0) + end = start + fl->l_len - 1; + else { /* l_len is negative */ + end = start - 1; + start += fl->l_len; + } + if (start < 0) { + vrele(wvp); + return (EINVAL); + } + + msg.lm_fl = *fl; + msg.lm_fl.l_start = start; + if (end != -1) + msg.lm_fl.l_len = end - start + 1; + + msg.lm_wait = ap->a_flags & F_WAIT; + msg.lm_getlk = ap->a_op == F_GETLK; + + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + vrele(wvp); + return (ENXIO); + } + + bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg.lm_addr, + min(sizeof msg.lm_addr, + mtod(nmp->nm_nam, struct sockaddr *)->sa_len)); + msg.lm_fh_len = NFS_ISV3(vp) ? 
VTONFS(vp)->n_fhsize : NFSX_V2FH; + bcopy(VTONFS(vp)->n_fhp, msg.lm_fh, msg.lm_fh_len); + msg.lm_nfsv3 = NFS_ISV3(vp); + cru2x(p->p_ucred, &msg.lm_cred); + + microuptime(&ut->uu_nlminfo->nlm_lockstart); + + fmode = FFLAGS(O_WRONLY); + if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) { + vrele(wvp); + return (error); + } + ++wvp->v_writecount; + +#define IO_NOMACCHECK 0; + ioflg = IO_UNIT | IO_NOMACCHECK; + for (;;) { + VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE); + + while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) { + nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT; + if (tsleep((void *)&nfslockdfifolock, PCATCH | PUSER, "lockdfifo", 20*hz)) + break; + } + nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED; + + error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)&msg, sizeof(msg), 0, + UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p); + + nfslockdfifowritten = 1; + + nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED; + if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) { + nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT; + wakeup((void *)&nfslockdfifolock); + } + /* wake up lock daemon */ + if (nfslockdwaiting) + (void)wakeup((void *)&nfslockdwaiting); + + if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) { + break; + } + /* + * If we're locking a file, wait for an answer. Unlocks succeed + * immediately. + */ + if (fl->l_type == F_UNLCK) + /* + * XXX this isn't exactly correct. The client side + * needs to continue sending its unlock until + * it gets a response back. + */ + break; + + /* + * retry after 20 seconds if we haven't gotten a response yet. + * This number was picked out of thin air... but is longer + * than even a reasonably loaded system should take (at least + * on a local network). XXX Probably should use a back-off + * scheme. + */ + if ((error = tsleep((void *)ut->uu_nlminfo, + PCATCH | PUSER, "lockd", 20*hz)) != 0) { + if (error == EWOULDBLOCK) { + /* + * We timed out, so we rewrite the request + * to the fifo, but only if it isn't already + * full.
+ */ + ioflg |= IO_NDELAY; + continue; + } + + break; + } + + if (msg.lm_getlk && ut->uu_nlminfo->retcode == 0) { + if (ut->uu_nlminfo->set_getlk) { + fl->l_pid = ut->uu_nlminfo->getlk_pid; + fl->l_start = ut->uu_nlminfo->getlk_start; + fl->l_len = ut->uu_nlminfo->getlk_len; + fl->l_whence = SEEK_SET; + } else { + fl->l_type = F_UNLCK; + } + } + error = ut->uu_nlminfo->retcode; + break; + } + + /* XXX stats */ + nfsadvlocks++; + microuptime(&elapsed); + timevalsub(&elapsed, &ut->uu_nlminfo->nlm_lockstart); + if (timevalcmp(&elapsed, &nfsadvlock_longest, >)) + nfsadvlock_longest = elapsed; + timevaladd(&nfsadvlocks_time, &elapsed); + timerclear(&ut->uu_nlminfo->nlm_lockstart); + + error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p); + /* prefer any previous 'error' to our vn_close 'error1'. */ + return (error != 0 ? error : error1); +} + +/* + * nfslockdans -- + * NFS advisory byte-level locks answer from the lock daemon. + */ +int +nfslockdans(struct proc *p, struct lockd_ans *ansp) +{ + struct proc *targetp; + struct uthread *targetut, *uth; + int error; + + /* + * Let root, or someone who once was root (lockd generally + * switches to the daemon uid once it is done setting up) make + * this call. + * + * XXX This authorization check is probably not right. + */ + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 && + p->p_cred->p_svuid != 0) + return (error); + + /* the version should match, or we're out of sync */ + if (ansp->la_vers != LOCKD_ANS_VERSION) + return (EINVAL); + + /* Find the process & thread */ + if ((targetp = pfind(ansp->la_msg_ident.pid)) == NULL) + return (ESRCH); + targetut = ansp->la_msg_ident.ut; + TAILQ_FOREACH(uth, &targetp->p_uthlist, uu_list) { + if (uth == targetut) + break; + } + /* + * Verify the pid hasn't been reused (if we can), and it isn't waiting + * for an answer from a more recent request. 
We return an EPIPE if + * the match fails, because we've already used ESRCH above, and this + * is sort of like writing on a pipe after the reader has closed it. + * If only the seq# is off, don't return an error just return. It could + * just be a response to a retransmitted request. + */ + if (uth == NULL || uth != targetut || targetut->uu_nlminfo == NULL) + return (EPIPE); + if (ansp->la_msg_ident.msg_seq != -1) { + if (timevalcmp(&targetut->uu_nlminfo->pid_start, + &ansp->la_msg_ident.pid_start, !=)) + return (EPIPE); + if (targetut->uu_nlminfo->msg_seq != ansp->la_msg_ident.msg_seq) + return (0); + } + + /* Found the thread, so set its return errno and wake it up. */ + + targetut->uu_nlminfo->retcode = ansp->la_errno; + targetut->uu_nlminfo->set_getlk = ansp->la_getlk_set; + targetut->uu_nlminfo->getlk_pid = ansp->la_getlk_pid; + targetut->uu_nlminfo->getlk_start = ansp->la_getlk_start; + targetut->uu_nlminfo->getlk_len = ansp->la_getlk_len; + + (void)wakeup((void *)targetut->uu_nlminfo); + + return (0); +} + +/* + * nfslockdfd -- + * NFS advisory byte-level locks: fifo file# from the lock daemon. + */ +int +nfslockdfd(struct proc *p, int fd) +{ + int error; + struct file *fp, *ofp; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + if (fd < 0) { + fp = 0; + } else { + error = getvnode(p, fd, &fp); + if (error) + return (error); + (void)fref(fp); + } + ofp = nfslockdfp; + nfslockdfp = fp; + if (ofp) + (void)frele(ofp); + nfslockdpid = nfslockdfp ? 
p->p_pid : 0; + (void)wakeup((void *)&nfslockdfp); + return (0); +} + +/* + * nfslockdwait -- + * lock daemon waiting for lock request + */ +int +nfslockdwait(struct proc *p) +{ + int error; + struct file *fp, *ofp; + + if (p->p_pid != nfslockdpid) { + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + } + if (nfslockdwaiting) + return (EBUSY); + if (nfslockdfifowritten) { + nfslockdfifowritten = 0; + return (0); + } + + nfslockdwaiting = 1; + tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0); + nfslockdwaiting = 0; + + return (0); +} diff -urN xnu-344.49/bsd/nfs/nfs_lock.h xnu-517/bsd/nfs/nfs_lock.h --- xnu-344.49/bsd/nfs/nfs_lock.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/nfs/nfs_lock.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,102 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from nfs_lock.h,v 2.2 1998/04/28 19:38:41 don Exp + * $FreeBSD$ + */ + +#include + +#ifdef __APPLE_API_PRIVATE + +/* + * lockd uses the nfsclnt system call for the unique kernel services it needs. + * It passes in a request structure with a version number at the start. + * This prevents libc from needing to change if the information passed + * between lockd and the kernel needs to change. + * + * If a structure changes, you must bump the version number. + */ + +#include + +/* + * The fifo where the kernel writes requests for locks on remote NFS files, + * and where lockd reads these requests. Note this is no longer hardwired + * in the kernel binary - lockd passes the file descriptor down via nfsclnt() + */ +#define _PATH_LCKFIFO "/var/run/nfslockd" + +/* + * This structure is used to uniquely identify the process which originated + * a particular message to lockd. A sequence number is used to differentiate + * multiple messages from the same process. A process start time is used to + * detect the unlikely, but possible, event of the recycling of a pid. + */ +struct lockd_msg_ident { + pid_t pid; /* The process ID. */ + struct timeval pid_start; /* Start time of process id */ + int msg_seq; /* Sequence number of message */ + struct uthread *ut; +}; + +#define LOCKD_MSG_VERSION 2 + +/* + * The structure that the kernel hands us for each lock request. 
+ */ +typedef struct __lock_msg { + int lm_version; /* which version is this */ + struct lockd_msg_ident lm_msg_ident; /* originator of the message */ + struct flock lm_fl; /* The lock request. */ + int lm_wait; /* The F_WAIT flag. */ + int lm_getlk; /* is this a F_GETLK request */ + struct sockaddr_storage lm_addr; /* The address. */ + int lm_nfsv3; /* If NFS version 3. */ + size_t lm_fh_len; /* The file handle length. */ + struct xucred lm_cred; /* user cred for lock req */ + u_int8_t lm_fh[NFS_SMALLFH];/* The file handle. */ +} LOCKD_MSG; + +#define LOCKD_ANS_VERSION 1 + +struct lockd_ans { + int la_vers; + struct lockd_msg_ident la_msg_ident; /* originator of the message */ + int la_errno; + int la_getlk_set; /* use returned getlk values */ + int la_getlk_pid; /* returned pid for F_GETLK */ + off_t la_getlk_start; /* returned starting offset */ + off_t la_getlk_len; /* returned length */ +}; + +#ifdef KERNEL +int nfs_dolock(struct vop_advlock_args *ap); +int nfslockdans(struct proc *p, struct lockd_ans *ansp); +int nfslockdfd(struct proc *p, int fd); +int nfslockdwait(struct proc *p); +#endif +#endif /* __APPLE_API_PRIVATE */ diff -urN xnu-344.49/bsd/nfs/nfs_node.c xnu-517/bsd/nfs/nfs_node.c --- xnu-344.49/bsd/nfs/nfs_node.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_node.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -77,10 +77,6 @@ #include #include -#ifdef MALLOC_DEFINE -static MALLOC_DEFINE(M_NFSNODE, "NFS node", "NFS vnode private part"); -#endif - LIST_HEAD(nfsnodehashhead, nfsnode) *nfsnodehashtbl; u_long nfsnodehash; @@ -137,19 +133,27 @@ register struct vnode *vp; struct vnode *nvp; int error; + struct mount *mp; /* Check for unmount in progress */ - if (mntp->mnt_kern_flag & MNTK_UNMOUNT) { + if (!mntp || (mntp->mnt_kern_flag & MNTK_UNMOUNT)) { *npp = 0; - return (EPERM); + return (!mntp ? ENXIO : EPERM); } nhpp = NFSNOHASH(nfs_hash(fhp, fhsize)); loop: for (np = nhpp->lh_first; np != 0; np = np->n_hash.le_next) { - if (mntp != NFSTOV(np)->v_mount || np->n_fhsize != fhsize || + mp = (np->n_flag & NINIT) ? np->n_mount : NFSTOV(np)->v_mount; + if (mntp != mp || np->n_fhsize != fhsize || bcmp((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize)) continue; + /* if the node is still being initialized, sleep on it */ + if (np->n_flag & NINIT) { + np->n_flag |= NWINIT; + tsleep(np, PINOD, "nfsngt", 0); + goto loop; + } vp = NFSTOV(np); if (vget(vp, LK_EXCLUSIVE, p)) goto loop; @@ -170,29 +174,19 @@ nfs_node_hash_lock = 1; /* - * Do the MALLOC before the getnewvnode since doing so afterward - * might cause a bogus v_data pointer to get dereferenced - * elsewhere if MALLOC should block. + * allocate and initialize nfsnode and stick it in the hash + * before calling getnewvnode(). Anyone finding it in the + * hash before initialization is complete will wait for it. 
*/ MALLOC_ZONE(np, struct nfsnode *, sizeof *np, M_NFSNODE, M_WAITOK); - - error = getnewvnode(VT_NFS, mntp, nfsv2_vnodeop_p, &nvp); - if (error) { - if (nfs_node_hash_lock < 0) - wakeup(&nfs_node_hash_lock); - nfs_node_hash_lock = 0; - *npp = 0; - FREE_ZONE(np, sizeof *np, M_NFSNODE); - return (error); - } - vp = nvp; bzero((caddr_t)np, sizeof *np); - vp->v_data = np; - np->n_vnode = vp; - /* - * Insert the nfsnode in the hash queue for its new file handle - */ - LIST_INSERT_HEAD(nhpp, np, n_hash); + np->n_flag |= NINIT; + np->n_mount = mntp; + lockinit(&np->n_lock, PINOD, "nfsnode", 0, 0); + /* lock the new nfsnode */ + lockmgr(&np->n_lock, LK_EXCLUSIVE, NULL, p); + + /* Insert the nfsnode in the hash queue for its new file handle */ if (fhsize > NFS_SMALLFH) { MALLOC_ZONE(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK); @@ -200,16 +194,36 @@ np->n_fhp = &np->n_fh; bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize); np->n_fhsize = fhsize; - *npp = np; + LIST_INSERT_HEAD(nhpp, np, n_hash); + np->n_flag |= NHASHED; + /* release lock on hash table */ if (nfs_node_hash_lock < 0) wakeup(&nfs_node_hash_lock); nfs_node_hash_lock = 0; - /* - * Lock the new nfsnode. 
- */ - error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + /* now, attempt to get a new vnode */ + error = getnewvnode(VT_NFS, mntp, nfsv2_vnodeop_p, &nvp); + if (error) { + LIST_REMOVE(np, n_hash); + np->n_flag &= ~NHASHED; + if (np->n_fhsize > NFS_SMALLFH) + FREE_ZONE((caddr_t)np->n_fhp, np->n_fhsize, M_NFSBIGFH); + FREE_ZONE(np, sizeof *np, M_NFSNODE); + *npp = 0; + return (error); + } + vp = nvp; + vp->v_data = np; + np->n_vnode = vp; + *npp = np; + + /* node is now initialized, check if anyone's waiting for it */ + np->n_flag &= ~NINIT; + if (np->n_flag & NWINIT) { + np->n_flag &= ~NWINIT; + wakeup((caddr_t)np); + } return (error); } @@ -243,35 +257,17 @@ #if DIAGNOSTIC kprintf("nfs_inactive removing %s, dvp=%x, a_vp=%x, ap=%x, np=%x, sp=%x\n", &sp->s_name[0], (unsigned)sp->s_dvp, (unsigned)ap->a_vp, (unsigned)ap, (unsigned)np, (unsigned)sp); #endif - /* - * We get a reference (vget) to ensure getnewvnode() - * doesn't recycle vp while we're asleep awaiting I/O. - * Note we don't need the reference unless usecount is - * already zero. In the case of a forcible unmount it - * wont be zero and doing a vget would fail because - * vclean holds VXLOCK. - */ - if (ap->a_vp->v_usecount > 0) { - VREF(ap->a_vp); - } else if (vget(ap->a_vp, 0, ap->a_p)) - panic("nfs_inactive: vget failed"); (void) nfs_vinvalbuf(ap->a_vp, 0, sp->s_cred, p, 1); np->n_size = 0; ubc_setsize(ap->a_vp, (off_t)0); - - /* We have a problem. The dvp could have gone away on us while - * in the unmount path. Thus it appears as VBAD and we cannot - * use it. If we tried locking the parent (future), for silly - * rename files, it is unclear where we would lock. The unmount - * code just pulls unlocked vnodes as it goes thru its list and - * yanks them. Could unmount be smarter to see if a busy reg vnode has - * a parent, and not yank it yet? Put in more passes at unmount - * time? In the meantime, just check if it went away on us. 
- * Could have gone away during the nfs_vinvalbuf or ubc_setsize - * which block. Or perhaps even before nfs_inactive got called. - */ - if ((sp->s_dvp)->v_type != VBAD) - nfs_removeit(sp); /* uses the dvp */ + nfs_removeit(sp); + /* + * remove nfsnode from hash now so we can't accidentally find it + * again if another object gets created with the same filehandle + * before this vnode gets reclaimed + */ + LIST_REMOVE(np, n_hash); + np->n_flag &= ~NHASHED; cred = sp->s_cred; if (cred != NOCRED) { sp->s_cred = NOCRED; @@ -279,10 +275,9 @@ } vrele(sp->s_dvp); FREE_ZONE((caddr_t)sp, sizeof (struct sillyrename), M_NFSREQ); - vrele(ap->a_vp); } np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT | NQNFSEVICTED | - NQNFSNONCACHE | NQNFSWRITE); + NQNFSNONCACHE | NQNFSWRITE | NHASHED); VOP_UNLOCK(ap->a_vp, 0, ap->a_p); return (0); } @@ -305,7 +300,10 @@ if (prtactive && vp->v_usecount != 0) vprint("nfs_reclaim: pushing active", vp); - LIST_REMOVE(np, n_hash); + if (np->n_flag & NHASHED) { + LIST_REMOVE(np, n_hash); + np->n_flag &= ~NHASHED; + } /* * In case we block during FREE_ZONEs below, get the entry out @@ -397,23 +395,4 @@ { return (lockstatus(&VTONFS(ap->a_vp)->n_lock)); -} - - -/* - * Nfs abort op, called after namei() when a CREATE/DELETE isn't actually - * done. Currently nothing to do. - */ -/* ARGSUSED */ -int -nfs_abortop(ap) - struct vop_abortop_args /* { - struct vnode *a_dvp; - struct componentname *a_cnp; - } */ *ap; -{ - - if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) - FREE_ZONE(ap->a_cnp->cn_pnbuf, ap->a_cnp->cn_pnlen, M_NAMEI); - return (0); } diff -urN xnu-344.49/bsd/nfs/nfs_nqlease.c xnu-517/bsd/nfs/nfs_nqlease.c --- xnu-344.49/bsd/nfs/nfs_nqlease.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_nqlease.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -429,8 +429,10 @@ { register struct nqlease *tlp; time_t newexpiry; + struct timeval now; - newexpiry = time.tv_sec + duration + nqsrv_clockskew; + microtime(&now); + newexpiry = now.tv_sec + duration + nqsrv_clockskew; if (lp->lc_expiry == newexpiry) return; if (lp->lc_timer.cqe_next != 0) { @@ -523,7 +525,7 @@ caddr_t bpos, cp; u_long xid, *tl; int len = 1, ok = 1, i = 0; - int sotype, *solockp; + int sotype, solock; while (ok && (lph->lph_flag & LC_VALID)) { if (nqsrv_cmpnam(slp, nam, lph)) @@ -547,10 +549,7 @@ } else goto nextone; sotype = so->so_type; - if (so->so_proto->pr_flags & PR_CONNREQUIRED) - solockp = &lph->lph_slp->ns_solock; - else - solockp = (int *)0; + solock = (so->so_proto->pr_flags & PR_CONNREQUIRED); nfsm_reqhead((struct vnode *)0, NQNFSPROC_EVICTED, NFSX_V3FH + NFSX_UNSIGNED); fhp = &nfh.fh_generic; @@ -583,15 +582,13 @@ } if (((lph->lph_flag & (LC_UDP | LC_CLTP)) == 0 && (lph->lph_slp->ns_flag & SLP_VALID) == 0) || - (solockp && (*solockp & NFSMNT_SNDLOCK))) + (solock && nfs_slplock(lph->lph_slp, 0) == 0)) { m_freem(m); - else { - if (solockp) - *solockp |= NFSMNT_SNDLOCK; + } else { (void) nfs_send(so, nam2, m, (struct nfsreq *)0); - if (solockp) - nfs_sndunlock(solockp); + if (solock) + nfs_slpunlock(lph->lph_slp); } if (lph->lph_flag & LC_UDP) MFREE(nam2, m); @@ -623,9 +620,11 @@ register int i; struct nqm *lphnext; int len, ok; + struct timeval now; tryagain: - if (time.tv_sec > lp->lc_expiry) + microtime(&now); + if (now.tv_sec > lp->lc_expiry) return; lph = &lp->lc_host; lphnext = lp->lc_morehosts; @@ -669,10 +668,12 @@ struct nqm *lphnext, *olphnext; struct mbuf *n; int i, len, ok; + struct timeval now; + microtime(&now); for (lp = nqtimerhead.cqh_first; lp != (void *)&nqtimerhead; lp = nextlp) { - if (lp->lc_expiry >= time.tv_sec) + if (lp->lc_expiry >= now.tv_sec) break; nextlp = lp->lc_timer.cqe_next; if (lp->lc_flag & LC_EXPIREDWANTED) { @@ -717,7 +718,7 @@ nfsrv_slpderef(lph->lph_slp); if 
(++i == len) { if (olphnext) { - _FREE_ZONE((caddr_t)olphnext, + FREE_ZONE((caddr_t)olphnext, sizeof (struct nqm), M_NQMHOST); olphnext = (struct nqm *)0; @@ -736,7 +737,7 @@ FREE_ZONE((caddr_t)lp, sizeof (struct nqlease), M_NQLEASE); if (olphnext) - _FREE_ZONE((caddr_t)olphnext, + FREE_ZONE((caddr_t)olphnext, sizeof (struct nqm), M_NQMHOST); nfsstats.srvnqnfs_leases--; } @@ -896,8 +897,9 @@ register caddr_t cp; register long t1, t2; register struct nfsnode *np; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; caddr_t bpos, dpos, cp2; + struct timeval now; time_t reqtime; int error = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; @@ -905,6 +907,10 @@ u_quad_t frev; u_int64_t xid; + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + nfsstats.rpccnt[NQNFSPROC_GETLEASE]++; mb = mreq = nfsm_reqh(vp, NQNFSPROC_GETLEASE, NFSX_V3FH+2*NFSX_UNSIGNED, &bpos); @@ -912,16 +918,24 @@ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(rwflag); *tl = txdr_unsigned(nmp->nm_leaseterm); - reqtime = time.tv_sec; + microtime(&now); + reqtime = now.tv_sec; nfsm_request(vp, NQNFSPROC_GETLEASE, p, cred, &xid); np = VTONFS(vp); nfsm_dissect(tl, u_long *, 4 * NFSX_UNSIGNED); cachable = fxdr_unsigned(int, *tl++); reqtime += fxdr_unsigned(int, *tl++); - if (reqtime > time.tv_sec) { - fxdr_hyper(tl, &frev); - nqnfs_clientlease(nmp, np, rwflag, cachable, reqtime, frev); - nfsm_loadattr(vp, (struct vattr *)0, &xid); + microtime(&now); + if (reqtime > now.tv_sec) { + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + error = ENXIO; + } else { + fxdr_hyper(tl, &frev); + nqnfs_clientlease(nmp, np, rwflag, cachable, + reqtime, frev); + nfsm_loadattr(vp, (struct vattr *)0, &xid); + } } else error = NQNFS_EXPIRED; nfsm_reqdone; @@ -947,8 +961,12 @@ struct mbuf *mreq, *mb, *mb2, *mheadend; struct nfsmount *nmp; struct nfsreq myrep; + int connrequired; + int *flagp; nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); nfsstats.rpccnt[NQNFSPROC_VACATED]++; 
nfsm_reqhead(vp, NQNFSPROC_VACATED, NFSX_FH(1)); nfsm_fhtom(vp, 1); @@ -968,11 +986,15 @@ } myrep.r_flags = 0; myrep.r_nmp = nmp; - if (nmp->nm_soflags & PR_CONNREQUIRED) - (void) nfs_sndlock(&nmp->nm_flag, (struct nfsreq *)0); + + connrequired = (nmp->nm_soflags & PR_CONNREQUIRED); + if (connrequired) + (void) nfs_sndlock(&myrep); + (void) nfs_send(nmp->nm_so, nmp->nm_nam, m, &myrep); - if (nmp->nm_soflags & PR_CONNREQUIRED) - nfs_sndunlock(&nmp->nm_flag); + + if (connrequired) + nfs_sndunlock(&myrep); nfsmout: return (error); } @@ -1060,17 +1082,19 @@ struct nfsuid *nuidp, *nnuidp; int error = 0, vpid; register struct nfsreq *rp; + struct timeval now; /* * First initialize some variables */ + microtime(&now); /* * If an authorization string is being passed in, get it. */ if ((flag & NFSSVC_GOTAUTH) && - (nmp->nm_flag & (NFSMNT_WAITAUTH | NFSMNT_DISMNT)) == 0) { - if (nmp->nm_flag & NFSMNT_HASAUTH) + (nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_DISMNT)) == 0) { + if (nmp->nm_state & NFSSTA_HASAUTH) panic("cld kerb"); if ((flag & NFSSVC_AUTHINFAIL) == 0) { if (ncd->ncd_authlen <= nmp->nm_authlen && @@ -1084,18 +1108,18 @@ nmp->nm_key = ncd->ncd_key; #endif } else - nmp->nm_flag |= NFSMNT_AUTHERR; + nmp->nm_state |= NFSSTA_AUTHERR; } else - nmp->nm_flag |= NFSMNT_AUTHERR; - nmp->nm_flag |= NFSMNT_HASAUTH; + nmp->nm_state |= NFSSTA_AUTHERR; + nmp->nm_state |= NFSSTA_HASAUTH; wakeup((caddr_t)&nmp->nm_authlen); } else - nmp->nm_flag |= NFSMNT_WAITAUTH; + nmp->nm_state |= NFSSTA_WAITAUTH; /* * Loop every second updating queue until there is a termination sig. 
*/ - while ((nmp->nm_flag & NFSMNT_DISMNT) == 0) { + while ((nmp->nm_state & NFSSTA_DISMNT) == 0) { if (nmp->nm_flag & NFSMNT_NQNFS) { /* * If there are no outstanding requests (and therefore no @@ -1116,10 +1140,10 @@ */ np = nmp->nm_timerhead.cqh_first; while (np != (void *)&nmp->nm_timerhead && - (nmp->nm_flag & NFSMNT_DISMINPROG) == 0) { + (nmp->nm_state & NFSSTA_DISMINPROG) == 0) { vp = NFSTOV(np); vpid = vp->v_id; - if (np->n_expiry < time.tv_sec) { + if (np->n_expiry < now.tv_sec) { if (vget(vp, LK_EXCLUSIVE, p) == 0) { nmp->nm_inprog = vp; if (vpid == vp->v_id) { @@ -1144,9 +1168,9 @@ vrele(vp); nmp->nm_inprog = NULLVP; } - } else if ((np->n_expiry - NQ_RENEWAL) < time.tv_sec) { + } else if ((np->n_expiry - NQ_RENEWAL) < now.tv_sec) { if ((np->n_flag & (NQNFSWRITE | NQNFSNONCACHE)) - == NQNFSWRITE && vp->v_dirtyblkhd.lh_first && + == NQNFSWRITE && np->n_dirtyblkhd.lh_first && vget(vp, LK_EXCLUSIVE, p) == 0) { nmp->nm_inprog = vp; if (vpid == vp->v_id && @@ -1166,10 +1190,10 @@ /* * Get an authorization string, if required. */ - if ((nmp->nm_flag & (NFSMNT_WAITAUTH | NFSMNT_DISMNT | NFSMNT_HASAUTH)) == 0) { + if ((nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_DISMNT | NFSSTA_HASAUTH)) == 0) { ncd->ncd_authuid = nmp->nm_authuid; if (copyout((caddr_t)ncd, argp, sizeof (struct nfsd_cargs))) - nmp->nm_flag |= NFSMNT_WAITAUTH; + nmp->nm_state |= NFSSTA_WAITAUTH; else return (ENEEDAUTH); } @@ -1177,8 +1201,8 @@ /* * Wait a bit (no pun) and do it again. 
*/ - if ((nmp->nm_flag & NFSMNT_DISMNT) == 0 && - (nmp->nm_flag & (NFSMNT_WAITAUTH | NFSMNT_HASAUTH))) { + if ((nmp->nm_state & NFSSTA_DISMNT) == 0 && + (nmp->nm_state & (NFSSTA_WAITAUTH | NFSSTA_HASAUTH))) { error = tsleep((caddr_t)&nmp->nm_authstr, PSOCK | PCATCH, "nqnfstimr", hz / 3); if (error == EINTR || error == ERESTART) @@ -1193,7 +1217,7 @@ nnuidp = nuidp->nu_lru.tqe_next; LIST_REMOVE(nuidp, nu_hash); TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); - _FREE_ZONE((caddr_t)nuidp, sizeof (struct nfsuid), M_NFSUID); + FREE_ZONE((caddr_t)nuidp, sizeof (struct nfsuid), M_NFSUID); } /* * Loop through outstanding request list and remove dangling @@ -1202,7 +1226,7 @@ for (rp = nfs_reqq.tqh_first; rp; rp = rp->r_chain.tqe_next) if (rp->r_nmp == nmp) rp->r_nmp = (struct nfsmount *)0; - _FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); + FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); if (error == EWOULDBLOCK) error = 0; return (error); diff -urN xnu-344.49/bsd/nfs/nfs_serv.c xnu-517/bsd/nfs/nfs_serv.c --- xnu-344.49/bsd/nfs/nfs_serv.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_serv.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -99,8 +99,6 @@ #include #include -#include - #include #include #include @@ -485,6 +483,7 @@ nqsrv_getl(ndp->ni_startdir, ND_READ); vrele(ndp->ni_startdir); FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; vp = ndp->ni_vp; bzero((caddr_t)fhp, sizeof(nfh)); fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; @@ -645,6 +644,7 @@ struct vattr va, *vap = &va; off_t off; u_quad_t frev; + int didhold = 0; fhp = &nfh.fh_generic; nfsm_srvmtofh(fhp); @@ -745,6 +745,7 @@ uiop->uio_resid = cnt; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; + didhold = ubc_hold(vp); error = VOP_READ(vp, uiop, IO_NODELOCKED, cred); off = uiop->uio_offset; FREE((caddr_t)iv2, M_TEMP); @@ -754,17 +755,25 @@ * that alone. m_freem(mreq) looks bogus. Taking it out. Should be * mrep or not there at all. Causes panic. ekn */ if (error || (getret = VOP_GETATTR(vp, vap, cred, procp))) { + VOP_UNLOCK(vp, 0, procp); + if (didhold) + ubc_rele(vp); if (!error) error = getret; /* m_freem(mreq);*/ - vput(vp); + vrele(vp); nfsm_reply(NFSX_POSTOPATTR(v3)); nfsm_srvpostop_attr(getret, vap); return (0); } - } else + VOP_UNLOCK(vp, 0, procp); + if (didhold) + ubc_rele(vp); + vrele(vp); + } else { uiop->uio_resid = 0; - vput(vp); + vput(vp); + } nfsm_srvfillattr(vap, fp); len -= uiop->uio_resid; tlen = nfsm_rndup(len); @@ -817,6 +826,7 @@ struct uio io, *uiop = &io; off_t off; u_quad_t frev; + int didhold = 0; if (mrep == NULL) { *mrq = NULL; @@ -933,12 +943,16 @@ uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_procp = (struct proc *)0; uiop->uio_offset = off; + didhold = ubc_hold(vp); error = VOP_WRITE(vp, uiop, ioflags, cred); nfsstats.srvvop_writes++; FREE((caddr_t)iv, M_TEMP); } aftat_ret = VOP_GETATTR(vp, vap, cred, procp); - vput(vp); + VOP_UNLOCK(vp, 0, procp); + if (didhold) + ubc_rele(vp); + vrele(vp); if (!error) error = aftat_ret; nfsm_reply(NFSX_PREOPATTR(v3) + NFSX_POSTOPORFATTR(v3) + @@ -1003,6 +1017,8 @@ struct vnode *vp; 
struct uio io, *uiop = &io; u_quad_t frev, cur_usec; + int didhold; + struct timeval now; #ifndef nolint i = 0; @@ -1020,7 +1036,8 @@ LIST_INIT(&nfsd->nd_coalesce); nfsd->nd_mreq = NULL; nfsd->nd_stable = NFSV3WRITE_FILESYNC; - cur_usec = (u_quad_t)time.tv_sec * 1000000 + (u_quad_t)time.tv_usec; + microuptime(&now); + cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec; nfsd->nd_time = cur_usec + (v3 ? nfsrvw_procrastinate_v3 : nfsrvw_procrastinate); @@ -1136,7 +1153,8 @@ * and generate the associated reply mbuf list(s). */ loop1: - cur_usec = (u_quad_t)time.tv_sec * 1000000 + (u_quad_t)time.tv_usec; + microuptime(&now); + cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec; s = splsoftclock(); for (nfsd = slp->ns_tq.lh_first; nfsd; nfsd = owp) { owp = nfsd->nd_tq.le_next; @@ -1182,6 +1200,7 @@ uiop->uio_procp = (struct proc *)0; uiop->uio_offset = nfsd->nd_off; uiop->uio_resid = nfsd->nd_eoff - nfsd->nd_off; + didhold = 0; if (uiop->uio_resid > 0) { mp = mrep; i = 0; @@ -1204,6 +1223,7 @@ mp = mp->m_next; } if (!error) { + didhold = ubc_hold(vp); error = VOP_WRITE(vp, uiop, ioflags, cred); nfsstats.srvvop_writes++; } @@ -1212,7 +1232,10 @@ m_freem(mrep); if (vp) { aftat_ret = VOP_GETATTR(vp, &va, cred, procp); - vput(vp); + VOP_UNLOCK(vp, 0, procp); + if (didhold) + ubc_rele(vp); + vrele(vp); } /* @@ -1503,6 +1526,7 @@ nfsrv_object_create(nd.ni_vp); FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; if (exclusive_flag) { exclusive_flag = 0; VATTR_NULL(vap); @@ -1519,8 +1543,9 @@ if (vap->va_type != VFIFO && (error = suser(cred, (u_short *)0))) { vrele(nd.ni_startdir); - _FREE_ZONE(nd.ni_cnd.cn_pnbuf, + FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); vput(nd.ni_dvp); nfsm_reply(0); @@ -1537,13 +1562,15 @@ nd.ni_cnd.cn_proc = procp; nd.ni_cnd.cn_cred = cred; if ((error = lookup(&nd))) { - 
_FREE_ZONE(nd.ni_cnd.cn_pnbuf, + FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; nfsm_reply(0); } nfsrv_object_create(nd.ni_vp); FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; if (nd.ni_cnd.cn_flags & ISSYMLINK) { vrele(nd.ni_dvp); vput(nd.ni_vp); @@ -1553,8 +1580,9 @@ } } else { vrele(nd.ni_startdir); - _FREE_ZONE(nd.ni_cnd.cn_pnbuf, + FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); vput(nd.ni_dvp); error = ENXIO; @@ -1562,7 +1590,8 @@ vp = nd.ni_vp; } else { vrele(nd.ni_startdir); - _FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; vp = nd.ni_vp; if (nd.ni_dvp == vp) vrele(nd.ni_dvp); @@ -1614,14 +1643,15 @@ nfsm_build(fp, struct nfs_fattr *, NFSX_V2FATTR); nfsm_srvfillattr(vap, fp); } - return (error); + return (0); nfsmout: if (dirp) vrele(dirp); if (nd.ni_cnd.cn_nameiop) { vrele(nd.ni_startdir); - _FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, + FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; } VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) @@ -1685,8 +1715,9 @@ vtyp = nfsv3tov_type(*tl); if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) { vrele(nd.ni_startdir); - _FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, + FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; error = NFSERR_BADTYPE; VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); vput(nd.ni_dvp); @@ -1706,8 +1737,9 @@ */ if (nd.ni_vp) { vrele(nd.ni_startdir); - _FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, + FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; error = EEXIST; VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); vput(nd.ni_dvp); @@ -1721,11 +1753,13 @@ if (!error) 
FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; } else { if (vtyp != VFIFO && (error = suser(cred, (u_short *)0))) { vrele(nd.ni_startdir); - _FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, + FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); vput(nd.ni_dvp); goto out; @@ -1741,6 +1775,7 @@ nd.ni_cnd.cn_cred = procp->p_ucred; error = lookup(&nd); FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; if (error) goto out; if (nd.ni_cnd.cn_flags & ISSYMLINK) { @@ -1774,8 +1809,9 @@ vrele(dirp); if (nd.ni_cnd.cn_nameiop) { vrele(nd.ni_startdir); - _FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, + FREE_ZONE((caddr_t)nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; } VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) @@ -2039,6 +2075,7 @@ } vrele(tond.ni_startdir); FREE_ZONE(tond.ni_cnd.cn_pnbuf, tond.ni_cnd.cn_pnlen, M_NAMEI); + tond.ni_cnd.cn_flags &= ~HASBUF; out1: if (fdirp) { fdiraft_ret = VOP_GETATTR(fdirp, &fdiraft, cred, procp); @@ -2050,6 +2087,7 @@ } vrele(fromnd.ni_startdir); FREE_ZONE(fromnd.ni_cnd.cn_pnbuf, fromnd.ni_cnd.cn_pnlen, M_NAMEI); + fromnd.ni_cnd.cn_flags &= ~HASBUF; nfsm_reply(2 * NFSX_WCCDATA(v3)); if (v3) { nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); @@ -2065,11 +2103,13 @@ if (tond.ni_cnd.cn_nameiop) { vrele(tond.ni_startdir); FREE_ZONE(tond.ni_cnd.cn_pnbuf, tond.ni_cnd.cn_pnlen, M_NAMEI); + tond.ni_cnd.cn_flags &= ~HASBUF; } if (fromnd.ni_cnd.cn_nameiop) { vrele(fromnd.ni_startdir); FREE_ZONE(fromnd.ni_cnd.cn_pnbuf, fromnd.ni_cnd.cn_pnlen, M_NAMEI); + fromnd.ni_cnd.cn_flags &= ~HASBUF; VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); vrele(fromnd.ni_dvp); vrele(fvp); @@ -2249,7 +2289,8 @@ *(pathcp + len2) = '\0'; if (nd.ni_vp) { vrele(nd.ni_startdir); - _FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + 
FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); @@ -2283,6 +2324,7 @@ } else vrele(nd.ni_startdir); FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; } out: if (pathcp) @@ -2303,7 +2345,8 @@ nfsmout: if (nd.ni_cnd.cn_nameiop) { vrele(nd.ni_startdir); - _FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + FREE_ZONE(nd.ni_cnd.cn_pnbuf, nd.ni_cnd.cn_pnlen, M_NAMEI); + nd.ni_cnd.cn_flags &= ~HASBUF; } if (dirp) vrele(dirp); @@ -2550,13 +2593,11 @@ * example, client NFS does not { although it is never remote mounted * anyhow } * The alternate call nfsrv_readdirplus() does lookups as well. - * PS: The NFS protocol spec. does not clarify what the "count" byte - * argument is a count of.. just name strings and file id's or the - * entire reply rpc or ... - * I tried just file name and id sizes and it confused the Sun client, - * so I am using the full rpc size now. The "paranoia.." comment refers - * to including the status longwords that are not a part of the dir. - * "entry" structures, but are in the rpc. + * PS: The XNFS protocol spec clearly describes what the "count"s arguments + * are supposed to cover. For readdir, the count is the total number of + * bytes included in everything from the directory's postopattr through + * the EOF flag. For readdirplus, the maxcount is the same, and the + * dircount includes all that except for the entry attributes and handles. 
*/ struct flrep { nfsuint64 fl_off; @@ -2754,13 +2795,14 @@ goto again; } - len = 3 * NFSX_UNSIGNED; /* paranoia, probably can be 0 */ nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_COOKIEVERF(v3) + siz); if (v3) { + len = NFSX_V3POSTOPATTR + NFSX_V3COOKIEVERF + 2 * NFSX_UNSIGNED; nfsm_srvpostop_attr(getret, &at); nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); txdr_hyper(&at.va_filerev, tl); - } + } else + len = 2 * NFSX_UNSIGNED; mp = mp2 = mb; bp = bpos; be = bp + M_TRAILINGSPACE(mp); @@ -3090,7 +3132,7 @@ * are calculated conservatively, including all * XDR overheads. */ - len += (7 * NFSX_UNSIGNED + nlen + rem + NFSX_V3FH + + len += (8 * NFSX_UNSIGNED + nlen + rem + NFSX_V3FH + NFSX_V3POSTOPATTR); dirlen += (6 * NFSX_UNSIGNED + nlen + rem); if (len > cnt || dirlen > fullsiz) { @@ -3211,6 +3253,7 @@ char *cp2; struct mbuf *mb, *mb2, *mreq; u_quad_t frev, off; + int didhold; #ifndef nolint cache = 0; @@ -3233,9 +3276,13 @@ return (0); } for_ret = VOP_GETATTR(vp, &bfor, cred, procp); + didhold = ubc_hold(vp); error = VOP_FSYNC(vp, cred, MNT_WAIT, procp); aft_ret = VOP_GETATTR(vp, &aft, cred, procp); - vput(vp); + VOP_UNLOCK(vp, 0, procp); + if (didhold) + ubc_rele(vp); + vrele(vp); nfsm_reply(NFSX_V3WCCDATA + NFSX_V3WRITEVERF); nfsm_srvwcc_data(for_ret, &bfor, aft_ret, &aft); if (!error) { @@ -3316,7 +3363,7 @@ sfp->sf_afiles.nfsuquad[1] = txdr_unsigned(sf->f_ffree); sfp->sf_invarsec = 0; } else { - sfp->sf_tsize = txdr_unsigned(NFS_MAXDGRAMDATA); + sfp->sf_tsize = txdr_unsigned(NFS_V2MAXDATA); sfp->sf_bsize = txdr_unsigned(sf->f_bsize); sfp->sf_blocks = txdr_unsigned(sf->f_blocks); sfp->sf_bfree = txdr_unsigned(sf->f_bfree); @@ -3343,7 +3390,7 @@ register struct nfsv3_fsinfo *sip; register long t1; caddr_t bpos; - int error = 0, rdonly, cache, getret = 1, pref; + int error = 0, rdonly, cache, getret = 1, pref, max; char *cp2; struct mbuf *mb, *mb2, *mreq; struct vnode *vp; @@ -3372,16 +3419,16 @@ /* * XXX * There should be file system VFS OP(s) to get this information. 
- * For now, assume ufs. + * For now, assume our usual NFS defaults. */ if (slp->ns_so->so_type == SOCK_DGRAM) - pref = NFS_MAXDGRAMDATA; + max = pref = NFS_MAXDGRAMDATA; else - pref = NFS_MAXDATA; - sip->fs_rtmax = txdr_unsigned(NFS_MAXDATA); + max = pref = NFS_MAXDATA; + sip->fs_rtmax = txdr_unsigned(max); sip->fs_rtpref = txdr_unsigned(pref); sip->fs_rtmult = txdr_unsigned(NFS_FABLKSIZE); - sip->fs_wtmax = txdr_unsigned(NFS_MAXDATA); + sip->fs_wtmax = txdr_unsigned(max); sip->fs_wtpref = txdr_unsigned(pref); sip->fs_wtmult = txdr_unsigned(NFS_FABLKSIZE); sip->fs_dtpref = txdr_unsigned(pref); @@ -3414,7 +3461,7 @@ register long t1; caddr_t bpos; int error = 0, rdonly, cache, getret = 1, linkmax, namemax; - int chownres, notrunc; + int chownres, notrunc, case_sensitive, case_preserving; char *cp2; struct mbuf *mb, *mb2, *mreq; struct vnode *vp; @@ -3441,6 +3488,10 @@ error = VOP_PATHCONF(vp, _PC_CHOWN_RESTRICTED, &chownres); if (!error) error = VOP_PATHCONF(vp, _PC_NO_TRUNC, ¬runc); + if (!error) + error = VOP_PATHCONF(vp, _PC_CASE_SENSITIVE, &case_sensitive); + if (!error) + error = VOP_PATHCONF(vp, _PC_CASE_PRESERVING, &case_preserving); getret = VOP_GETATTR(vp, &at, cred, procp); vput(vp); nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3PATHCONF); @@ -3453,14 +3504,9 @@ pc->pc_namemax = txdr_unsigned(namemax); pc->pc_notrunc = txdr_unsigned(notrunc); pc->pc_chownrestricted = txdr_unsigned(chownres); + pc->pc_caseinsensitive = txdr_unsigned(!case_sensitive); + pc->pc_casepreserving = txdr_unsigned(case_preserving); - /* - * These should probably be supported by VOP_PATHCONF(), but - * until msdosfs is exportable (why would you want to?), the - * Unix defaults should be ok. 
- */ - pc->pc_caseinsensitive = nfs_false; - pc->pc_casepreserving = nfs_true; nfsm_srvdone; } diff -urN xnu-344.49/bsd/nfs/nfs_socket.c xnu-517/bsd/nfs/nfs_socket.c --- xnu-344.49/bsd/nfs/nfs_socket.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_socket.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -178,12 +178,17 @@ int nfsrtton = 0; struct nfsrtt nfsrtt; -static int nfs_msg __P((struct proc *,char *,char *)); +static int nfs_msg __P((struct proc *, const char *, const char *, int)); +static void nfs_up(struct nfsreq *, const char *, int); +static void nfs_down(struct nfsreq *, const char *, int); static int nfs_rcvlock __P((struct nfsreq *)); -static void nfs_rcvunlock __P((int *flagp)); +static void nfs_rcvunlock __P((struct nfsreq *)); static int nfs_receive __P((struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)); static int nfs_reconnect __P((struct nfsreq *rep)); +static void nfs_repbusy(struct nfsreq *rep); +static struct nfsreq * nfs_repnext(struct nfsreq *rep); +static void nfs_repdequeue(struct nfsreq *rep); #ifndef NFS_NOSERVER static int nfsrv_getstream __P((struct nfssvc_sock *,int)); @@ -428,19 +433,24 @@ } splx(s); } + /* + * Always time out on recieve, this allows us to reconnect the + * socket to deal with network changes. + */ + so->so_rcv.sb_timeo = (2 * hz); if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) { - so->so_rcv.sb_timeo = (5 * hz); so->so_snd.sb_timeo = (5 * hz); } else { - so->so_rcv.sb_timeo = 0; so->so_snd.sb_timeo = 0; } if (nmp->nm_sotype == SOCK_DGRAM) { - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2; - rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * 2; + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * + (nmp->nm_readahead > 0 ? 
nmp->nm_readahead + 1 : 2); } else if (nmp->nm_sotype == SOCK_SEQPACKET) { - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2; - rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * 2; + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * + (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2); } else { if (nmp->nm_sotype != SOCK_STREAM) panic("nfscon sotype"); @@ -450,6 +460,7 @@ int val; bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_KEEPALIVE; sopt.sopt_val = &val; @@ -462,6 +473,7 @@ int val; bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = &val; @@ -470,12 +482,15 @@ sosetopt(so, &sopt); } - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) - * 2; - rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) - * 2; + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) * + (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2); } + if (sndreserve > NFS_MAXSOCKBUF) + sndreserve = NFS_MAXSOCKBUF; + if (rcvreserve > NFS_MAXSOCKBUF) + rcvreserve = NFS_MAXSOCKBUF; error = soreserve(so, sndreserve, rcvreserve); if (error) { goto bad; @@ -492,7 +507,7 @@ nmp->nm_sdrtt[3] = 0; nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ nmp->nm_sent = 0; - FSDBG(529, nmp, nmp->nm_flag, nmp->nm_soflags, nmp->nm_cwnd); + FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd); nmp->nm_timeouts = 0; return (0); @@ -523,6 +538,9 @@ while ((error = nfs_connect(nmp, rep))) { if (error == EINTR || error == ERESTART) return (EINTR); + if (error == EIO) + return (EIO); + nfs_down(rep, "can not connect", error); (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); } @@ -531,7 +549,7 @@ * Loop through outstanding request list and fix up all requests * on old socket. 
*/ - for (rp = nfs_reqq.tqh_first; rp != 0; rp = rp->r_chain.tqe_next) { + TAILQ_FOREACH(rp, &nfs_reqq, r_chain) { if (rp->r_nmp == nmp) rp->r_flags |= R_MUSTRESEND; } @@ -578,15 +596,16 @@ struct nfsreq *rep; { struct sockaddr *sendnam; - int error, soflags, flags; + int error, error2, soflags, flags; int xidqueued = 0; struct nfsreq *rp; char savenametolog[MNAMELEN]; if (rep) { - if (rep->r_flags & R_SOFTTERM) { + error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp); + if (error) { m_freem(top); - return (EINTR); + return (error); } if ((so = rep->r_nmp->nm_so) == NULL) { rep->r_flags |= R_MUSTRESEND; @@ -595,7 +614,7 @@ } rep->r_flags &= ~R_MUSTRESEND; soflags = rep->r_nmp->nm_soflags; - for (rp = nfs_reqq.tqh_first; rp; rp = rp->r_chain.tqe_next) + TAILQ_FOREACH(rp, &nfs_reqq, r_chain) if (rp == rep) break; if (rp) @@ -634,8 +653,7 @@ if (error) { if (rep) { if (xidqueued) { - for (rp = nfs_reqq.tqh_first; rp; - rp = rp->r_chain.tqe_next) + TAILQ_FOREACH(rp, &nfs_reqq, r_chain) if (rp == rep && rp->r_xid == xidqueued) break; if (!rp) @@ -647,9 +665,10 @@ /* * Deal with errors for the client side. */ - if (rep->r_flags & R_SOFTTERM) - error = EINTR; - else { + error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp); + if (error2) { + error = error2; + } else { rep->r_flags |= R_MUSTRESEND; NFS_DPF(DUP, ("nfs_send RESEND error=%d\n", error)); @@ -660,9 +679,10 @@ /* * Handle any recoverable (soft) socket errors here. (???) */ - if (error != EINTR && error != ERESTART && - error != EWOULDBLOCK && error != EPIPE) + if (error != EINTR && error != ERESTART && error != EIO && + error != EWOULDBLOCK && error != EPIPE) { error = 0; + } } return (error); } @@ -692,7 +712,7 @@ struct sockaddr *tmp_nam; struct mbuf *mhck; struct sockaddr_in *sin; - int error, sotype, rcvflg; + int error, error2, sotype, rcvflg; struct proc *p = current_proc(); /* XXX */ /* @@ -711,7 +731,7 @@ * until we have an entire rpc request/reply. 
*/ if (sotype != SOCK_DGRAM) { - error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); + error = nfs_sndlock(rep); if (error) return (error); tryagain: @@ -724,15 +744,17 @@ * attempt that has essentially shut down this * mount point. */ - if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) { - nfs_sndunlock(&rep->r_nmp->nm_flag); + if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) { + nfs_sndunlock(rep); + if (error) + return (error); return (EINTR); } so = rep->r_nmp->nm_so; if (!so) { error = nfs_reconnect(rep); if (error) { - nfs_sndunlock(&rep->r_nmp->nm_flag); + nfs_sndunlock(rep); return (error); } goto tryagain; @@ -751,13 +773,13 @@ if (error) { if (error == EINTR || error == ERESTART || (error = nfs_reconnect(rep))) { - nfs_sndunlock(&rep->r_nmp->nm_flag); + nfs_sndunlock(rep); return (error); } goto tryagain; } } - nfs_sndunlock(&rep->r_nmp->nm_flag); + nfs_sndunlock(rep); if (sotype == SOCK_STREAM) { aio.iov_base = (caddr_t) &len; aio.iov_len = sizeof(u_long); @@ -773,12 +795,13 @@ thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); error = soreceive(so, (struct sockaddr **)0, &auio, (struct mbuf **)0, (struct mbuf **)0, &rcvflg); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); if (!rep->r_nmp) /* if unmounted then bailout */ goto shutout; if (error == EWOULDBLOCK && rep) { - if (rep->r_flags & R_SOFTTERM) - return (EINTR); + error2 = nfs_sigintr(rep->r_nmp, rep, p); + if (error2) + error = error2; } } while (error == EWOULDBLOCK); if (!error && auio.uio_resid > 0) { @@ -844,16 +867,18 @@ rcvflg = 0; error = soreceive(so, (struct sockaddr **)0, &auio, mp, &control, &rcvflg); + if (control) + m_freem(control); if (!rep->r_nmp) /* if unmounted then bailout */ { thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); goto shutout; } - if (control) - m_freem(control); if (error == EWOULDBLOCK && rep) { - if (rep->r_flags & R_SOFTTERM) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - 
return (EINTR); + error2 = nfs_sigintr(rep->r_nmp, rep, p); + if (error2) { + thread_funnel_switch(NETWORK_FUNNEL, + KERNEL_FUNNEL); + return (error2); } } } while (error == EWOULDBLOCK || @@ -876,15 +901,29 @@ "receive error %d from nfs server %s\n", error, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); - error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); + error = nfs_sndlock(rep); if (!error) error = nfs_reconnect(rep); if (!error) goto tryagain; } } else { - if ((so = rep->r_nmp->nm_so) == NULL) - return (EACCES); + /* + * We could have failed while rebinding the datagram socket + * so we need to attempt to rebind here. + */ + if ((so = rep->r_nmp->nm_so) == NULL) { + error = nfs_sndlock(rep); + if (!error) { + error = nfs_reconnect(rep); + nfs_sndunlock(rep); + } + if (error) + return (error); + if (!rep->r_nmp) /* if unmounted then bailout */ + return (ENXIO); + so = rep->r_nmp->nm_so; + } if (so->so_state & SS_ISCONNECTED) getnam = (struct sockaddr **)0; else @@ -907,18 +946,44 @@ FREE(*getnam, M_SONAME); *aname = mhck; } - if (!rep->r_nmp) /* if unmounted then bailout */ { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - goto shutout; - } - - if (error == EWOULDBLOCK && - (rep->r_flags & R_SOFTTERM)) { - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - return (EINTR); + if (!rep->r_nmp) /* if unmounted then bailout */ + goto dgramout; + if (error) { + error2 = nfs_sigintr(rep->r_nmp, rep, p); + if (error2) { + error = error2; + goto dgramout; + } + } + /* Reconnect for all errors. We may be receiving + * soft/hard/blocking errors because of a network + * change. + * XXX: we should rate limit or delay this + * to once every N attempts or something. + * although TCP doesn't seem to. 
+ */ + if (error) { + thread_funnel_switch(NETWORK_FUNNEL, + KERNEL_FUNNEL); + error2 = nfs_sndlock(rep); + if (!error2) { + error2 = nfs_reconnect(rep); + if (error2) + error = error2; + else if (!rep->r_nmp) /* if unmounted then bailout */ + error = ENXIO; + else + so = rep->r_nmp->nm_so; + nfs_sndunlock(rep); + } else { + error = error2; + } + thread_funnel_switch(KERNEL_FUNNEL, + NETWORK_FUNNEL); } } while (error == EWOULDBLOCK); +dgramout: thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); len -= auio.uio_resid; } @@ -976,7 +1041,7 @@ * would hang trying to nfs_receive an already received reply. */ if (myrep->r_mrep != NULL) { - nfs_rcvunlock(&nmp->nm_flag); + nfs_rcvunlock(myrep); FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1); return (0); } @@ -985,20 +1050,22 @@ * is still intact by checks done in nfs_rcvlock. */ error = nfs_receive(myrep, &nam, &mrep); + if (nam) + m_freem(nam); /* * Bailout asap if nfsmount struct gone (unmounted). */ if (!myrep->r_nmp || !nmp->nm_so) { FSDBG(530, myrep->r_xid, myrep, nmp, -2); - return (ECONNABORTED); + return (ENXIO); } if (error) { FSDBG(530, myrep->r_xid, myrep, nmp, error); - nfs_rcvunlock(&nmp->nm_flag); + nfs_rcvunlock(myrep); /* Bailout asap if nfsmount struct gone (unmounted). */ if (!myrep->r_nmp || !nmp->nm_so) - return (ECONNABORTED); + return (ENXIO); /* * Ignore routing errors on connectionless protocols?? 
@@ -1011,8 +1078,6 @@ } return (error); } - if (nam) - m_freem(nam); /* * We assume all is fine, but if we did not have an error @@ -1029,7 +1094,7 @@ */ if (!mrep) { FSDBG(530, myrep->r_xid, myrep, nmp, -3); - return (ECONNABORTED); /* sounds good */ + return (ENXIO); /* sounds good */ } /* @@ -1053,8 +1118,8 @@ m_freem(mrep); #endif nfsmout: - if (nmp->nm_flag & NFSMNT_RCVLOCK) - nfs_rcvunlock(&nmp->nm_flag); + if (nmp->nm_state & NFSSTA_RCVLOCK) + nfs_rcvunlock(myrep); if (myrep->r_flags & R_GETONEREP) return (0); /* this path used by NQNFS */ continue; @@ -1064,13 +1129,17 @@ * Loop through the request list to match up the reply * Iff no match, just drop the datagram */ - for (rep = nfs_reqq.tqh_first; rep != 0; - rep = rep->r_chain.tqe_next) { + TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { if (rep->r_mrep == NULL && rxid == rep->r_xid) { /* Found it.. */ rep->r_mrep = mrep; rep->r_md = md; rep->r_dpos = dpos; + /* + * If we're tracking the round trip time + * then we update the circular log here + * with the stats from our current request. + */ if (nfsrtton) { struct rttl *rt; @@ -1084,7 +1153,7 @@ rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; - rt->tstamp = time; + microtime(&rt->tstamp); // XXX unused if (rep->r_flags & R_TIMING) rt->rtt = rep->r_rtt; else @@ -1105,11 +1174,10 @@ if (nmp->nm_cwnd > NFS_MAXCWND) nmp->nm_cwnd = NFS_MAXCWND; } - if (!(rep->r_flags & R_SENT)) - printf("nfs_reply: unsent xid=%x", - rep->r_xid); - rep->r_flags &= ~R_SENT; - nmp->nm_sent -= NFS_CWNDSCALE; + if (rep->r_flags & R_SENT) { + rep->r_flags &= ~R_SENT; + nmp->nm_sent -= NFS_CWNDSCALE; + } /* * Update rtt using a gain of 0.125 on the mean * and a gain of 0.25 on the deviation. @@ -1137,7 +1205,7 @@ break; } } - nfs_rcvunlock(&nmp->nm_flag); + nfs_rcvunlock(myrep); /* * If not matched to a request, drop it. * If it's mine, get out. 
@@ -1179,7 +1247,7 @@ caddr_t *dposp; u_int64_t *xidp; { - register struct mbuf *m, *mrep; + register struct mbuf *m, *mrep, *m2; register struct nfsreq *rep, *rp; register u_long *tl; register int i; @@ -1196,33 +1264,33 @@ u_quad_t frev; char *auth_str, *verf_str; NFSKERBKEY_T key; /* save session key */ + int nmsotype; + struct timeval now; if (xidp) *xidp = 0; - nmp = VFSTONFS(vp->v_mount); + MALLOC_ZONE(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); - FSDBG_TOP(531, vp, procnum, nmp, rep); - - /* - * make sure if we blocked above, that the file system didn't get - * unmounted leaving nmp bogus value to trip on later and crash. - * Note nfs_unmount will set rep->r_nmp if unmounted volume, but we - * aren't that far yet. SO this is best we can do. I wanted to check - * for vp->v_mount = 0 also below, but that caused reboot crash. - * Something must think it's okay for vp-v_mount=0 during booting. - * Thus the best I can do here is see if we still have a vnode. - */ - if (vp->v_type == VBAD) { - FSDBG_BOT(531, 1, vp, nmp, rep); - _FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); - return (EINVAL); + nmp = VFSTONFS(vp->v_mount); + if (nmp == NULL || + (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) == + (NFSSTA_FORCE|NFSSTA_TIMEO)) { + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + return (ENXIO); } + nmsotype = nmp->nm_sotype; + + FSDBG_TOP(531, vp, procnum, nmp, rep); + rep->r_nmp = nmp; rep->r_vp = vp; rep->r_procp = procp; rep->r_procnum = procnum; + microuptime(&now); + rep->r_lastmsg = now.tv_sec - + ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay)); i = 0; m = mrest; while (m) { @@ -1235,6 +1303,12 @@ * Get the RPC header with authorization. 
*/ kerbauth: + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + FSDBG_BOT(531, error, rep->r_xid, nmp, rep); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + return (ENXIO); + } verf_str = auth_str = (char *)0; if (nmp->nm_flag & NFSMNT_KERB) { verf_str = nickv; @@ -1243,11 +1317,22 @@ bzero((caddr_t)key, sizeof (key)); if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str, &auth_len, verf_str, verf_len)) { + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + FSDBG_BOT(531, 2, vp, error, rep); + FREE_ZONE((caddr_t)rep, + sizeof (struct nfsreq), M_NFSREQ); + m_freem(mrest); + return (ENXIO); + } error = nfs_getauth(nmp, rep, cred, &auth_str, &auth_len, verf_str, &verf_len, key); + nmp = VFSTONFS(vp->v_mount); + if (!error && !nmp) + error = ENXIO; if (error) { FSDBG_BOT(531, 2, vp, error, rep); - _FREE_ZONE((caddr_t)rep, + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); m_freem(mrest); return (error); @@ -1271,7 +1356,7 @@ /* * For stream protocols, insert a Sun RPC Record Mark. */ - if (nmp->nm_sotype == SOCK_STREAM) { + if (nmsotype == SOCK_STREAM) { M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); *mtod(m, u_long *) = htonl(0x80000000 | (m->m_pkthdr.len - NFSX_UNSIGNED)); @@ -1279,7 +1364,8 @@ rep->r_mreq = m; rep->r_xid = xid; tryagain: - if (nmp->nm_flag & NFSMNT_SOFT) + nmp = VFSTONFS(vp->v_mount); + if (nmp && (nmp->nm_flag & NFSMNT_SOFT)) rep->r_retry = nmp->nm_retry; else rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ @@ -1302,19 +1388,22 @@ TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain); /* Get send time for nqnfs */ - reqtime = time.tv_sec; + microtime(&now); + reqtime = now.tv_sec; /* * If backing off another request or avoiding congestion, don't * send this one now but let timer do it. If not timing a request, * do it now. 
*/ - if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || + if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || (nmp->nm_flag & NFSMNT_DUMBTIMR) || nmp->nm_sent < nmp->nm_cwnd)) { + int connrequired = (nmp->nm_soflags & PR_CONNREQUIRED); + splx(s); - if (nmp->nm_soflags & PR_CONNREQUIRED) - error = nfs_sndlock(&nmp->nm_flag, rep); + if (connrequired) + error = nfs_sndlock(rep); /* * Set the R_SENT before doing the send in case another thread @@ -1328,13 +1417,15 @@ rep->r_flags |= R_SENT; } - m = m_copym(m, 0, M_COPYALL, M_WAIT); - error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep); - if (nmp->nm_soflags & PR_CONNREQUIRED) - nfs_sndunlock(&nmp->nm_flag); + m2 = m_copym(m, 0, M_COPYALL, M_WAIT); + error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep); + if (connrequired) + nfs_sndunlock(rep); } + nmp = VFSTONFS(vp->v_mount); if (error) { - nmp->nm_sent -= NFS_CWNDSCALE; + if (nmp) + nmp->nm_sent -= NFS_CWNDSCALE; rep->r_flags &= ~R_SENT; } } else { @@ -1351,39 +1442,35 @@ /* * RPC done, unlink the request. */ - s = splsoftclock(); - for (rp = nfs_reqq.tqh_first; rp; - rp = rp->r_chain.tqe_next) - if (rp == rep && rp->r_xid == xid) - break; - if (!rp) - panic("nfs_request race, rep %x xid %x", rep, xid); - TAILQ_REMOVE(&nfs_reqq, rep, r_chain); - splx(s); + nfs_repdequeue(rep); + + nmp = VFSTONFS(vp->v_mount); /* * Decrement the outstanding request count. */ if (rep->r_flags & R_SENT) { - FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd); rep->r_flags &= ~R_SENT; /* paranoia */ - nmp->nm_sent -= NFS_CWNDSCALE; + if (nmp) { + FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd); + nmp->nm_sent -= NFS_CWNDSCALE; + } } /* * If there was a successful reply and a tprintf msg. * tprintf a response. 
*/ - if (!error && (rep->r_flags & R_TPRINTFMSG)) - nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, - "is alive again"); + nfs_up(rep, "is alive again", error); mrep = rep->r_mrep; md = rep->r_md; dpos = rep->r_dpos; + if (!error && !nmp) + error = ENXIO; if (error) { m_freem(rep->r_mreq); FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - _FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1408,7 +1495,7 @@ m_freem(mrep); m_freem(rep->r_mreq); FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - _FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1433,16 +1520,21 @@ error == NFSERR_TRYLATER) { m_freem(mrep); error = 0; - waituntil = time.tv_sec + trylater_delay; + microuptime(&now); + waituntil = now.tv_sec + trylater_delay; NFS_DPF(DUP, ("nfs_request %s flag=%x trylater_cnt=%x waituntil=%lx trylater_delay=%x\n", nmp->nm_mountp->mnt_stat.f_mntfromname, nmp->nm_flag, trylater_cnt, waituntil, trylater_delay)); - while (time.tv_sec < waituntil) + while (now.tv_sec < waituntil) { (void)tsleep((caddr_t)&lbolt, PSOCK, "nqnfstry", 0); - trylater_delay *= nfs_backoff[trylater_cnt]; + microuptime(&now); + } + trylater_delay *= 2; + if (trylater_delay > 60) + trylater_delay = 60; if (trylater_cnt < 7) trylater_cnt++; goto tryagain; @@ -1463,7 +1555,7 @@ m_freem(mrep); m_freem(rep->r_mreq); FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - _FREE_ZONE((caddr_t)rep, + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1479,7 +1571,8 @@ nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED); cachable = fxdr_unsigned(int, *tl++); reqtime += fxdr_unsigned(int, *tl++); - if (reqtime > time.tv_sec) { + microtime(&now); + if (reqtime > now.tv_sec) { fxdr_hyper(tl, &frev); nqnfs_clientlease(nmp, np, nqlflag, cachable, reqtime, frev); @@ -1499,7 +1592,7 @@ nfsmout: m_freem(rep->r_mreq); 
FSDBG_BOT(531, error, rep->r_xid, nmp, rep); - _FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); + FREE_ZONE((caddr_t)rep, sizeof (struct nfsreq), M_NFSREQ); return (error); } @@ -1670,6 +1763,7 @@ static void nfs_softterm(struct nfsreq *rep) { + rep->r_flags |= R_SOFTTERM; if (rep->r_flags & R_SENT) { FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent, @@ -1690,6 +1784,63 @@ } /* + * Ensure rep isn't in use by the timer, then dequeue it. + */ +void +nfs_repdequeue(struct nfsreq *rep) +{ + int s; + + while ((rep->r_flags & R_BUSY)) { + rep->r_flags |= R_WAITING; + tsleep(rep, PSOCK, "repdeq", 0); + } + s = splsoftclock(); + TAILQ_REMOVE(&nfs_reqq, rep, r_chain); + splx(s); +} + +/* + * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not + * free()'d out from under it. + */ +void +nfs_repbusy(struct nfsreq *rep) +{ + + if ((rep->r_flags & R_BUSY)) + panic("rep locked"); + rep->r_flags |= R_BUSY; +} + +/* + * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied. + */ +struct nfsreq * +nfs_repnext(struct nfsreq *rep) +{ + struct nfsreq * nextrep; + + if (rep == NULL) + return (NULL); + /* + * We need to get and busy the next req before signalling the + * current one, otherwise wakeup() may block us and we'll race to + * grab the next req. + */ + nextrep = TAILQ_NEXT(rep, r_chain); + if (nextrep != NULL) + nfs_repbusy(nextrep); + /* unbusy and signal. 
*/ + rep->r_flags &= ~R_BUSY; + if ((rep->r_flags & R_WAITING)) { + rep->r_flags &= ~R_WAITING; + wakeup(rep); + } + return (nextrep); +} + +/* * Nfs timer routine * Scan the nfsreq list and retranmit any requests that have timed out * To avoid retransmission attempts on STREAM sockets (in the future) make @@ -1699,7 +1850,7 @@ nfs_timer(arg) void *arg; /* never used */ { - register struct nfsreq *rep, *rp; + register struct nfsreq *rep; register struct mbuf *m; register struct socket *so; register struct nfsmount *nmp; @@ -1715,17 +1866,16 @@ #endif int flags, rexmit, cwnd, sent; u_long xid; + struct timeval now; s = splnet(); /* * XXX If preemptable threads are implemented the spls used for the * outstanding request queue must be replaced with mutexes. */ -rescan: #ifdef NFSTRACESUSPENDERS if (NFSTRACE_SUSPENDING) { - for (rep = nfs_reqq.tqh_first; rep != 0; - rep = rep->r_chain.tqe_next) + TAILQ_FOREACH(rep, &nfs_reqq, r_chain) if (rep->r_xid == nfstracexid) break; if (!rep) { @@ -1735,7 +1885,11 @@ } } #endif - for (rep = nfs_reqq.tqh_first; rep != 0; rep = rep->r_chain.tqe_next) { + rep = TAILQ_FIRST(&nfs_reqq); + if (rep != NULL) + nfs_repbusy(rep); + microuptime(&now); + for ( ; rep != NULL ; rep = nfs_repnext(rep)) { #ifdef NFSTRACESUSPENDERS if (rep->r_mrep && !NFSTRACE_SUSPENDING) { nfstracexid = rep->r_xid; @@ -1747,9 +1901,13 @@ continue; if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) continue; - if (nfs_sigintr(nmp, rep, rep->r_procp)) { - nfs_softterm(rep); + if (nfs_sigintr(nmp, rep, rep->r_procp)) continue; + if (nmp->nm_tprintf_initial_delay != 0 && + (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) && + rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) { + rep->r_lastmsg = now.tv_sec; + nfs_down(rep, "not responding", 0); } if (rep->r_rtt >= 0) { rep->r_rtt++; @@ -1768,15 +1926,10 @@ nmp->nm_timeouts++; } /* - * Check for server not responding + * Check for too many retransmits. 
This is never true for + * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1 + * and never allow r_rexmit to be more than NFS_MAXREXMIT. */ - if ((rep->r_flags & R_TPRINTFMSG) == 0 && - rep->r_rexmit > nmp->nm_deadthresh) { - nfs_msg(rep->r_procp, - nmp->nm_mountp->mnt_stat.f_mntfromname, - "not responding"); - rep->r_flags |= R_TPRINTFMSG; - } if (rep->r_rexmit >= rep->r_retry) { /* too many */ nfsstats.rpctimeouts++; nfs_softterm(rep); @@ -1857,29 +2010,11 @@ thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); FSDBG(535, xid, error, sent, cwnd); - /* - * This is to fix "nfs_sigintr" DSI panics. - * We may have slept during the send so the current - * place in the request queue may have been released. - * Due to zone_gc it may even be part of an - * unrelated newly allocated data structure. - * Restart the list scan from the top if needed... - */ - for (rp = nfs_reqq.tqh_first; rp; - rp = rp->r_chain.tqe_next) - if (rp == rep && rp->r_xid == xid) - break; - if (!rp) { - if (!error) - goto rescan; - panic("nfs_timer: race error %d xid 0x%x\n", - error, xid); - } if (error) { if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) so->so_error = 0; - rep->r_flags = flags; + rep->r_flags = flags | R_RESENDERR; rep->r_rexmit = rexmit; nmp->nm_cwnd = cwnd; nmp->nm_sent = sent; @@ -1893,8 +2028,9 @@ /* * Call the nqnfs server timer once a second to handle leases. */ - if (lasttime != time.tv_sec) { - lasttime = time.tv_sec; + microuptime(&now); + if (lasttime != now.tv_sec) { + lasttime = now.tv_sec; nqnfs_serverd(); } @@ -1902,10 +2038,10 @@ * Scan the write gathering queues for writes that need to be * completed now. 
*/ - cur_usec = (u_quad_t)time.tv_sec * 1000000 + (u_quad_t)time.tv_usec; - for (slp = nfssvc_sockhead.tqh_first; slp != 0; - slp = slp->ns_chain.tqe_next) { - if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time<=cur_usec) + cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec; + TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) { + if (LIST_FIRST(&slp->ns_tq) && + LIST_FIRST(&slp->ns_tq)->nd_time <= cur_usec) nfsrv_wakenfsd(slp); } #endif /* NFS_NOSERVER */ @@ -1917,26 +2053,82 @@ /* * Test for a termination condition pending on the process. - * This is used for NFSMNT_INT mounts. + * This is used to determine if we need to bail on a mount. + * EIO is returned if there has been a soft timeout. + * EINTR is returned if there is a signal pending that is not being ignored + * and the mount is interruptable, or if we are a thread that is in the process + * of cancellation (also SIGKILL posted). */ int nfs_sigintr(nmp, rep, p) struct nfsmount *nmp; struct nfsreq *rep; - register struct proc *p; + struct proc *p; { + struct uthread *curr_td; + sigset_t pending_sigs; + int context_good = 0; + struct nfsmount *repnmp; + + if (nmp == NULL) + return (ENXIO); + if (rep != NULL) { + repnmp = rep->r_nmp; + /* we've had a forced unmount. */ + if (repnmp == NULL) + return (ENXIO); + /* request has timed out on a 'soft' mount. */ + if (rep->r_flags & R_SOFTTERM) + return (EIO); + /* + * We're in the progress of a force unmount and there's + * been a timeout we're dead and fail IO. + */ + if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) == + (NFSSTA_FORCE|NFSSTA_TIMEO)) + return (EIO); + /* Someone is unmounting us, go soft and mark it. */ + if ((repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT)) { + repnmp->nm_flag |= NFSMNT_SOFT; + nmp->nm_state |= NFSSTA_FORCE; + } + /* + * If the mount is hung and we've requested not to hang + * on remote filesystems, then bail now. 
+ */ + if (p != NULL && (p->p_flag & P_NOREMOTEHANG) != 0 && + (repnmp->nm_state & NFSSTA_TIMEO) != 0) + return (EIO); + } + /* XXX: is this valid? this probably should be an assertion. */ + if (p == NULL) + return (0); - struct uthread *ut; - - ut = (struct uthread *)get_bsdthread_info(current_act()); - - if (rep && (rep->r_flags & R_SOFTTERM)) + /* + * XXX: Since nfs doesn't have a good shot at getting the current + * thread we take a guess. (only struct proc * are passed to VOPs) + * What we do is look at the current thread, if it belongs to the + * passed in proc pointer then we have a "good/accurate" context + * and can make an accurate guess as to what to do. + * However if we have a bad context we have to make due with what + * is in the proc struct which may not be as up to date as we'd + * like. + * This is ok because the process will call us with the correct + * context after a short timeout while waiting for a response. + */ + curr_td = (struct uthread *)get_bsdthread_info(current_act()); + if (curr_td->uu_proc == p) + context_good = 1; + if (context_good && current_thread_aborted()) return (EINTR); - if (!(nmp->nm_flag & NFSMNT_INT)) - return (0); - if (p && ut && ut->uu_siglist && - (((ut->uu_siglist & ~ut->uu_sigmask) & ~p->p_sigignore) & - NFSINT_SIGMASK)) + /* mask off thread and process blocked signals. */ + if (context_good) + pending_sigs = curr_td->uu_siglist & ~curr_td->uu_sigmask; + else + pending_sigs = p->p_siglist; + /* mask off process level and NFS ignored signals. */ + pending_sigs &= ~p->p_sigignore & NFSINT_SIGMASK; + if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0) return (EINTR); return (0); } @@ -1948,25 +2140,29 @@ * in progress when a reconnect is necessary. 
*/ int -nfs_sndlock(flagp, rep) - register int *flagp; +nfs_sndlock(rep) struct nfsreq *rep; { + register int *statep; struct proc *p; - int slpflag = 0, slptimeo = 0; + int error, slpflag = 0, slptimeo = 0; - if (rep) { - p = rep->r_procp; - if (rep->r_nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - } else - p = (struct proc *)0; - while (*flagp & NFSMNT_SNDLOCK) { - if (nfs_sigintr(rep->r_nmp, rep, p)) - return (EINTR); - *flagp |= NFSMNT_WANTSND; - (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck", - slptimeo); + if (rep->r_nmp == NULL) + return (ENXIO); + statep = &rep->r_nmp->nm_state; + + p = rep->r_procp; + if (rep->r_nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; + while (*statep & NFSSTA_SNDLOCK) { + error = nfs_sigintr(rep->r_nmp, rep, p); + if (error) + return (error); + *statep |= NFSSTA_WANTSND; + if (p != NULL && (p->p_flag & P_NOREMOTEHANG) != 0) + slptimeo = hz; + (void) tsleep((caddr_t)statep, slpflag | (PZERO - 1), + "nfsndlck", slptimeo); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; @@ -1976,9 +2172,9 @@ * nfs_sigintr and callers expect it in tact. */ if (!rep->r_nmp) - return (ECONNABORTED); /* don't have lock until out of loop */ + return (ENXIO); /* don't have lock until out of loop */ } - *flagp |= NFSMNT_SNDLOCK; + *statep |= NFSSTA_SNDLOCK; return (0); } @@ -1986,16 +2182,20 @@ * Unlock the stream socket for others. 
*/ void -nfs_sndunlock(flagp) - register int *flagp; +nfs_sndunlock(rep) + struct nfsreq *rep; { + register int *statep; - if ((*flagp & NFSMNT_SNDLOCK) == 0) + if (rep->r_nmp == NULL) + return; + statep = &rep->r_nmp->nm_state; + if ((*statep & NFSSTA_SNDLOCK) == 0) panic("nfs sndunlock"); - *flagp &= ~NFSMNT_SNDLOCK; - if (*flagp & NFSMNT_WANTSND) { - *flagp &= ~NFSMNT_WANTSND; - wakeup((caddr_t)flagp); + *statep &= ~NFSSTA_SNDLOCK; + if (*statep & NFSSTA_WANTSND) { + *statep &= ~NFSSTA_WANTSND; + wakeup((caddr_t)statep); } } @@ -2003,26 +2203,26 @@ nfs_rcvlock(rep) register struct nfsreq *rep; { - register int *flagp; - int slpflag, slptimeo = 0; + register int *statep; + int error, slpflag, slptimeo = 0; /* make sure we still have our mountpoint */ if (!rep->r_nmp) { if (rep->r_mrep != NULL) return (EALREADY); - return (ECONNABORTED); + return (ENXIO); } - flagp = &rep->r_nmp->nm_flag; - FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *flagp); - if (*flagp & NFSMNT_INT) + statep = &rep->r_nmp->nm_state; + FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep); + if (rep->r_nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; else slpflag = 0; - while (*flagp & NFSMNT_RCVLOCK) { - if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp)) { + while (*statep & NFSSTA_RCVLOCK) { + if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) { FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100); - return (EINTR); + return (error); } else if (rep->r_mrep != NULL) { /* * Don't bother sleeping if reply already arrived @@ -2031,9 +2231,16 @@ return (EALREADY); } FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102); - *flagp |= NFSMNT_WANTRCV; - (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk", - slptimeo); + *statep |= NFSSTA_WANTRCV; + /* + * We need to poll if we're P_NOREMOTEHANG so that we + * call nfs_sigintr periodically above. 
+ */ + if (rep->r_procp != NULL && + (rep->r_procp->p_flag & P_NOREMOTEHANG) != 0) + slptimeo = hz; + (void) tsleep((caddr_t)statep, slpflag | (PZERO - 1), + "nfsrcvlk", slptimeo); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; @@ -2044,15 +2251,15 @@ */ if (!rep->r_nmp) { FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103); - return (ECONNABORTED); /* don't have lock until out of loop */ + return (ENXIO); /* don't have lock until out of loop */ } } /* * nfs_reply will handle it if reply already arrived. * (We may have slept or been preempted while on network funnel). */ - FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *flagp); - *flagp |= NFSMNT_RCVLOCK; + FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep); + *statep |= NFSSTA_RCVLOCK; return (0); } @@ -2060,17 +2267,22 @@ * Unlock the stream socket for others. */ static void -nfs_rcvunlock(flagp) - register int *flagp; +nfs_rcvunlock(rep) + register struct nfsreq *rep; { + register int *statep; + + if (rep->r_nmp == NULL) + return; + statep = &rep->r_nmp->nm_state; - FSDBG(533, flagp, *flagp, 0, 0); - if ((*flagp & NFSMNT_RCVLOCK) == 0) + FSDBG(533, statep, *statep, 0, 0); + if ((*statep & NFSSTA_RCVLOCK) == 0) panic("nfs rcvunlock"); - *flagp &= ~NFSMNT_RCVLOCK; - if (*flagp & NFSMNT_WANTRCV) { - *flagp &= ~NFSMNT_WANTRCV; - wakeup((caddr_t)flagp); + *statep &= ~NFSSTA_RCVLOCK; + if (*statep & NFSSTA_WANTRCV) { + *statep &= ~NFSSTA_WANTRCV; + wakeup((caddr_t)statep); } } @@ -2083,7 +2295,7 @@ * be called with M_WAIT from an nfsd. */ /* - * Needs to eun under network funnel + * Needs to run under network funnel */ void nfsrv_rcv(so, arg, waitflag) @@ -2096,7 +2308,7 @@ struct mbuf *mp, *mhck; struct sockaddr *nam=0; struct uio auio; - int flags, error; + int flags, ns_nflag=0, error; struct sockaddr_in *sin; if ((slp->ns_flag & SLP_VALID) == 0) @@ -2106,7 +2318,8 @@ * Define this to test for nfsds handling this under heavy load. 
*/ if (waitflag == M_DONTWAIT) { - slp->ns_flag |= SLP_NEEDQ; goto dorecs; + ns_nflag = SLPN_NEEDQ; + goto dorecs; } #endif auio.uio_procp = NULL; @@ -2117,7 +2330,7 @@ * the nfs servers are heavily loaded. */ if (slp->ns_rec && waitflag == M_DONTWAIT) { - slp->ns_flag |= SLP_NEEDQ; + ns_nflag = SLPN_NEEDQ; goto dorecs; } @@ -2129,9 +2342,9 @@ error = soreceive(so, (struct sockaddr **) 0, &auio, &mp, (struct mbuf **)0, &flags); if (error || mp == (struct mbuf *)0) { if (error == EWOULDBLOCK) - slp->ns_flag |= SLP_NEEDQ; + ns_nflag = SLPN_NEEDQ; else - slp->ns_flag |= SLP_DISCONN; + ns_nflag = SLPN_DISCONN; goto dorecs; } m = mp; @@ -2152,9 +2365,9 @@ error = nfsrv_getstream(slp, waitflag); if (error) { if (error == EPERM) - slp->ns_flag |= SLP_DISCONN; + ns_nflag = SLPN_DISCONN; else - slp->ns_flag |= SLP_NEEDQ; + ns_nflag = SLPN_NEEDQ; } } else { do { @@ -2187,7 +2400,7 @@ if (error) { if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && error != EWOULDBLOCK) { - slp->ns_flag |= SLP_DISCONN; + ns_nflag = SLPN_DISCONN; goto dorecs; } } @@ -2198,8 +2411,10 @@ * Now try and process the request records, non-blocking. 
*/ dorecs: + if (ns_nflag) + slp->ns_nflag |= ns_nflag; if (waitflag == M_DONTWAIT && - (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) { + (slp->ns_rec || (slp->ns_nflag & (SLPN_NEEDQ | SLPN_DISCONN)))) { thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); nfsrv_wakenfsd(slp); thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); @@ -2222,13 +2437,13 @@ struct mbuf *om, *m2, *recm; u_long recmark; - if (slp->ns_flag & SLP_GETSTREAM) + if (slp->ns_nflag & SLPN_GETSTREAM) panic("nfs getstream"); - slp->ns_flag |= SLP_GETSTREAM; + slp->ns_nflag |= SLPN_GETSTREAM; for (;;) { if (slp->ns_reclen == 0) { if (slp->ns_cc < NFSX_UNSIGNED) { - slp->ns_flag &= ~SLP_GETSTREAM; + slp->ns_nflag &= ~SLPN_GETSTREAM; return (0); } m = slp->ns_raw; @@ -2253,11 +2468,11 @@ recmark = ntohl(recmark); slp->ns_reclen = recmark & ~0x80000000; if (recmark & 0x80000000) - slp->ns_flag |= SLP_LASTFRAG; + slp->ns_nflag |= SLPN_LASTFRAG; else - slp->ns_flag &= ~SLP_LASTFRAG; + slp->ns_nflag &= ~SLPN_LASTFRAG; if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) { - slp->ns_flag &= ~SLP_GETSTREAM; + slp->ns_nflag &= ~SLPN_GETSTREAM; return (EPERM); } } @@ -2291,7 +2506,7 @@ m->m_len -= slp->ns_reclen - len; len = slp->ns_reclen; } else { - slp->ns_flag &= ~SLP_GETSTREAM; + slp->ns_nflag &= ~SLPN_GETSTREAM; return (EWOULDBLOCK); } } else if ((len + m->m_len) == slp->ns_reclen) { @@ -2310,7 +2525,7 @@ slp->ns_cc -= len; slp->ns_reclen = 0; } else { - slp->ns_flag &= ~SLP_GETSTREAM; + slp->ns_nflag &= ~SLPN_GETSTREAM; return (0); } @@ -2321,7 +2536,7 @@ while (*mpp) mpp = &((*mpp)->m_next); *mpp = recm; - if (slp->ns_flag & SLP_LASTFRAG) { + if (slp->ns_nflag & SLPN_LASTFRAG) { if (slp->ns_recend) slp->ns_recend->m_nextpkt = slp->ns_frag; else @@ -2368,8 +2583,9 @@ nd->nd_dpos = mtod(m, caddr_t); error = nfs_getreq(nd, nfsd, TRUE); if (error) { - m_freem(nam); - _FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); + if (nam) + m_freem(nam); + FREE_ZONE((caddr_t)nd, 
sizeof *nd, M_NFSRVDESC); return (error); } *ndp = nd; @@ -2399,7 +2615,7 @@ int error = 0, nqnfs = 0, ticklen; struct mbuf *mrep, *md; register struct nfsuid *nuidp; - struct timeval tvin, tvout; + struct timeval tvin, tvout, now; #if 0 /* until encrypted keys are implemented */ NFSKERBKEYSCHED_T keys; /* stores key schedule */ #endif @@ -2585,7 +2801,8 @@ tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec); tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec); - if (nuidp->nu_expire < time.tv_sec || + microtime(&now); + if (nuidp->nu_expire < now.tv_sec || nuidp->nu_timestamp.tv_sec > tvout.tv_sec || (nuidp->nu_timestamp.tv_sec == tvout.tv_sec && nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) { @@ -2637,7 +2854,7 @@ if ((slp->ns_flag & SLP_VALID) == 0) return; - for (nd = nfsd_head.tqh_first; nd != 0; nd = nd->nfsd_chain.tqe_next) { + TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) { if (nd->nfsd_flag & NFSD_WAITING) { nd->nfsd_flag &= ~NFSD_WAITING; if (nd->nfsd_slp) @@ -2654,9 +2871,10 @@ #endif /* NFS_NOSERVER */ static int -nfs_msg(p, server, msg) +nfs_msg(p, server, msg, error) struct proc *p; - char *server, *msg; + const char *server, *msg; + int error; { tpr_t tpr; @@ -2664,7 +2882,50 @@ tpr = tprintf_open(p); else tpr = NULL; - tprintf(tpr, "nfs server %s: %s\n", server, msg); + if (error) + tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg, + error); + else + tprintf(tpr, "nfs server %s: %s\n", server, msg); tprintf_close(tpr); return (0); +} + +static void +nfs_down(rep, msg, error) + struct nfsreq *rep; + const char *msg; + int error; +{ + int dosignal; + + if (rep == NULL || rep->r_nmp == NULL) + return; + if (!(rep->r_nmp->nm_state & NFSSTA_TIMEO)) { + vfs_event_signal(&rep->r_nmp->nm_mountp->mnt_stat.f_fsid, + VQ_NOTRESP, 0); + rep->r_nmp->nm_state |= NFSSTA_TIMEO; + } + rep->r_flags |= R_TPRINTFMSG; + nfs_msg(rep->r_procp, rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, + msg, error); +} + +static void +nfs_up(rep, msg, error) + struct nfsreq *rep; + 
const char *msg; + int error; +{ + + if (error != 0 || rep == NULL || rep->r_nmp == NULL) + return; + if ((rep->r_flags & R_TPRINTFMSG) != 0) + nfs_msg(rep->r_procp, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0); + if ((rep->r_nmp->nm_state & NFSSTA_TIMEO)) { + rep->r_nmp->nm_state &= ~NFSSTA_TIMEO; + vfs_event_signal(&rep->r_nmp->nm_mountp->mnt_stat.f_fsid, + VQ_NOTRESP, 1); + } } diff -urN xnu-344.49/bsd/nfs/nfs_subs.c xnu-517/bsd/nfs/nfs_subs.c --- xnu-344.49/bsd/nfs/nfs_subs.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_subs.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -109,6 +110,9 @@ #include +SYSCTL_DECL(_vfs_generic); +SYSCTL_NODE(_vfs_generic, OID_AUTO, nfs, CTLFLAG_RW, 0, "nfs hinge"); + #define FSDBG(A, B, C, D, E) \ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \ (int)(B), (int)(C), (int)(D), (int)(E), 0) @@ -589,15 +593,9 @@ extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; -struct getfh_args; -extern int getfh(struct proc *, struct getfh_args *, int *); -struct nfssvc_args; -extern int nfssvc(struct proc *, struct nfssvc_args *, int *); LIST_HEAD(nfsnodehashhead, nfsnode); -int nfs_webnamei __P((struct nameidata *, struct vnode *, struct proc *)); - /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. 
@@ -628,7 +626,7 @@ */ if (vp) { nmp = VFSTONFS(vp->v_mount); - if (nmp->nm_flag & NFSMNT_NQNFS) { + if (nmp && (nmp->nm_flag & NFSMNT_NQNFS)) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); @@ -696,7 +694,6 @@ /* * derive initial xid from system time - * XXX time is invalid if root not yet mounted */ if (!base && (rootvp)) { microtime(&tv); @@ -1182,6 +1179,7 @@ nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = (struct nfsmount *)0; } + nfs_nbinit(); /* Init the nfsbuf table */ nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ @@ -1219,13 +1217,6 @@ lease_updatetime = nfs_lease_updatetime; #endif vfsp->vfc_refcount++; /* make us non-unloadable */ - sysent[SYS_nfssvc].sy_narg = 2; - sysent[SYS_nfssvc].sy_call = nfssvc; -#ifndef NFS_NOSERVER - sysent[SYS_getfh].sy_narg = 2; - sysent[SYS_getfh].sy_call = getfh; -#endif - return (0); } @@ -1263,18 +1254,15 @@ enum vtype vtyp; u_short vmode; struct timespec mtime; + struct timeval now; struct vnode *nvp; int v3; FSDBG_TOP(527, vp, 0, *xidp >> 32, *xidp); - /* - * this routine is a good place to check for VBAD again. We caught - * most of them in nfsm_request, but postprocessing may indirectly get - * here, so check again. - */ - if (vp->v_type == VBAD) { - FSDBG_BOT(527, EINVAL, 1, 0, *xidp); - return (EINVAL); + + if (!VFSTONFS(vp->v_mount)) { + FSDBG_BOT(527, ENXIO, 1, 0, *xidp); + return (ENXIO); } v3 = NFS_ISV3(vp); @@ -1333,7 +1321,7 @@ * information. */ np = VTONFS(vp); -if (*xidp < np->n_xid) { + if (*xidp < np->n_xid) { /* * We have already updated attributes with a response from * a later request. 
The attributes we have here are probably @@ -1352,12 +1340,6 @@ if (vp->v_type != vtyp) { vp->v_type = vtyp; - if (UBCINFOMISSING(vp) || UBCINFORECLAIMED(vp)) - if ((error = ubc_info_init(vp))) { /* VREG */ - FSDBG_BOT(527, error, 3, 0, *xidp); - return(error); - } - if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } @@ -1399,7 +1381,7 @@ vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); - vap->va_blocksize = NFS_FABLKSIZE; + vap->va_blocksize = 16*1024; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); @@ -1422,7 +1404,21 @@ vap->va_filerev = 0; } - np->n_attrstamp = time.tv_sec; + microuptime(&now); + np->n_attrstamp = now.tv_sec; + + if (UBCINFOMISSING(vp) || UBCINFORECLAIMED(vp)) { + if (UBCINFORECLAIMED(vp) && ISSET(vp->v_flag, (VXLOCK|VORECLAIM))) { + // vnode is being vclean'ed, abort + FSDBG_BOT(527, ENXIO, 1, 0, *xidp); + return (ENXIO); + } + if ((error = ubc_info_init(vp))) { /* VREG */ + FSDBG_BOT(527, error, 3, 0, *xidp); + return(error); + } + } + if (vap->va_size != np->n_size) { FSDBG(527, vp, vap->va_size, np->n_size, (vap->va_type == VREG) | @@ -1442,8 +1438,9 @@ dontshrink && np->n_size < ubc_getsize(vp)) { vap->va_size = np->n_size = orig_size; np->n_attrstamp = 0; - } else + } else { ubc_setsize(vp, (off_t)np->n_size); /* XXX */ + } } else np->n_size = vap->va_size; } @@ -1473,8 +1470,25 @@ { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; + struct timeval now, nowup; + int32_t timeo; + + /* Set attribute timeout based on how recently the file has been modified. 
*/ + if ((np)->n_flag & NMODIFIED) + timeo = NFS_MINATTRTIMO; + else { + /* Note that if the client and server clocks are way out of sync, */ + /* timeout will probably get clamped to a min or max value */ + microtime(&now); + timeo = (now.tv_sec - (np)->n_mtime) / 10; + if (timeo < NFS_MINATTRTIMO) + timeo = NFS_MINATTRTIMO; + else if (timeo > NFS_MAXATTRTIMO) + timeo = NFS_MAXATTRTIMO; + } - if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { + microuptime(&nowup); + if ((nowup.tv_sec - np->n_attrstamp) >= timeo) { FSDBG(528, vp, 0, 0, 1); nfsstats.attrcache_misses++; return (ENOENT); @@ -1542,10 +1556,15 @@ int error, rdonly, linklen; struct componentname *cnp = &ndp->ni_cnd; int olen = len; + char *tmppn; *retdirp = (struct vnode *)0; - MALLOC_ZONE(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); - cnp->cn_pnlen = len + 1; + + if (len > MAXPATHLEN - 1) + return (ENAMETOOLONG); + + MALLOC_ZONE(cnp->cn_pnbuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + cnp->cn_pnlen = MAXPATHLEN; /* * Copy the name from the mbuf list to ndp->ni_pnbuf @@ -1609,14 +1628,16 @@ *retdirp = dp; /* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -/* XXX debo 12/15/97 Need to fix M_NAMEI allocations to use zone protocol */ #ifdef notyet if (pubflag) { /* * Oh joy. For WebNFS, handle those pesky '%' escapes, * and the 'native path' indicator. 
*/ - MALLOC(cp, char *, olen + 1, M_NAMEI, M_WAITOK); + + assert(olen <= MAXPATHLEN - 1); + + MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); fromcp = cnp->cn_pnbuf; tocp = cp; if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { @@ -1634,7 +1655,7 @@ */ default: error = EIO; - FREE(cp, M_NAMEI); + FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); goto out; } } @@ -1650,15 +1671,20 @@ continue; } else { error = ENOENT; - FREE(cp, M_NAMEI); + FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); goto out; } } else *tocp++ = *fromcp++; } *tocp = '\0'; - FREE(cnp->cn_pnbuf, M_NAMEI); + + tmppn = cnp->cn_pnbuf; + long len = cnp->cn_pnlen; cnp->cn_pnbuf = cp; + cnp->cn_pnlen = MAXPATHLEN; + FREE_ZONE(tmppn, len, M_NAMEI); + } #endif @@ -1714,7 +1740,6 @@ error = EINVAL; break; /* XXX CSM 12/4/97 Revisit when enabling WebNFS */ -/* XXX debo 12/15/97 Need to fix M_NAMEI allocations to use zone protocol */ #ifdef notyet } @@ -1722,8 +1747,9 @@ error = ELOOP; break; } + /* XXX assert(olen <= MAXPATHLEN - 1); */ if (ndp->ni_pathlen > 1) - MALLOC(cp, char *, olen + 1, M_NAMEI, M_WAITOK); + MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; @@ -1737,9 +1763,9 @@ auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { - badlink: +badlink: if (ndp->ni_pathlen > 1) - FREE(cp, M_NAMEI); + FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); break; } linklen = MAXPATHLEN - auio.uio_resid; @@ -1752,9 +1778,12 @@ goto badlink; } if (ndp->ni_pathlen > 1) { - bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); - FREE(cnp->cn_pnbuf, M_NAMEI); + long len = cnp->cn_pnlen; + tmppn = cnp->cn_pnbuf; cnp->cn_pnbuf = cp; + cnp->cn_pnlen = olen + 1; + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + FREE_ZONE(tmppn, len, M_NAMEI); } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; @@ -1772,7 +1801,11 @@ } } out: - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); + tmppn = cnp->cn_pnbuf; + cnp->cn_pnbuf = NULL; + 
cnp->cn_flags &= ~HASBUF; + FREE_ZONE(tmppn, cnp->cn_pnlen, M_NAMEI); + return (error); } @@ -2162,8 +2195,8 @@ /* * The write verifier has changed (probably due to a server reboot), so all - * B_NEEDCOMMIT blocks will have to be written again. Since they are on the - * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT + * NB_NEEDCOMMIT blocks will have to be written again. Since they are on the + * dirty block list as NB_DELWRI, all this takes is clearing the NB_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void @@ -2171,7 +2204,8 @@ struct mount *mp; { register struct vnode *vp, *nvp; - register struct buf *bp, *nbp; + register struct nfsbuf *bp, *nbp; + struct nfsnode *np; int s; s = splbio(); @@ -2180,11 +2214,15 @@ if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) - == (B_DELWRI | B_NEEDCOMMIT)) - bp->b_flags &= ~B_NEEDCOMMIT; + np = VTONFS(vp); + for (bp = np->n_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->nb_vnbufs.le_next; + if ((bp->nb_flags & (NB_BUSY | NB_DELWRI | NB_NEEDCOMMIT)) + == (NB_DELWRI | NB_NEEDCOMMIT)) { + bp->nb_flags &= ~NB_NEEDCOMMIT; + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + } } } splx(s); diff -urN xnu-344.49/bsd/nfs/nfs_syscalls.c xnu-517/bsd/nfs/nfs_syscalls.c --- xnu-344.49/bsd/nfs/nfs_syscalls.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_syscalls.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -76,15 +76,17 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include #include +#include +#include #include #include #include @@ -104,7 +106,7 @@ #include #include #include - +#include /* Global defs. */ extern int (*nfsrv3_procs[NFS_NPROCS]) __P((struct nfsrv_descript *nd, @@ -112,6 +114,7 @@ struct proc *procp, struct mbuf **mreqp)); extern int nfs_numasync; +extern int nfs_ioddelwri; extern time_t nqnfsstarttime; extern int nqsrv_writeslack; extern int nfsrtton; @@ -179,7 +182,7 @@ error = suser(p->p_ucred, &p->p_acflag); if(error) return (error); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, p); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_USERSPACE, uap->fname, p); error = namei(&nd); if (error) return (error); @@ -195,6 +198,188 @@ } #endif /* NFS_NOSERVER */ + +/* + * syscall for the rpc.lockd to use to translate a NFS file handle into + * an open descriptor. + * + * warning: do not remove the suser() call or this becomes one giant + * security hole. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhopen_args { + const struct fhandle *u_fhp; + int flags; +}; +#endif +int +fhopen(p, uap, retval) + struct proc *p; + register struct fhopen_args *uap; + register_t *retval; +{ + struct mount *mp; + struct vnode *vp; + struct fhandle fhp; + struct vattr vat; + struct vattr *vap = &vat; + struct flock lf; + struct file *fp; + register struct filedesc *fdp = p->p_fd; + int fmode, mode, error, type; + struct file *nfp; + int indx; + struct ucred *credanon; + int exflags; + struct ucred *cred = p->p_ucred; + int didhold = 0; + extern struct fileops vnops; + + /* + * Must be super user + */ + error = suser(cred, &p->p_acflag); + if (error) + return (error); + + fmode = FFLAGS(uap->flags); + /* why not allow a non-read/write open for our lockd? 
*/ + if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) + return (EINVAL); + error = copyin((void*)uap->u_fhp, &fhp, sizeof(fhp)); + if (error) + return (error); + /* find the mount point */ + mp = vfs_getvfs(&fhp.fh_fsid); + if (mp == NULL) + return (ESTALE); + /* now give me my vnode, it gets returned to me locked */ +/* XXX CSM need to split VFS_CHECKEXP out of VFS_FHTOVP? */ + error = VFS_FHTOVP(mp, &fhp.fh_fid, NULL, &vp, &exflags, &credanon); + if (error) + return (error); + /* + * from now on we have to make sure not + * to forget about the vnode + * any error that causes an abort must vput(vp) + * just set error = err and 'goto bad;'. + */ + + /* + * from vn_open + */ + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + + if (UBCINFOEXISTS(vp) && ((didhold = ubc_hold(vp)) == 0)) { + error = ENOENT; + goto bad; + } + + if (fmode & FREAD && fmode & (FWRITE | O_TRUNC)) { + int err = 0; + if (vp->v_type == VDIR) + err = EISDIR; + else + err = vn_writechk(vp); + if (err && !(error = VOP_ACCESS(vp, VREAD, cred, p))) + error = err; + if (error || (error = VOP_ACCESS(vp, VREAD|VWRITE, cred, p))) + goto bad; + } else if (fmode & FREAD) { + if ((error = VOP_ACCESS(vp, VREAD, cred, p))) + goto bad; + } else if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + if ((error = vn_writechk(vp)) || + (error = VOP_ACCESS(vp, VWRITE, cred, p))) + goto bad; + } + if (fmode & O_TRUNC) { + VOP_UNLOCK(vp, 0, p); /* XXX */ + VOP_LEASE(vp, p, cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ + VATTR_NULL(vap); + vap->va_size = 0; + error = VOP_SETATTR(vp, vap, cred, p); + if (error) + goto bad; + } + + error = VOP_OPEN(vp, fmode, cred, p); + if (error) + goto bad; + + if (fmode & FWRITE) + if (++vp->v_writecount <= 0) + panic("fhopen: v_writecount"); + /* + * end of vn_open code + */ + + if ((error = falloc(p, &nfp, &indx)) != 0) { + if (fmode & FWRITE) + vp->v_writecount--; + goto bad; + } 
+ fp = nfp; + + /* + * Hold an extra reference to avoid having fp ripped out + * from under us while we block in the lock op + */ + fref(fp); + nfp->f_data = (caddr_t)vp; + nfp->f_flag = fmode & FMASK; + nfp->f_ops = &vnops; + nfp->f_type = DTYPE_VNODE; + if (fmode & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (fmode & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((fmode & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, p); + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) { + (void) vn_close(vp, fp->f_flag, fp->f_cred, p); + ffree(fp); + fdrelse(p, indx); + /* + * release our private reference + */ + frele(fp); + + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + fp->f_flag |= FHASLOCK; + } + + VOP_UNLOCK(vp, 0, p); + *fdflags(p, indx) &= ~UF_RESERVED; + frele(fp); + *retval = indx; + return (0); + +bad: + VOP_UNLOCK(vp, 0, p); + if (didhold) + ubc_rele(vp); + vrele(vp); + return (error); +} + /* * Nfs server psuedo system call for the nfsd's * Based on the flag value it either: @@ -224,6 +409,7 @@ struct nfssvc_sock *slp; struct nfsuid *nuidp; struct nfsmount *nmp; + struct timeval now; #endif /* NFS_NOSERVER */ int error; @@ -259,13 +445,10 @@ if (error) return (error); - /* disable split funnels now */ - thread_funnel_merge(kernel_flock, network_flock); - - if ((nmp->nm_flag & NFSMNT_MNTD) && + if ((nmp->nm_state & NFSSTA_MNTD) && (uap->flag & NFSSVC_GOTAUTH) == 0) return (0); - nmp->nm_flag |= NFSMNT_MNTD; + nmp->nm_state |= NFSSTA_MNTD; error = nqnfs_clientd(nmp, p->p_ucred, &ncd, uap->flag, uap->argp, p); } else if (uap->flag & NFSSVC_ADDSOCK) { @@ -292,9 +475,6 @@ if (error) return (error); - /* disable split funnels now */ - thread_funnel_merge(kernel_flock, network_flock); - if ((uap->flag & NFSSVC_AUTHIN) && ((nfsd = nsd->nsd_nfsd)) && (nfsd->nfsd_slp->ns_flag & SLP_VALID)) { slp = nfsd->nfsd_slp; @@ -327,7 +507,7 @@ nuidp 
= (struct nfsuid *)0; if ((slp->ns_flag & SLP_VALID) == 0) { if (nuidp) - _FREE_ZONE((caddr_t)nuidp, + FREE_ZONE((caddr_t)nuidp, sizeof (struct nfsuid), M_NFSUID); } else { if (nuidp == (struct nfsuid *)0) { @@ -337,14 +517,15 @@ nu_lru); if (nuidp->nu_flag & NU_NAM) m_freem(nuidp->nu_nam); - } + } nuidp->nu_flag = 0; nuidp->nu_cr = nsd->nsd_cr; if (nuidp->nu_cr.cr_ngroups > NGROUPS) nuidp->nu_cr.cr_ngroups = NGROUPS; nuidp->nu_cr.cr_ref = 1; nuidp->nu_timestamp = nsd->nsd_timestamp; - nuidp->nu_expire = time.tv_sec + nsd->nsd_ttl; + microtime(&now); + nuidp->nu_expire = now.tv_sec + nsd->nsd_ttl; /* * and save the session key in nu_key. */ @@ -430,10 +611,13 @@ } #endif /* ISO */ } + /* reserve buffer space for 2 maximally-sized packets */ + siz = NFS_MAXPACKET; if (so->so_type == SOCK_STREAM) - siz = NFS_MAXPACKET + sizeof (u_long); - else - siz = NFS_MAXPACKET; + siz += sizeof (u_long); + siz *= 2; + if (siz > NFS_MAXSOCKBUF) + siz = NFS_MAXSOCKBUF; error = soreserve(so, siz, siz); if (error) { m_freem(mynam); @@ -451,6 +635,7 @@ int val; bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_KEEPALIVE; sopt.sopt_val = &val; @@ -464,6 +649,7 @@ int val; bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = &val; @@ -495,8 +681,9 @@ so->so_upcallarg = (caddr_t)slp; so->so_upcall = nfsrv_rcv; so->so_rcv.sb_flags |= SB_UPCALL; /* required for freebsd merge */ + slp->ns_nflag = SLPN_NEEDQ; thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - slp->ns_flag = (SLP_VALID | SLP_NEEDQ); + slp->ns_flag = SLP_VALID; nfsrv_wakenfsd(slp); splx(s); return (0); @@ -516,14 +703,13 @@ register int siz; register struct nfssvc_sock *slp; register struct socket *so; - register int *solockp; struct nfsd *nfsd = nsd->nsd_nfsd; struct nfsrv_descript *nd = NULL; struct mbuf *mreq; int error = 0, cacherep, s, sotype, writes_todo; int procrastinate; u_quad_t 
cur_usec; - extern void nfs_aio_thread_init(); + struct timeval now; #ifndef nolint cacherep = RC_DOIT; @@ -537,7 +723,6 @@ nfsd->nfsd_procp = p; TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain); nfs_numnfsd++; - nfs_aio_thread_init(); } /* * Loop getting rpc requests until SIGKILL. @@ -572,21 +757,23 @@ if ((slp = nfsd->nfsd_slp) == (struct nfssvc_sock *)0) continue; if (slp->ns_flag & SLP_VALID) { - if (slp->ns_flag & SLP_DISCONN) + nfs_slplock(slp, 1); + if (slp->ns_nflag & SLPN_DISCONN) { nfsrv_zapsock(slp); - else if (slp->ns_flag & SLP_NEEDQ) { - slp->ns_flag &= ~SLP_NEEDQ; - (void) nfs_sndlock(&slp->ns_solock, - (struct nfsreq *)0); + thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + } else if (slp->ns_nflag & SLPN_NEEDQ) { thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); + slp->ns_nflag &= ~SLPN_NEEDQ; nfsrv_rcv(slp->ns_so, (caddr_t)slp, M_WAIT); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - nfs_sndunlock(&slp->ns_solock); - } + } else + thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); error = nfsrv_dorec(slp, nfsd, &nd); - cur_usec = (u_quad_t)time.tv_sec * 1000000 + - (u_quad_t)time.tv_usec; + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + nfs_slpunlock(slp); + microuptime(&now); + cur_usec = (u_quad_t)now.tv_sec * 1000000 + + (u_quad_t)now.tv_usec; if (error && slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time <= cur_usec) { error = 0; @@ -602,7 +789,9 @@ } if (error || (slp->ns_flag & SLP_VALID) == 0) { if (nd) { - _FREE_ZONE((caddr_t)nd, + if (nd->nd_nam2) + m_freem(nd->nd_nam2); + FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); nd = NULL; } @@ -614,12 +803,8 @@ splx(s); so = slp->ns_so; sotype = so->so_type; - if (so->so_proto->pr_flags & PR_CONNREQUIRED) - solockp = &slp->ns_solock; - else - solockp = (int *)0; if (nd) { - nd->nd_starttime = time; + microuptime(&nd->nd_starttime); if (nd->nd_nam2) nd->nd_nam = nd->nd_nam2; else @@ -648,9 +833,10 @@ * Check for just starting up for NQNFS and send * fake "try again 
later" replies to the NQNFS clients. */ - if (notstarted && nqnfsstarttime <= time.tv_sec) { + microtime(&now); + if (notstarted && nqnfsstarttime <= now.tv_sec) { if (modify_flag) { - nqnfsstarttime = time.tv_sec + nqsrv_writeslack; + nqnfsstarttime = now.tv_sec + nqsrv_writeslack; modify_flag = 0; } else notstarted = 0; @@ -672,7 +858,7 @@ } else if (nfs_privport) { /* Check if source port is privileged */ u_short port; - struct sockaddr *nam = nd->nd_nam; + struct sockaddr *nam = mtod(nd->nd_nam, struct sockaddr*); struct sockaddr_in *sin; sin = (struct sockaddr_in *)nam; @@ -713,8 +899,10 @@ if (nd->nd_procnum != NQNFSPROC_VACATED) nfsstats.srv_errs++; nfsrv_updatecache(nd, FALSE, mreq); - if (nd->nd_nam2) + if (nd->nd_nam2) { m_freem(nd->nd_nam2); + nd->nd_nam2 = NULL; + } break; } nfsstats.srvrpccnt[nd->nd_procnum]++; @@ -742,26 +930,31 @@ M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); *mtod(m, u_long *) = htonl(0x80000000 | siz); } - if (solockp) - (void) nfs_sndlock(solockp, (struct nfsreq *)0); + if (so->so_proto->pr_flags & PR_CONNREQUIRED) + (void) nfs_slplock(slp, 1); if (slp->ns_flag & SLP_VALID) error = nfs_send(so, nd->nd_nam2, m, NULL); else { error = EPIPE; m_freem(m); } + mreq = NULL; if (nfsrtton) nfsd_rt(sotype, nd, cacherep); - if (nd->nd_nam2) + if (nd->nd_nam2) { MFREE(nd->nd_nam2, m); - if (nd->nd_mrep) + nd->nd_nam2 = NULL; + } + if (nd->nd_mrep) { m_freem(nd->nd_mrep); + nd->nd_mrep = NULL; + } if (error == EPIPE) nfsrv_zapsock(slp); - if (solockp) - nfs_sndunlock(solockp); + if (so->so_proto->pr_flags & PR_CONNREQUIRED) + nfs_slpunlock(slp); if (error == EINTR || error == ERESTART) { - _FREE_ZONE((caddr_t)nd, + FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); nfsrv_slpderef(slp); s = splnet(); @@ -773,9 +966,14 @@ nfsd_rt(sotype, nd, cacherep); m_freem(nd->nd_mrep); m_freem(nd->nd_nam2); + nd->nd_mrep = nd->nd_nam2 = NULL; break; }; if (nd) { + if (nd->nd_mrep) + m_freem(nd->nd_mrep); + if (nd->nd_nam2) + m_freem(nd->nd_nam2); 
FREE_ZONE((caddr_t)nd, sizeof *nd, M_NFSRVDESC); nd = NULL; } @@ -784,8 +982,9 @@ * Check to see if there are outstanding writes that * need to be serviced. */ - cur_usec = (u_quad_t)time.tv_sec * 1000000 + - (u_quad_t)time.tv_usec; + microuptime(&now); + cur_usec = (u_quad_t)now.tv_sec * 1000000 + + (u_quad_t)now.tv_usec; s = splsoftclock(); if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time <= cur_usec) { @@ -796,11 +995,14 @@ splx(s); } while (writes_todo); s = splnet(); + thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL); if (nfsrv_dorec(slp, nfsd, &nd)) { + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); nfsd->nfsd_flag &= ~NFSD_REQINPROG; nfsd->nfsd_slp = NULL; nfsrv_slpderef(slp); - } + } else + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); } done: TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain); @@ -819,6 +1021,31 @@ SYSCTL_INT(_vfs_nfs, OID_AUTO, defect, CTLFLAG_RW, &nfs_defect, 0, ""); #endif +#ifndef _SYS_SYSPROTO_H_ +struct nfsclnt_args { + int flag; + caddr_t argp; +}; +#endif +int +nfsclnt(struct proc *p, struct nfsclnt_args *uap) +{ + struct lockd_ans la; + int error; + + if (uap->flag == NFSCLNT_LOCKDWAIT) { + return (nfslockdwait(p)); + } + if (uap->flag == NFSCLNT_LOCKDANS) { + error = copyin(uap->argp, &la, sizeof(la)); + return (error != 0 ? 
error : nfslockdans(p, &la)); + } + if (uap->flag == NFSCLNT_LOCKDFD) + return (nfslockdfd(p, (int)uap->argp)); + return EINVAL; +} + + static int nfssvc_iod_continue(int); /* @@ -830,7 +1057,6 @@ nfssvc_iod(p) struct proc *p; { - register struct buf *bp; register int i, myiod; struct nfsmount *nmp; int error = 0; @@ -850,8 +1076,7 @@ return (EBUSY); nfs_numasync++; - /* stuff myiod into uthread to get off local stack for - continuation */ + /* stuff myiod into uthread to get off local stack for continuation */ ut = (struct uthread *)get_bsdthread_info(current_act()); ut->uu_state.uu_nfs_myiod = myiod; /* squirrel away for continuation */ @@ -867,7 +1092,7 @@ static int nfssvc_iod_continue(error) { - register struct buf *bp; + register struct nfsbuf *bp; register int i, myiod; struct nfsmount *nmp; struct uthread *ut; @@ -882,12 +1107,12 @@ /* * Just loop around doin our stuff until SIGKILL - * - actually we don't loop with continuations... + * - actually we don't loop with continuations... 
*/ for (;;) { while (((nmp = nfs_iodmount[myiod]) == NULL || nmp->nm_bufq.tqh_first == NULL) - && error == 0) { + && error == 0 && nfs_ioddelwri == 0) { if (nmp) nmp->nm_bufqiods--; nfs_iodwant[myiod] = p; @@ -906,30 +1131,51 @@ error = 0; unix_syscall_return(error); } - while ((bp = nmp->nm_bufq.tqh_first) != NULL) { - /* Take one off the front of the list */ - TAILQ_REMOVE(&nmp->nm_bufq, bp, b_freelist); - nmp->nm_bufqlen--; - if (nmp->nm_bufqwant && nmp->nm_bufqlen < 2 * nfs_numasync) { - nmp->nm_bufqwant = FALSE; - wakeup(&nmp->nm_bufq); - } - if (ISSET(bp->b_flags, B_READ)) - (void) nfs_doio(bp, bp->b_rcred, (struct proc *)0); - else - (void) nfs_doio(bp, bp->b_wcred, (struct proc *)0); + if (nmp != NULL) { + while ((bp = nmp->nm_bufq.tqh_first) != NULL) { + /* Take one off the front of the list */ + TAILQ_REMOVE(&nmp->nm_bufq, bp, nb_free); + bp->nb_free.tqe_next = NFSNOLIST; + nmp->nm_bufqlen--; + if (nmp->nm_bufqwant && nmp->nm_bufqlen < 2 * nfs_numasync) { + nmp->nm_bufqwant = FALSE; + wakeup(&nmp->nm_bufq); + } + if (ISSET(bp->nb_flags, NB_READ)) + (void) nfs_doio(bp, bp->nb_rcred, (struct proc *)0); + else + (void) nfs_doio(bp, bp->nb_wcred, (struct proc *)0); - /* - * If there are more than one iod on this mount, then defect - * so that the iods can be shared out fairly between the mounts - */ - if (nfs_defect && nmp->nm_bufqiods > 1) { - NFS_DPF(ASYNCIO, - ("nfssvc_iod: iod %d defecting from mount %p\n", - myiod, nmp)); - nfs_iodmount[myiod] = NULL; - nmp->nm_bufqiods--; - break; + /* + * If there are more than one iod on this mount, then defect + * so that the iods can be shared out fairly between the mounts + */ + if (nfs_defect && nmp->nm_bufqiods > 1) { + NFS_DPF(ASYNCIO, + ("nfssvc_iod: iod %d defecting from mount %p\n", + myiod, nmp)); + nfs_iodmount[myiod] = NULL; + nmp->nm_bufqiods--; + break; + } + } + } + if (nfs_ioddelwri) { + i = 0; + nfs_ioddelwri = 0; + while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) { + struct nfsnode *np = 
VTONFS(bp->nb_vp); + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + /* put buffer at end of delwri list */ + TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); + nfsbufdelwricnt++; + nfs_flushcommits(np->n_vnode, (struct proc *)0); + } else { + SET(bp->nb_flags, (NB_BUSY | NB_ASYNC | NB_IOD)); + nfs_buf_write(bp); + } + i++; } } } @@ -954,6 +1200,7 @@ int s; slp->ns_flag &= ~SLP_ALLFLAGS; + slp->ns_nflag &= ~SLP_ALLFLAGS; fp = slp->ns_fp; if (fp) { slp->ns_fp = (struct file *)0; @@ -962,12 +1209,13 @@ so->so_upcall = NULL; so->so_rcv.sb_flags &= ~SB_UPCALL; soshutdown(so, 2); - thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); - closef(fp, (struct proc *)0); if (slp->ns_nam) MFREE(slp->ns_nam, m); m_freem(slp->ns_raw); m_freem(slp->ns_rec); + slp->ns_nam = slp->ns_raw = slp->ns_rec = NULL; + thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL); + closef(fp, (struct proc *)0); for (nuidp = slp->ns_uidlruhead.tqh_first; nuidp != 0; nuidp = nnuidp) { nnuidp = nuidp->nu_lru.tqe_next; @@ -975,14 +1223,14 @@ TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp, nu_lru); if (nuidp->nu_flag & NU_NAM) m_freem(nuidp->nu_nam); - _FREE_ZONE((caddr_t)nuidp, + FREE_ZONE((caddr_t)nuidp, sizeof (struct nfsuid), M_NFSUID); } s = splsoftclock(); for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) { nnwp = nwp->nd_tq.le_next; LIST_REMOVE(nwp, nd_tq); - _FREE_ZONE((caddr_t)nwp, sizeof *nwp, M_NFSRVDESC); + FREE_ZONE((caddr_t)nwp, sizeof *nwp, M_NFSRVDESC); } LIST_INIT(&slp->ns_tq); splx(s); @@ -1006,17 +1254,17 @@ { int error = 0; - while ((nmp->nm_flag & NFSMNT_WAITAUTH) == 0) { - nmp->nm_flag |= NFSMNT_WANTAUTH; + while ((nmp->nm_state & NFSSTA_WAITAUTH) == 0) { + nmp->nm_state |= NFSSTA_WANTAUTH; (void) tsleep((caddr_t)&nmp->nm_authtype, PSOCK, "nfsauth1", 2 * hz); error = nfs_sigintr(nmp, rep, rep->r_procp); if (error) { - nmp->nm_flag &= ~NFSMNT_WANTAUTH; + nmp->nm_state &= ~NFSSTA_WANTAUTH; return (error); } } - nmp->nm_flag &= ~(NFSMNT_WAITAUTH | NFSMNT_WANTAUTH); + 
nmp->nm_state &= ~(NFSSTA_WAITAUTH | NFSSTA_WANTAUTH); MALLOC(*auth_str, char *, RPCAUTH_MAXSIZ, M_TEMP, M_WAITOK); nmp->nm_authstr = *auth_str; nmp->nm_authlen = RPCAUTH_MAXSIZ; @@ -1028,13 +1276,13 @@ /* * And wait for mount_nfs to do its stuff. */ - while ((nmp->nm_flag & NFSMNT_HASAUTH) == 0 && error == 0) { + while ((nmp->nm_state & NFSSTA_HASAUTH) == 0 && error == 0) { (void) tsleep((caddr_t)&nmp->nm_authlen, PSOCK, "nfsauth2", 2 * hz); error = nfs_sigintr(nmp, rep, rep->r_procp); } - if (nmp->nm_flag & NFSMNT_AUTHERR) { - nmp->nm_flag &= ~NFSMNT_AUTHERR; + if (nmp->nm_state & NFSSTA_AUTHERR) { + nmp->nm_state &= ~NFSSTA_AUTHERR; error = EAUTH; } if (error) @@ -1044,10 +1292,10 @@ *verf_len = nmp->nm_verflen; bcopy((caddr_t)nmp->nm_key, (caddr_t)key, sizeof (key)); } - nmp->nm_flag &= ~NFSMNT_HASAUTH; - nmp->nm_flag |= NFSMNT_WAITAUTH; - if (nmp->nm_flag & NFSMNT_WANTAUTH) { - nmp->nm_flag &= ~NFSMNT_WANTAUTH; + nmp->nm_state &= ~NFSSTA_HASAUTH; + nmp->nm_state |= NFSSTA_WAITAUTH; + if (nmp->nm_state & NFSSTA_WANTAUTH) { + nmp->nm_state &= ~NFSSTA_WANTAUTH; wakeup((caddr_t)&nmp->nm_authtype); } return (error); @@ -1067,7 +1315,7 @@ { register struct nfsuid *nuidp; register u_long *nickp, *verfp; - struct timeval ktvin, ktvout; + struct timeval ktvin, ktvout, now; #if DIAGNOSTIC if (verf_len < (4 * NFSX_UNSIGNED)) @@ -1078,7 +1326,8 @@ if (nuidp->nu_cr.cr_uid == cred->cr_uid) break; } - if (!nuidp || nuidp->nu_expire < time.tv_sec) + microtime(&now); + if (!nuidp || nuidp->nu_expire < now.tv_sec) return (EACCES); /* @@ -1098,10 +1347,11 @@ */ verfp = (u_long *)verf_str; *verfp++ = txdr_unsigned(RPCAKN_NICKNAME); - if (time.tv_sec > nuidp->nu_timestamp.tv_sec || - (time.tv_sec == nuidp->nu_timestamp.tv_sec && - time.tv_usec > nuidp->nu_timestamp.tv_usec)) - nuidp->nu_timestamp = time; + microtime(&now); + if (now.tv_sec > nuidp->nu_timestamp.tv_sec || + (now.tv_sec == nuidp->nu_timestamp.tv_sec && + now.tv_usec > nuidp->nu_timestamp.tv_usec)) + 
nuidp->nu_timestamp = now; else nuidp->nu_timestamp.tv_usec++; ktvin.tv_sec = txdr_unsigned(nuidp->nu_timestamp.tv_sec); @@ -1138,7 +1388,7 @@ register u_long *tl; register long t1; struct mbuf *md = *mdp; - struct timeval ktvin, ktvout; + struct timeval ktvin, ktvout, now; u_long nick; char *dpos = *dposp, *cp2; int deltasec, error = 0; @@ -1157,7 +1407,8 @@ #endif ktvout.tv_sec = fxdr_unsigned(long, ktvout.tv_sec); ktvout.tv_usec = fxdr_unsigned(long, ktvout.tv_usec); - deltasec = time.tv_sec - ktvout.tv_sec; + microtime(&now); + deltasec = now.tv_sec - ktvout.tv_sec; if (deltasec < 0) deltasec = -deltasec; /* @@ -1177,7 +1428,7 @@ } nuidp->nu_flag = 0; nuidp->nu_cr.cr_uid = cred->cr_uid; - nuidp->nu_expire = time.tv_sec + NFS_KERBTTL; + nuidp->nu_expire = now.tv_sec + NFS_KERBTTL; nuidp->nu_timestamp = ktvout; nuidp->nu_nickname = nick; bcopy(key, nuidp->nu_key, sizeof (key)); @@ -1211,6 +1462,44 @@ } /* + * Lock a socket against others. + */ +int +nfs_slplock(slp, wait) + register struct nfssvc_sock *slp; + int wait; +{ + int *statep = &slp->ns_solock; + + if (!wait && (*statep & NFSSTA_SNDLOCK)) + return(0); /* already locked, fail */ + while (*statep & NFSSTA_SNDLOCK) { + *statep |= NFSSTA_WANTSND; + (void) tsleep((caddr_t)statep, PZERO - 1, "nfsslplck", 0); + } + *statep |= NFSSTA_SNDLOCK; + return (1); +} + +/* + * Unlock the stream socket for others. + */ +void +nfs_slpunlock(slp) + struct nfssvc_sock *slp; +{ + int *statep = &slp->ns_solock; + + if ((*statep & NFSSTA_SNDLOCK) == 0) + panic("nfs slpunlock"); + *statep &= ~NFSSTA_SNDLOCK; + if (*statep & NFSSTA_WANTSND) { + *statep &= ~NFSSTA_WANTSND; + wakeup((caddr_t)statep); + } +} + +/* * Initialize the data structures for the server. * Handshake with any new nfsds starting up to avoid any chance of * corruption. 
@@ -1274,6 +1563,7 @@ int cacherep; { register struct drt *rt; + struct timeval now; rt = &nfsdrt.drt[nfsdrt.pos]; if (cacherep == RC_DOIT) @@ -1293,9 +1583,10 @@ rt->ipadr = mtod(nd->nd_nam, struct sockaddr_in *)->sin_addr.s_addr; else rt->ipadr = INADDR_ANY; - rt->resptime = ((time.tv_sec - nd->nd_starttime.tv_sec) * 1000000) + - (time.tv_usec - nd->nd_starttime.tv_usec); - rt->tstamp = time; + microuptime(&now); + rt->resptime = ((now.tv_sec - nd->nd_starttime.tv_sec) * 1000000) + + (now.tv_usec - nd->nd_starttime.tv_usec); + microtime(&rt->tstamp); // XXX unused nfsdrt.pos = (nfsdrt.pos + 1) % NFSRTTLOGSIZ; } #endif /* NFS_NOSERVER */ diff -urN xnu-344.49/bsd/nfs/nfs_vfsops.c xnu-517/bsd/nfs/nfs_vfsops.c --- xnu-344.49/bsd/nfs/nfs_vfsops.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_vfsops.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -74,7 +74,6 @@ #include #include #include -#include #include #include #include @@ -121,6 +120,18 @@ #endif #endif +SYSCTL_DECL(_vfs_generic_nfs); +SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW, 0, + "nfs client hinge"); +/* how long NFS will wait before signalling vfs that it's down. 
*/ +static int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY; +SYSCTL_INT(_vfs_generic_nfs_client, NFS_TPRINTF_INITIAL_DELAY, + initialdowndelay, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, ""); +/* how long between console messages "nfs server foo not responding" */ +static int nfs_tprintf_delay = NFS_TPRINTF_DELAY; +SYSCTL_INT(_vfs_generic_nfs_client, NFS_TPRINTF_DELAY, + nextdowndelay, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); + static int nfs_iosize __P((struct nfsmount *nmp)); static int mountnfs __P((struct nfs_args *,struct mount *, struct mbuf *,char *,char *,struct vnode **)); @@ -141,7 +152,7 @@ static int nfs_fhtovp __P((struct mount *mp, struct fid *fhp, struct mbuf *nam, struct vnode **vpp, int *exflagsp, struct ucred **credanonp)); -static int nfs_vget __P((struct mount *, ino_t, struct vnode **)); +static int nfs_vget __P((struct mount *, void *, struct vnode **)); /* @@ -192,12 +203,15 @@ * Calculate the size used for io buffers. Use the larger * of the two sizes to minimise nfs requests but make sure * that it is at least one VM page to avoid wasting buffer - * space. + * space and to allow easy mmapping of I/O buffers. + * The read/write rpc calls handle the splitting up of + * buffers into multiple requests if the buffer size is + * larger than the I/O size. 
*/ iosize = max(nmp->nm_rsize, nmp->nm_wsize); if (iosize < PAGE_SIZE) iosize = PAGE_SIZE; - return (trunc_page(iosize)); + return (trunc_page_32(iosize)); } static void nfs_convert_oargs(args,oargs) @@ -255,7 +269,7 @@ return(error); cred = crget(); cred->cr_ngroups = 1; - if (v3 && (nmp->nm_flag & NFSMNT_GOTFSINFO) == 0) + if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, cred, p); nfsstats.rpccnt[NFSPROC_FSSTAT]++; nfsm_reqhead(vp, NFSPROC_FSSTAT, NFSX_FH(v3)); @@ -355,7 +369,7 @@ if (max < nmp->nm_readdirsize) { nmp->nm_readdirsize = max; } - nmp->nm_flag |= NFSMNT_GOTFSINFO; + nmp->nm_state |= NFSSTA_GOTFSINFO; } nfsm_reqdone; return (error); @@ -513,8 +527,12 @@ if ((error = mountnfs(&args, mp, m, mntname, args.hostname, vpp))) { printf("nfs_mountroot: mount %s failed: %d", mntname, error); mp->mnt_vfc->vfc_refcount--; + + if (mp->mnt_kern_flag & MNTK_IO_XINFO) + FREE(mp->mnt_xinfo_ptr, M_TEMP); vfs_unbusy(mp, procp); - _FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); + + FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); return (error); } #if 0 /* Causes incorrect reporting of "mounted on" */ @@ -607,9 +625,9 @@ mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); - /* Initialize the default IO constraints */ - mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; - mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; + /* Initialize the default IO constraints */ + mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; + mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); (void)vfs_busy(mp, LK_NOWAIT, 0, procp); @@ -645,8 +663,12 @@ if ((error = mountnfs(&args, mp, m, mntname, args.hostname, &vp))) { printf("nfs_mountroot: mount %s failed: %d", mntname, error); mp->mnt_vfc->vfc_refcount--; + + if (mp->mnt_kern_flag & MNTK_IO_XINFO) + FREE(mp->mnt_xinfo_ptr, M_TEMP); vfs_unbusy(mp, procp); - _FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); + + 
FREE_ZONE(mp, sizeof (struct mount), M_MOUNT); return (error); } @@ -679,7 +701,7 @@ struct mbuf *nam; struct vnode *vp; char pth[MNAMELEN], hst[MNAMELEN]; - u_int len; + size_t len; u_char nfh[NFSX_V3FHMAX]; error = copyin(data, (caddr_t)&args, sizeof (struct nfs_args)); @@ -750,6 +772,13 @@ error = NFSERR_NOTSUPP; goto bad2; } + + /* + * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes + * no sense in that context. + */ + if (argp->sotype == SOCK_STREAM) + argp->flags &= ~NFSMNT_NOCONN; if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); @@ -777,26 +806,30 @@ mp->mnt_maxsymlinklen = 1; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; - nmp->nm_wsize = NFS_WSIZE; - nmp->nm_rsize = NFS_RSIZE; + if (argp->sotype == SOCK_DGRAM) { + nmp->nm_wsize = NFS_DGRAM_WSIZE; + nmp->nm_rsize = NFS_DGRAM_RSIZE; + } else { + nmp->nm_wsize = NFS_WSIZE; + nmp->nm_rsize = NFS_RSIZE; + } nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = NFS_DEFRAHEAD; nmp->nm_leaseterm = NQ_DEFLEASE; nmp->nm_deadthresh = NQ_DEADTHRESH; + nmp->nm_tprintf_delay = nfs_tprintf_delay; + if (nmp->nm_tprintf_delay < 0) + nmp->nm_tprintf_delay = 0; + nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; + if (nmp->nm_tprintf_initial_delay < 0) + nmp->nm_tprintf_initial_delay = 0; CIRCLEQ_INIT(&nmp->nm_timerhead); nmp->nm_inprog = NULLVP; bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); bcopy(pth, mp->mnt_stat.f_mntonname, MNAMELEN); nmp->nm_nam = nam; - /* - * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes - * no sense in that context. - */ - if (argp->sotype == SOCK_STREAM) - argp->flags &= ~NFSMNT_NOCONN; - if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) @@ -876,13 +909,6 @@ goto bad; /* - * This is silly, but it has to be set so that vinifod() works. 
- * We do not want to do an nfs_statfs() here since we can get - * stuck on a dead server and we are holding a lock on the mount - * point. - */ - mp->mnt_stat.f_iosize = nfs_iosize(nmp); - /* * A reference count is needed on the nfsnode representing the * remote root. If this object is not persistent, then backward * traversals of the mount point (i.e. "..") will not work if @@ -906,7 +932,24 @@ * effect of filling in (*vpp)->v_type with the correct value. */ curproc = current_proc(); - VOP_GETATTR(*vpp, &attrs, curproc->p_ucred, curproc); + error = VOP_GETATTR(*vpp, &attrs, curproc->p_ucred, curproc); + if (error) { + /* + * we got problems... we couldn't get the attributes + * from the NFS server... so the mount fails. + */ + vput(*vpp); + goto bad; + } + + /* + * Set the mount point's block I/O size. + * We really need to do this after we get info back from + * the server about what its preferred I/O sizes are. + */ + if (nmp->nm_flag & NFSMNT_NFSV3) + nfs_fsinfo(nmp, *vpp, curproc->p_ucred, curproc); + mp->mnt_stat.f_iosize = nfs_iosize(nmp); /* * Lose the lock but keep the ref. @@ -916,7 +959,7 @@ return (0); bad: nfs_disconnect(nmp); - _FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); + FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); bad2: m_freem(nam); return (error); @@ -936,10 +979,18 @@ struct vnode *vp; int error, flags = 0; - if (mntflags & MNT_FORCE) - flags |= FORCECLOSE; nmp = VFSTONFS(mp); /* + * During a force unmount we want to... + * Mark that we are doing a force unmount. + * Make the mountpoint soft. + */ + if (mntflags & MNT_FORCE) { + flags |= FORCECLOSE; + nmp->nm_state |= NFSSTA_FORCE; + nmp->nm_flag |= NFSMNT_SOFT; + } + /* * Goes something like this.. * - Call vflush() to clear out vnodes for this file system, * except for the swap files. Deal with them in 2nd pass. @@ -953,7 +1004,7 @@ /* * Must handshake with nqnfs_clientd() if it is active. 
*/ - nmp->nm_flag |= NFSMNT_DISMINPROG; + nmp->nm_state |= NFSSTA_DISMINPROG; while (nmp->nm_inprog != NULLVP) (void) tsleep((caddr_t)&lbolt, PSOCK, "nfsdism", 0); /* @@ -962,18 +1013,18 @@ * not get EBUSY back. */ error = vflush(mp, vp, SKIPSWAP | flags); - if (mntflags & MNT_FORCE) + if (mntflags & MNT_FORCE) { error = vflush(mp, NULLVP, flags); /* locks vp in the process */ - else { + } else { if (vp->v_usecount > 1) { - nmp->nm_flag &= ~NFSMNT_DISMINPROG; + nmp->nm_state &= ~NFSSTA_DISMINPROG; return (EBUSY); } error = vflush(mp, vp, flags); } if (error) { - nmp->nm_flag &= ~NFSMNT_DISMINPROG; + nmp->nm_state &= ~NFSSTA_DISMINPROG; return (error); } @@ -982,7 +1033,7 @@ * For NQNFS, let the server daemon free the nfsmount structure. */ if (nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_KERB)) - nmp->nm_flag |= NFSMNT_DISMNT; + nmp->nm_state |= NFSSTA_DISMNT; /* * Release the root vnode reference held by mountnfs() @@ -1018,7 +1069,7 @@ if (hw_atomic_sub(&nfsreqqusers, 1) != 0) nfsatompanic("unmount sub"); #endif - _FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); + FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); } return (0); } @@ -1033,13 +1084,18 @@ { register struct vnode *vp; struct nfsmount *nmp; - int error; + int error, vpid; nmp = VFSTONFS(mp); vp = nmp->nm_dvp; - error = vget(vp, LK_EXCLUSIVE, current_proc()); - if (error) - return (error); + vpid = vp->v_id; + while (error = vget(vp, LK_EXCLUSIVE, current_proc())) { + /* vget may return ENOENT if the dir changes while in vget */ + /* If that happens, try vget again, else return the error */ + if ((error != ENOENT) || (vp->v_id == vpid)) + return (error); + vpid = vp->v_id; + } if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_flag |= VROOT; @@ -1067,17 +1123,15 @@ * Force stale buffer cache information to be flushed. 
*/ loop: - for (vp = mp->mnt_vnodelist.lh_first; - vp != NULL; - vp = vp->v_mntvnodes.le_next) { - int didhold = 0; + LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { + int didhold; /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; - if (VOP_ISLOCKED(vp) || vp->v_dirtyblkhd.lh_first == NULL) + if (VOP_ISLOCKED(vp) || LIST_FIRST(&VTONFS(vp)->n_dirtyblkhd) == NULL) continue; if (vget(vp, LK_EXCLUSIVE, p)) goto loop; @@ -1101,7 +1155,7 @@ static int nfs_vget(mp, ino, vpp) struct mount *mp; - ino_t ino; + void *ino; /* XXX void* or ino_t? */ struct vnode **vpp; { @@ -1175,7 +1229,12 @@ nfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen, struct proc *p) { - int rv; + int error; + struct sysctl_req *req; + struct vfsidctl vc; + struct mount *mp; + struct nfsmount *nmp; + struct vfsquery vq; /* * All names at this level are terminal. @@ -1183,6 +1242,24 @@ if(namelen > 1) return ENOTDIR; /* overloaded */ + /* common code for "new style" VFS_CTL sysctl, get the mount. 
*/ + switch (name[0]) { + case VFS_CTL_TIMEO: + case VFS_CTL_QUERY: + req = oldp; + error = SYSCTL_IN(req, &vc, sizeof(vc)); + if (error) + return (error); + mp = vfs_getvfs(&vc.vc_fsid); + if (mp == NULL) + return (ENOENT); + nmp = VFSTONFS(mp); + if (nmp == NULL) + return (ENOENT); + bzero(&vq, sizeof(vq)); + VCTLTOREQ(&vc, req); + } + switch(name[0]) { case NFS_NFSSTATS: if(!oldp) { @@ -1195,8 +1272,9 @@ return ENOMEM; } - rv = copyout(&nfsstats, oldp, sizeof nfsstats); - if(rv) return rv; + error = copyout(&nfsstats, oldp, sizeof nfsstats); + if (error) + return (error); if(newp && newlen != sizeof nfsstats) return EINVAL; @@ -1205,9 +1283,30 @@ return copyin(newp, &nfsstats, sizeof nfsstats); } return 0; - + case VFS_CTL_QUERY: + if ((nmp->nm_state & NFSSTA_TIMEO)) + vq.vq_flags |= VQ_NOTRESP; + error = SYSCTL_OUT(req, &vq, sizeof(vq)); + break; + case VFS_CTL_TIMEO: + if (req->oldptr != NULL) { + error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay, + sizeof(nmp->nm_tprintf_initial_delay)); + if (error) + return (error); + } + if (req->newptr != NULL) { + error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay, + sizeof(nmp->nm_tprintf_initial_delay)); + if (error) + return (error); + if (nmp->nm_tprintf_initial_delay < 0) + nmp->nm_tprintf_initial_delay = 0; + } + break; default: - return EOPNOTSUPP; + return (ENOTSUP); } + return (error); } diff -urN xnu-344.49/bsd/nfs/nfs_vnops.c xnu-517/bsd/nfs/nfs_vnops.c --- xnu-344.49/bsd/nfs/nfs_vnops.c Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfs_vnops.c Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -72,7 +72,6 @@ #include #include #include -#include #include #include #include @@ -83,7 +82,6 @@ #include #include -#include #include #include @@ -101,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -128,6 +127,15 @@ #define TRUE 1 #define FALSE 0 +#define NFS_FREE_PNBUF(CNP) \ + do { \ + char *tmp = (CNP)->cn_pnbuf; \ + (CNP)->cn_pnbuf = NULL; \ + (CNP)->cn_flags &= ~HASBUF; \ + FREE_ZONE(tmp, (CNP)->cn_pnlen, M_NAMEI); \ + } while (0) + + static int nfsspec_read __P((struct vop_read_args *)); static int nfsspec_write __P((struct vop_write_args *)); static int nfsfifo_read __P((struct vop_read_args *)); @@ -158,7 +166,6 @@ static int nfs_symlink __P((struct vop_symlink_args *)); static int nfs_readdir __P((struct vop_readdir_args *)); static int nfs_bmap __P((struct vop_bmap_args *)); -static int nfs_strategy __P((struct vop_strategy_args *)); static int nfs_lookitup __P((struct vnode *,char *,int,struct ucred *,struct proc *,struct nfsnode **)); static int nfs_sillyrename __P((struct vnode *,struct vnode *,struct componentname *)); static int nfsspec_access __P((struct vop_access_args *)); @@ -167,7 +174,6 @@ static int nfs_pathconf __P((struct vop_pathconf_args *)); static int nfs_advlock __P((struct vop_advlock_args *)); static int nfs_blkatoff __P((struct vop_blkatoff_args *)); -static int nfs_bwrite __P((struct vop_bwrite_args *)); static int nfs_valloc __P((struct vop_valloc_args *)); static int nfs_vfree __P((struct vop_vfree_args *)); static int nfs_truncate __P((struct vop_truncate_args *)); @@ -209,13 +215,13 @@ { &vop_symlink_desc, (vop_t *)nfs_symlink }, /* symlink */ { &vop_readdir_desc, (vop_t *)nfs_readdir }, /* readdir */ { &vop_readlink_desc, (vop_t *)nfs_readlink }, /* readlink */ - { &vop_abortop_desc, (vop_t *)nfs_abortop }, /* abortop */ + { &vop_abortop_desc, (vop_t *)nop_abortop }, /* abortop */ { &vop_inactive_desc, (vop_t *)nfs_inactive }, /* inactive */ { 
&vop_reclaim_desc, (vop_t *)nfs_reclaim }, /* reclaim */ { &vop_lock_desc, (vop_t *)nfs_lock }, /* lock */ { &vop_unlock_desc, (vop_t *)nfs_unlock }, /* unlock */ { &vop_bmap_desc, (vop_t *)nfs_bmap }, /* bmap */ - { &vop_strategy_desc, (vop_t *)nfs_strategy }, /* strategy */ + { &vop_strategy_desc, (vop_t *)err_strategy }, /* strategy */ { &vop_print_desc, (vop_t *)nfs_print }, /* print */ { &vop_islocked_desc, (vop_t *)nfs_islocked }, /* islocked */ { &vop_pathconf_desc, (vop_t *)nfs_pathconf }, /* pathconf */ @@ -226,7 +232,7 @@ { &vop_vfree_desc, (vop_t *)nfs_vfree }, /* vfree */ { &vop_truncate_desc, (vop_t *)nfs_truncate }, /* truncate */ { &vop_update_desc, (vop_t *)nfs_update }, /* update */ - { &vop_bwrite_desc, (vop_t *)nfs_bwrite }, /* bwrite */ + { &vop_bwrite_desc, (vop_t *)err_bwrite }, /* bwrite */ { &vop_pagein_desc, (vop_t *)nfs_pagein }, /* Pagein */ { &vop_pageout_desc, (vop_t *)nfs_pageout }, /* Pageout */ { &vop_copyfile_desc, (vop_t *)err_copyfile }, /* Copyfile */ @@ -363,8 +369,6 @@ VNODEOP_SET(fifo_nfsv2nodeop_opv_desc); #endif -static int nfs_commit __P((struct vnode *vp, u_quad_t offset, int cnt, - struct ucred *cred, struct proc *procp)); static int nfs_mknodrpc __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap)); @@ -387,6 +391,7 @@ struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; int nfs_numasync = 0; +int nfs_ioddelwri = 0; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; @@ -528,30 +533,32 @@ struct ucred *cred) { const int v3 = 1; - u_int32_t *tl; + u_long *tl; int error = 0, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; caddr_t bpos, dpos, cp2; - register int32_t t1, t2; + register long t1, t2; register caddr_t cp; u_int32_t rmode; struct nfsnode *np = VTONFS(vp); u_int64_t xid; + struct timeval now; nfsstats.rpccnt[NFSPROC_ACCESS]++; nfsm_reqhead(vp, NFSPROC_ACCESS, 
NFSX_FH(v3) + NFSX_UNSIGNED); nfsm_fhtom(vp, v3); - nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); + nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = txdr_unsigned(wmode); nfsm_request(vp, NFSPROC_ACCESS, p, cred, &xid); nfsm_postop_attr(vp, attrflag, &xid); if (!error) { - nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); rmode = fxdr_unsigned(u_int32_t, *tl); np->n_mode = rmode; np->n_modeuid = cred->cr_uid; - np->n_modestamp = time_second; + microuptime(&now); + np->n_modestamp = now.tv_sec; } nfsm_reqdone; return error; @@ -577,6 +584,7 @@ u_long mode, wmode; int v3 = NFS_ISV3(vp); struct nfsnode *np = VTONFS(vp); + struct timeval now; /* * For nfs v3, do an access rpc, otherwise you are stuck emulating @@ -615,7 +623,8 @@ * Does our cached result allow us to give a definite yes to * this request? */ - if (time_second < np->n_modestamp + nfsaccess_cache_timeout && + microuptime(&now); + if (now.tv_sec < np->n_modestamp + nfsaccess_cache_timeout && ap->a_cred->cr_uid == np->n_modeuid && (np->n_mode & mode) == mode) { /* nfsstats.accesscache_hits++; */ @@ -708,14 +717,22 @@ error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); + /* if directory changed, purge any name cache entries */ + if ((vp->v_type == VDIR) && + (np->n_mtime != vattr.va_mtime.tv_sec)) + cache_purge(vp); np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); if (np->n_mtime != vattr.va_mtime.tv_sec) { - if (vp->v_type == VDIR) + if (vp->v_type == VDIR) { np->n_direofoffset = 0; + nfs_invaldir(vp); + /* purge name cache entries */ + cache_purge(vp); + } if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); @@ -771,6 +788,7 @@ { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp; int error = 0; if (vp->v_type == VREG) { @@ -781,8 +799,21 @@ &sp->s_name[0], 
(unsigned)(sp->s_dvp), (unsigned)vp, (unsigned)ap, (unsigned)np, (unsigned)sp); #endif - if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 && + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && (np->n_flag & NMODIFIED)) { + int getlock = !VOP_ISLOCKED(vp); + if (getlock) { + error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); + if (!error && !VFSTONFS(vp->v_mount)) { + VOP_UNLOCK(vp, 0, ap->a_p); + error = ENXIO; + } + if (error) + return (error); + } if (NFS_ISV3(vp)) { error = nfs_flush(vp, ap->a_cred, MNT_WAIT, ap->a_p, 1); /* @@ -791,9 +822,12 @@ * NMODIFIED is a hint */ /* np->n_flag &= ~NMODIFIED; */ - } else + } else { error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + } np->n_attrstamp = 0; + if (getlock) + VOP_UNLOCK(vp, 0, ap->a_p); } if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; @@ -823,7 +857,7 @@ caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(vp); + int v3; u_int64_t xid; int avoidfloods; @@ -845,6 +879,12 @@ np->n_flag); return (error); } + + if (!VFSTONFS(vp->v_mount)) { + FSDBG_BOT(513, np->n_size, ENXIO, np->n_vattr.va_size, np->n_flag); + return (ENXIO); + } + v3 = NFS_ISV3(vp); error = 0; if (v3 && nfsaccess_cache_timeout > 0) { @@ -878,13 +918,17 @@ } if (np->n_mtime != ap->a_vap->va_mtime.tv_sec) { FSDBG(513, -1, np, -1, vp); - if (vp->v_type == VDIR) + if (vp->v_type == VDIR) { nfs_invaldir(vp); + /* purge name cache entries */ + cache_purge(vp); + } error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); FSDBG(513, -1, np, -2, error); - if (!error) + if (!error) { np->n_mtime = ap->a_vap->va_mtime.tv_sec; + } } } nfsm_reqdone; @@ -973,32 +1017,70 @@ } else if (np->n_size > vap->va_size) { /* shrinking? 
*/ daddr_t obn, bn; int biosize; - struct buf *bp; + struct nfsbuf *bp; - biosize = min(vp->v_mount->mnt_stat.f_iosize, - PAGE_SIZE); + biosize = vp->v_mount->mnt_stat.f_iosize; obn = (np->n_size - 1) / biosize; bn = vap->va_size / biosize; for ( ; obn >= bn; obn--) - if (incore(vp, obn)) { - bp = getblk(vp, obn, biosize, 0, - 0, BLK_READ); - FSDBG(512, bp, bp->b_flags, - 0, obn); - SET(bp->b_flags, B_INVAL); - brelse(bp); + if (nfs_buf_incore(vp, obn)) { + bp = nfs_buf_get(vp, obn, biosize, 0, BLK_READ); + if (!bp) + continue; + if (obn == bn) { + int neweofoff, mustwrite; + mustwrite = 0; + neweofoff = vap->va_size - NBOFF(bp); + /* check for any dirty data before the new EOF */ + if (bp->nb_dirtyend && bp->nb_dirtyoff < neweofoff) { + /* clip dirty range to EOF */ + if (bp->nb_dirtyend > neweofoff) + bp->nb_dirtyend = neweofoff; + mustwrite++; + } + bp->nb_dirty &= (1 << round_page_32(neweofoff)/PAGE_SIZE) - 1; + if (bp->nb_dirty) + mustwrite++; + if (mustwrite) { + /* gotta write out dirty data before invalidating */ + /* (NB_STABLE indicates that data writes should be FILESYNC) */ + /* (NB_NOCACHE indicates buffer should be discarded) */ + CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC | NB_READ)); + SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); + /* + * NFS has embedded ucred so crhold() risks zone corruption + */ + if (bp->nb_wcred == NOCRED) + bp->nb_wcred = crdup(ap->a_cred); + error = nfs_buf_write(bp); + // Note: bp has been released + if (error) { + FSDBG(512, bp, 0xd00dee, 0xbad, error); + np->n_error = error; + np->n_flag |= NWRITEERR; + error = 0; + } + bp = NULL; + } } + if (bp) { + FSDBG(512, bp, bp->nb_flags, 0, obn); + SET(bp->nb_flags, NB_INVAL); + nfs_buf_release(bp); + } + } } tsize = np->n_size; np->n_size = np->n_vattr.va_size = vap->va_size; - ubc_setsize(vp, (off_t)vap->va_size); /* XXX */ + ubc_setsize(vp, (off_t)vap->va_size); /* XXX error? 
*/ }; } else if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && - (np->n_flag & NMODIFIED) && vp->v_type == VREG && - (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, - ap->a_p, 1)) == EINTR) - return (error); + (np->n_flag & NMODIFIED) && vp->v_type == VREG) { + error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + if (error == EINTR) + return (error); + } error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p); FSDBG_BOT(512, np->n_size, vap->va_size, np->n_vattr.va_size, error); if (error && vap->va_size != VNOVAL) { @@ -1033,8 +1115,13 @@ u_long *tl; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(vp); + int v3; u_int64_t xid; + struct timeval now; + + if (!VFSTONFS(vp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_SETATTR]++; nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3)); @@ -1072,8 +1159,9 @@ nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } + microtime(&now); if (vap->va_atime.tv_sec != VNOVAL) { - if (vap->va_atime.tv_sec != time.tv_sec) { + if (vap->va_atime.tv_sec != now.tv_sec) { nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); txdr_nfsv3time(&vap->va_atime, tl); @@ -1086,7 +1174,7 @@ *tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE); } if (vap->va_mtime.tv_sec != VNOVAL) { - if (vap->va_mtime.tv_sec != time.tv_sec) { + if (vap->va_mtime.tv_sec != now.tv_sec) { nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT); txdr_nfsv3time(&vap->va_mtime, tl); @@ -1121,7 +1209,7 @@ nfsm_request(vp, NFSPROC_SETATTR, procp, cred, &xid); if (v3) { nfsm_wcc_data(vp, wccflag, &xid); - if (!wccflag && vp->v_type != VBAD) /* EINVAL on VBAD node */ + if (!wccflag) VTONFS(vp)->n_attrstamp = 0; } else nfsm_loadattr(vp, (struct vattr *)0, &xid); @@ -1151,7 +1239,6 @@ register u_long *tl; register caddr_t cp; register long t1, t2; - struct nfsmount *nmp; caddr_t 
bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; long len; @@ -1160,8 +1247,9 @@ int lockparent, wantparent, error = 0, attrflag, fhsize; int v3 = NFS_ISV3(dvp); struct proc *p = cnp->cn_proc; - int worldbuildworkaround = 1; + int unlockdvp = 0; u_int64_t xid; + struct vattr vattr; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) @@ -1169,92 +1257,81 @@ *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); + lockparent = flags & LOCKPARENT; wantparent = flags & (LOCKPARENT|WANTPARENT); - nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); - if (worldbuildworkaround) { - /* - * Temporary workaround for world builds to not have dvp go - * VBAD on during server calls in this routine. When - * the real ref counting problem is found take this out. - * Note if this was later and before the nfsm_request - * set up, the workaround did not work (NOTE other difference - * was I only put one VREF in that time. Thus it needs - * to be above the cache_lookup branch or with 2 VREFS. Not - * sure which. Can't play with world builds right now to see - * which. VOP_ACCESS could also make it go to server. 
- EKN - */ - VREF(dvp); /* hang on to this dvp - EKN */ - VREF(dvp); /* hang on tight - EKN */ - } + /* if directory has changed, purge any name cache entries */ + if (!VOP_GETATTR(dvp, &vattr, cnp->cn_cred, p) && + (np->n_mtime != vattr.va_mtime.tv_sec)) + cache_purge(dvp); if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { - struct vattr vattr; int vpid; - if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p))) { - *vpp = NULLVP; - goto error_return; - } - - /* got to check to make sure the vnode didn't go away if access went to server */ - if ((*vpp)->v_type == VBAD) { - error = EINVAL; - goto error_return; - } - newvp = *vpp; vpid = newvp->v_id; + /* * See the comment starting `Step through' in ufs/ufs_lookup.c * for an explanation of the locking protocol */ + + /* + * Note: we need to make sure to get a lock/ref on newvp + * before we possibly go off to the server in VOP_ACCESS. + */ if (dvp == newvp) { VREF(newvp); error = 0; } else if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = vget(newvp, LK_EXCLUSIVE, p); - if (!error && lockparent && (flags & ISLASTCN)) + if (!error) error = vn_lock(dvp, LK_EXCLUSIVE, p); } else { error = vget(newvp, LK_EXCLUSIVE, p); - if (!lockparent || error || !(flags & ISLASTCN)) + if (error) VOP_UNLOCK(dvp, 0, p); } - if (!error) { - if (vpid == newvp->v_id) { - if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p) - && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { - nfsstats.lookupcache_hits++; - if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) - cnp->cn_flags |= SAVENAME; - error = 0; /* ignore any from VOP_GETATTR */ - goto error_return; - } - cache_purge(newvp); + + if (error) + goto cache_lookup_out; + + if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p))) { + if (dvp == newvp) + vrele(newvp); + else + vput(newvp); + *vpp = NULLVP; + goto error_return; } - vput(newvp); - if (lockparent && dvp != newvp && (flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); + + if ((dvp != newvp) && (!lockparent || !(flags & 
ISLASTCN))) + VOP_UNLOCK(dvp, 0, p); + + if (vpid == newvp->v_id) { + if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p) + && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { + nfsstats.lookupcache_hits++; + if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + error = 0; /* ignore any from VOP_GETATTR */ + goto error_return; + } + cache_purge(newvp); } + vput(newvp); + if ((dvp != newvp) && lockparent && (flags & ISLASTCN)) + VOP_UNLOCK(dvp, 0, p); +cache_lookup_out: error = vn_lock(dvp, LK_EXCLUSIVE, p); *vpp = NULLVP; if (error) goto error_return; } - /* - * Got to check to make sure the vnode didn't go away if VOP_GETATTR went to server - * or callers prior to this blocked and had it go VBAD. - */ - if (dvp->v_type == VBAD) { - error = EINVAL; - goto error_return; - } - error = 0; newvp = NULLVP; nfsstats.lookupcache_misses++; @@ -1304,29 +1381,32 @@ goto error_return; } - if (flags & ISDOTDOT) { + if (NFS_CMPFH(np, fhp, fhsize)) { + VREF(dvp); + newvp = dvp; + } else if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { + m_freem(mrep); vn_lock(dvp, LK_EXCLUSIVE + LK_RETRY, p); goto error_return; } newvp = NFSTOV(np); - if (lockparent && (flags & ISLASTCN) && - (error = vn_lock(dvp, LK_EXCLUSIVE, p))) { + if (!lockparent || !(flags & ISLASTCN)) + unlockdvp = 1; /* keep dvp locked until after postops */ + if (error = vn_lock(dvp, LK_EXCLUSIVE, p)) { + m_freem(mrep); vput(newvp); goto error_return; } - } else if (NFS_CMPFH(np, fhp, fhsize)) { - VREF(dvp); - newvp = dvp; } else { if ((error = nfs_nget(dvp->v_mount, fhp, fhsize, &np))) { m_freem(mrep); goto error_return; } if (!lockparent || !(flags & ISLASTCN)) - VOP_UNLOCK(dvp, 0, p); + unlockdvp = 1; /* keep dvp locked until after postops */ newvp = NFSTOV(np); } if (v3) { @@ -1345,35 +1425,29 @@ } *vpp = newvp; nfsm_reqdone; + if (unlockdvp) + VOP_UNLOCK(dvp, 0, p); if (error) { if (newvp != NULLVP) { - vrele(newvp); + if 
(newvp == dvp) + vrele(newvp); + else + vput(newvp); *vpp = NULLVP; } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { - if (!lockparent) - VOP_UNLOCK(dvp, 0, p); - if (dvp->v_mount->mnt_flag & MNT_RDONLY) + if (dvp->v_mount && (dvp->v_mount->mnt_flag & MNT_RDONLY)) error = EROFS; else error = EJUSTRETURN; + if (!lockparent) + VOP_UNLOCK(dvp, 0, p); } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; } error_return: - /* - * These "vreles" set dvp refcounts back to where they were - * before we took extra 2 VREFS to avoid VBAD vnode on dvp - * during server calls for world builds. Remove when real - * fix is found. - EKN - */ - if (worldbuildworkaround) { - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - vrele(dvp); /* end of hanging on tight to dvp - EKN */ - } - return (error); } @@ -1432,9 +1506,13 @@ caddr_t bpos, dpos, cp2; int error = 0, len, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(vp); + int v3; u_int64_t xid; + if (!VFSTONFS(vp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(vp); + nfsstats.rpccnt[NFSPROC_READLINK]++; nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3)); nfsm_fhtom(vp, v3); @@ -1474,21 +1552,25 @@ caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct nfsmount *nmp; - int error = 0, len, retlen, tsiz, eof, attrflag; - int v3 = NFS_ISV3(vp); + int error = 0, len, retlen, tsiz, eof = 0, attrflag; + int v3, nmrsize; u_int64_t xid; -#ifndef nolint - eof = 0; -#endif + FSDBG_TOP(536, vp, uiop->uio_offset, uiop->uio_resid, 0); nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + v3 = NFS_ISV3(vp); + nmrsize = nmp->nm_rsize; + tsiz = uiop->uio_resid; - if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && - !v3) + if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) { + FSDBG_BOT(536, vp, uiop->uio_offset, uiop->uio_resid, EFBIG); return (EFBIG); + } while (tsiz > 
0) { nfsstats.rpccnt[NFSPROC_READ]++; - len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz; + len = (tsiz > nmrsize) ? nmrsize : tsiz; nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3); nfsm_fhtom(vp, v3); nfsm_build(tl, u_long *, NFSX_UNSIGNED * 3); @@ -1500,6 +1582,7 @@ *tl++ = txdr_unsigned(len); *tl = 0; } + FSDBG(536, vp, uiop->uio_offset, len, 0); nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred, &xid); if (v3) { nfsm_postop_attr(vp, attrflag, &xid); @@ -1511,7 +1594,7 @@ eof = fxdr_unsigned(int, *(tl + 1)); } else nfsm_loadattr(vp, (struct vattr *)0, &xid); - nfsm_strsiz(retlen, nmp->nm_rsize); + nfsm_strsiz(retlen, nmrsize); nfsm_mtouio(uiop, retlen); m_freem(mrep); tsiz -= retlen; @@ -1522,6 +1605,7 @@ tsiz = 0; } nfsmout: + FSDBG_BOT(536, vp, eof, uiop->uio_resid, error); return (error); } @@ -1540,20 +1624,32 @@ register int t1, t2, backup; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit; - int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC; + int v3, committed = NFSV3WRITE_FILESYNC; u_int64_t xid; #if DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfs_writerpc: iovcnt > 1"); #endif + FSDBG_TOP(537, vp, uiop->uio_offset, uiop->uio_resid, *iomode); + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + v3 = NFS_ISV3(vp); *must_commit = 0; tsiz = uiop->uio_resid; - if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) + if (((u_int64_t)uiop->uio_offset + (unsigned int)tsiz > 0xffffffff) && !v3) { + FSDBG_BOT(537, vp, uiop->uio_offset, uiop->uio_resid, EFBIG); return (EFBIG); + } while (tsiz > 0) { + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + error = ENXIO; + break; + } nfsstats.rpccnt[NFSPROC_WRITE]++; len = (tsiz > nmp->nm_wsize) ? 
nmp->nm_wsize : tsiz; nfsm_reqhead(vp, NFSPROC_WRITE, @@ -1571,8 +1667,12 @@ tl += 2; } *tl = txdr_unsigned(len); + FSDBG(537, vp, uiop->uio_offset, len, 0); nfsm_uiotom(uiop, len); nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred, &xid); + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + error = ENXIO; if (v3) { wccflag = NFSV3_WCCCHK; nfsm_wcc_data(vp, wccflag, &xid); @@ -1602,10 +1702,10 @@ else if (committed == NFSV3WRITE_DATASYNC && commit == NFSV3WRITE_UNSTABLE) committed = commit; - if ((nmp->nm_flag & NFSMNT_HASWRITEVERF) == 0) { + if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) { bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); - nmp->nm_flag |= NFSMNT_HASWRITEVERF; + nmp->nm_state |= NFSSTA_HASWRITEVERF; } else if (bcmp((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) { *must_commit = 1; @@ -1616,7 +1716,7 @@ } else nfsm_loadattr(vp, (struct vattr *)0, &xid); - if (wccflag && vp->v_type != VBAD) /* EINVAL set on VBAD node */ + if (wccflag) VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec; m_freem(mrep); /* @@ -1631,16 +1731,12 @@ tsiz -= len; } nfsmout: - /* EKN - * does it make sense to even say it was committed if we had an error? - * okay well just don't on bad vnodes then. 
EINVAL will be - * returned on bad vnodes - */ - if (vp->v_type != VBAD && (vp->v_mount->mnt_flag & MNT_ASYNC)) + if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) committed = NFSV3WRITE_FILESYNC; *iomode = committed; if (error) uiop->uio_resid = tsiz; + FSDBG_BOT(537, vp, committed, uiop->uio_resid, error); return (error); } @@ -1735,13 +1831,11 @@ cache_enter(dvp, newvp, cnp); *vpp = newvp; } - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - if (dvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) - VTONFS(dvp)->n_attrstamp = 0; - } + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) + VTONFS(dvp)->n_attrstamp = 0; vput(dvp); + NFS_FREE_PNBUF(cnp); return (error); } @@ -1874,13 +1968,11 @@ cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - if (dvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) - VTONFS(dvp)->n_attrstamp = 0; - } + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) + VTONFS(dvp)->n_attrstamp = 0; vput(dvp); + NFS_FREE_PNBUF(cnp); return (error); } @@ -1924,9 +2016,9 @@ gofree = (ubc_isinuse(vp, 1)) ? 
0 : 1; else { /* dead or dying vnode.With vnode locking panic instead of error */ - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); vput(dvp); vput(vp); + NFS_FREE_PNBUF(cnp); return (EIO); } } else { @@ -1934,6 +2026,13 @@ if (vp->v_usecount == 1) gofree = 1; } + if ((ap->a_cnp->cn_flags & NODELETEBUSY) && !gofree) { + /* Caller requested Carbon delete semantics, but file is busy */ + vput(dvp); + vput(vp); + NFS_FREE_PNBUF(cnp); + return (EBUSY); + } if (gofree || (np->n_sillyrename && VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_proc) == 0 && vattr.va_nlink > 1)) { @@ -1964,15 +2063,23 @@ */ if (error == ENOENT) error = 0; + if (!error) { + /* + * remove nfsnode from hash now so we can't accidentally find it + * again if another object gets created with the same filehandle + * before this vnode gets reclaimed + */ + LIST_REMOVE(np, n_hash); + np->n_flag &= ~NHASHED; + } } else if (!np->n_sillyrename) { error = nfs_sillyrename(dvp, vp, cnp); } - - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); np->n_attrstamp = 0; vput(dvp); VOP_UNLOCK(vp, 0, cnp->cn_proc); + NFS_FREE_PNBUF(cnp); ubc_uncache(vp); vrele(vp); @@ -2008,9 +2115,13 @@ caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(dvp); + int v3; u_int64_t xid; + if (!VFSTONFS(dvp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(dvp); + nfsstats.rpccnt[NFSPROC_REMOVE]++; nfsm_reqhead(dvp, NFSPROC_REMOVE, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen)); @@ -2020,11 +2131,9 @@ if (v3) nfsm_wcc_data(dvp, wccflag, &xid); nfsm_reqdone; - if (dvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) - VTONFS(dvp)->n_attrstamp = 0; - } + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) + VTONFS(dvp)->n_attrstamp = 0; return (error); } @@ -2091,6 +2200,7 @@ if (inuse && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR) { if (error = nfs_sillyrename(tdvp, tvp, tcnp)) { /* sillyrename failed. 
Instead of pressing on, return error */ + VOP_UNLOCK(tvp, 0, tcnp->cn_proc); goto out; /* should not be ENOENT. */ } else { /* sillyrename succeeded.*/ @@ -2105,6 +2215,16 @@ tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, tcnp->cn_proc); + if (!error && tvp && tvp != fvp && !VTONFS(tvp)->n_sillyrename) { + /* + * remove nfsnode from hash now so we can't accidentally find it + * again if another object gets created with the same filehandle + * before this vnode gets reclaimed + */ + LIST_REMOVE(VTONFS(tvp), n_hash); + VTONFS(tvp)->n_flag &= ~NHASHED; + } + if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) { cache_purge(tdvp); @@ -2172,9 +2292,13 @@ caddr_t bpos, dpos, cp2; int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(fdvp); + int v3; u_int64_t xid; + if (!VFSTONFS(fdvp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(fdvp); + nfsstats.rpccnt[NFSPROC_RENAME]++; nfsm_reqhead(fdvp, NFSPROC_RENAME, (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + @@ -2191,16 +2315,12 @@ nfsm_wcc_data(tdvp, twccflag, &txid); } nfsm_reqdone; - if (fdvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(fdvp)->n_flag |= NMODIFIED; - if (!fwccflag) - VTONFS(fdvp)->n_attrstamp = 0; - } - if (tdvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(tdvp)->n_flag |= NMODIFIED; - if (!twccflag) - VTONFS(tdvp)->n_attrstamp = 0; - } + VTONFS(fdvp)->n_flag |= NMODIFIED; + if (!fwccflag) + VTONFS(fdvp)->n_attrstamp = 0; + VTONFS(tdvp)->n_flag |= NMODIFIED; + if (!twccflag) + VTONFS(tdvp)->n_attrstamp = 0; return (error); } @@ -2224,24 +2344,38 @@ caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; - int v3 = NFS_ISV3(vp); + int v3, didhold; u_int64_t xid; if (vp->v_mount != tdvp->v_mount) { VOP_ABORTOP(vp, cnp); - if (tdvp == vp) - vrele(tdvp); - else - vput(tdvp); + vput(tdvp); return (EXDEV); } + 
/* need to get vnode lock for vp before calling VOP_FSYNC() */ + if (error = vn_lock(vp, LK_EXCLUSIVE, cnp->cn_proc)) { + VOP_ABORTOP(vp, cnp); + vput(tdvp); + return (error); + } + + if (!VFSTONFS(vp->v_mount)) { + VOP_UNLOCK(vp, 0, cnp->cn_proc); + VOP_ABORTOP(vp, cnp); + vput(tdvp); + return (ENXIO); + } + v3 = NFS_ISV3(vp); + /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. * XXX There should be a better way! */ + didhold = ubc_hold(vp); VOP_FSYNC(vp, cnp->cn_cred, MNT_WAIT, cnp->cn_proc); + VOP_UNLOCK(vp, 0, cnp->cn_proc); nfsstats.rpccnt[NFSPROC_LINK]++; nfsm_reqhead(vp, NFSPROC_LINK, @@ -2257,14 +2391,16 @@ nfsm_wcc_data(tdvp, wccflag, &txid); } nfsm_reqdone; - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); VTONFS(tdvp)->n_flag |= NMODIFIED; - if (!attrflag && vp->v_type != VBAD) /* EINVAL set on VBAD vnode */ + if (!attrflag) VTONFS(vp)->n_attrstamp = 0; - if (!wccflag && tdvp->v_type != VBAD) /* EINVAL set on VBAD vnode */ + if (!wccflag) VTONFS(tdvp)->n_attrstamp = 0; + if (didhold) + ubc_rele(vp); vput(tdvp); + NFS_FREE_PNBUF(cnp); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. */ @@ -2333,13 +2469,12 @@ nfsm_reqdone; if (newvp) vput(newvp); - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - if (dvp->v_type != VBAD) { /* EINVAL set on VBAD vnode */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) - VTONFS(dvp)->n_attrstamp = 0; - } + + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) + VTONFS(dvp)->n_attrstamp = 0; vput(dvp); + NFS_FREE_PNBUF(cnp); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
*/ @@ -2409,18 +2544,16 @@ if (v3) nfsm_wcc_data(dvp, wccflag, &dxid); nfsm_reqdone; - if (dvp->v_type != VBAD) { /* EINVAL set on this case */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) - VTONFS(dvp)->n_attrstamp = 0; - } + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) + VTONFS(dvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry * if we can succeed in looking up the directory. */ if (error == EEXIST || (!error && !gotvp)) { if (newvp) { - vrele(newvp); + vput(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred, @@ -2433,11 +2566,11 @@ } if (error) { if (newvp) - vrele(newvp); + vput(newvp); } else *ap->a_vpp = newvp; - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); vput(dvp); + NFS_FREE_PNBUF(cnp); return (error); } @@ -2473,16 +2606,14 @@ if (v3) nfsm_wcc_data(dvp, wccflag, &xid); nfsm_reqdone; - FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); - if (dvp->v_type != VBAD) { /* EINVAL set on this case */ - VTONFS(dvp)->n_flag |= NMODIFIED; - if (!wccflag) - VTONFS(dvp)->n_attrstamp = 0; - } + VTONFS(dvp)->n_flag |= NMODIFIED; + if (!wccflag) + VTONFS(dvp)->n_attrstamp = 0; cache_purge(dvp); cache_purge(vp); vput(vp); vput(dvp); + NFS_FREE_PNBUF(cnp); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. 
*/ @@ -2520,10 +2651,13 @@ nfsstats.direofcache_hits++; return (0); } - } else if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp) == 0 && - np->n_mtime == vattr.va_mtime.tv_sec) { - nfsstats.direofcache_hits++; - return (0); + } else if (!VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp)) { + if (np->n_mtime == vattr.va_mtime.tv_sec) { + nfsstats.direofcache_hits++; + return (0); + } + /* directory changed, purge any name cache entries */ + cache_purge(vp); } } @@ -2558,12 +2692,12 @@ caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; nfsuint64 cookie; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; struct nfsnode *dnp = VTONFS(vp); u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int attrflag; - int v3 = NFS_ISV3(vp); + int v3, nmreaddirsize; u_int64_t xid; #ifndef nolint @@ -2574,6 +2708,11 @@ (uiop->uio_resid & (NFS_DIRBLKSIZ - 1))) panic("nfs_readdirrpc: bad uio"); #endif + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + v3 = NFS_ISV3(vp); + nmreaddirsize = nmp->nm_readdirsize; /* * If there is no cookie, assume directory was stale. 
@@ -2603,7 +2742,7 @@ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; } - *tl = txdr_unsigned(nmp->nm_readdirsize); + *tl = txdr_unsigned(nmreaddirsize); nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred, &xid); if (v3) { nfsm_postop_attr(vp, attrflag, &xid); @@ -2746,12 +2885,12 @@ struct nameidata nami, *ndp = &nami; struct componentname *cnp = &ndp->ni_cnd; nfsuint64 cookie; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; struct nfsnode *dnp = VTONFS(vp), *np; nfsfh_t *fhp; u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i; - int attrflag, fhsize; + int attrflag, fhsize, nmreaddirsize, nmrsize; u_int64_t xid, savexid; #ifndef nolint @@ -2762,6 +2901,12 @@ (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs_readdirplusrpc: bad uio"); #endif + nmp = VFSTONFS(vp->v_mount); + if (!nmp) + return (ENXIO); + nmreaddirsize = nmp->nm_readdirsize; + nmrsize = nmp->nm_rsize; + ndp->ni_dvp = vp; newvp = NULLVP; @@ -2788,8 +2933,8 @@ *tl++ = cookie.nfsuquad[1]; *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; - *tl++ = txdr_unsigned(nmp->nm_readdirsize); - *tl = txdr_unsigned(nmp->nm_rsize); + *tl++ = txdr_unsigned(nmreaddirsize); + *tl = txdr_unsigned(nmrsize); nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_procp, cred, &xid); savexid = xid; @@ -2877,6 +3022,20 @@ VREF(vp); newvp = vp; np = dnp; + } else if (!bigenough || + (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && + cnp->cn_nameptr[0] == '.')) { + /* + * don't doit if we can't guarantee + * that this entry is NOT ".." because + * we would have to drop the lock on + * the directory before getting the + * (lock on) the ".." vnode... and we + * don't want to drop the dvp lock in + * the middle of a readdirplus. 
+ */ + doit = 0; } else { if ((error = nfs_nget(vp->v_mount, fhp, fhsize, &np))) @@ -2885,7 +3044,7 @@ newvp = NFSTOV(np); } } - if (doit) { + if (doit && bigenough) { dpossav2 = dpos; dpos = dpossav1; mdsav2 = md; @@ -2911,7 +3070,10 @@ nfsm_adv(nfsm_rndup(i)); } if (newvp != NULLVP) { - vrele(newvp); + if (newvp == vp) + vrele(newvp); + else + vput(newvp); newvp = NULLVP; } nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); @@ -2970,6 +3132,11 @@ * to create the same funny name between the nfs_lookitup() fails and the * nfs_rename() completes, but... */ + +/* format of "random" names and next name to try */ +/* (note: shouldn't exceed size of sillyrename.s_name) */ +static char sillyrename_name[] = ".nfsAAA%04x4.4"; + static int nfs_sillyrename(dvp, vp, cnp) struct vnode *dvp, *vp; @@ -2980,6 +3147,7 @@ int error; short pid; struct ucred *cred; + int i, j, k; cache_purge(dvp); np = VTONFS(vp); @@ -2995,17 +3163,39 @@ /* Fudge together a funny name */ pid = cnp->cn_proc->p_pid; - sp->s_namlen = sprintf(sp->s_name, ".nfsA%04x4.4", pid); + sp->s_namlen = sprintf(sp->s_name, sillyrename_name, pid); /* Try lookitups until we get one that isn't there */ + i = j = k = 0; while (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_proc, (struct nfsnode **)0) == 0) { - sp->s_name[4]++; - if (sp->s_name[4] > 'z') { - error = EINVAL; - goto bad; + if (sp->s_name[4]++ >= 'z') + sp->s_name[4] = 'A'; + if (++i > ('z' - 'A' + 1)) { + i = 0; + if (sp->s_name[5]++ >= 'z') + sp->s_name[5] = 'A'; + if (++j > ('z' - 'A' + 1)) { + j = 0; + if (sp->s_name[6]++ >= 'z') + sp->s_name[6] = 'A'; + if (++k > ('z' - 'A' + 1)) { + error = EINVAL; + goto bad; + } + } + } + } + /* make note of next "random" name to try */ + if ((sillyrename_name[4] = (sp->s_name[4] + 1)) > 'z') { + sillyrename_name[4] = 'A'; + if ((sillyrename_name[5] = (sp->s_name[5] + 1)) > 'z') { + sillyrename_name[5] = 'A'; + if ((sillyrename_name[6] = (sp->s_name[6] + 1)) > 'z') + sillyrename_name[6] = 'A'; } } + /* 
now, do the rename */ if ((error = nfs_renameit(dvp, cnp, sp))) goto bad; error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, @@ -3021,7 +3211,7 @@ cred = sp->s_cred; sp->s_cred = NOCRED; crfree(cred); - _FREE_ZONE((caddr_t)sp, sizeof (struct sillyrename), M_NFSREQ); + FREE_ZONE((caddr_t)sp, sizeof (struct sillyrename), M_NFSREQ); return (error); } @@ -3051,9 +3241,13 @@ int error = 0, fhlen, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; nfsfh_t *nfhp; - int v3 = NFS_ISV3(dvp); + int v3; u_int64_t xid; + if (!VFSTONFS(dvp->v_mount)) + return (ENXIO); + v3 = NFS_ISV3(dvp); + nfsstats.rpccnt[NFSPROC_LOOKUP]++; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); @@ -3065,7 +3259,7 @@ if (*npp) { np = *npp; if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) { - _FREE_ZONE((caddr_t)np->n_fhp, + FREE_ZONE((caddr_t)np->n_fhp, np->n_fhsize, M_NFSBIGFH); np->n_fhp = &np->n_fh; } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH) @@ -3115,7 +3309,7 @@ /* * Nfs Version 3 commit rpc */ -static int +int nfs_commit(vp, offset, cnt, cred, procp) register struct vnode *vp; u_quad_t offset; @@ -3132,8 +3326,10 @@ struct mbuf *mreq, *mrep, *md, *mb, *mb2; u_int64_t xid; - FSDBG(521, vp, offset, cnt, nmp->nm_flag); - if ((nmp->nm_flag & NFSMNT_HASWRITEVERF) == 0) + FSDBG(521, vp, offset, cnt, nmp->nm_state); + if (!nmp) + return (ENXIO); + if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) return (0); nfsstats.rpccnt[NFSPROC_COMMIT]++; nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1)); @@ -3157,15 +3353,6 @@ return (error); } -/* - * Kludge City.. 
- * - make nfs_bmap() essentially a no-op that does no translation - * - do nfs_strategy() by doing I/O with nfs_readrpc/nfs_writerpc - * (Maybe I could use the process's page mapping, but I was concerned that - * Kernel Write might not be enabled and also figured copyout() would do - * a lot more work than bcopy() and also it currently happens in the - * context of the swapper process (2). - */ static int nfs_bmap(ap) struct vop_bmap_args /* { @@ -3182,9 +3369,12 @@ if (ap->a_vpp != NULL) *ap->a_vpp = vp; - if (ap->a_bnp != NULL) + if (ap->a_bnp != NULL) { + if (!vp->v_mount) + return (ENXIO); *ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize, devBlockSize); + } if (ap->a_runp != NULL) *ap->a_runp = 0; #ifdef notyet @@ -3195,41 +3385,6 @@ } /* - * Strategy routine. - * For async requests when nfsiod(s) are running, queue the request by - * calling nfs_asyncio(), otherwise just all nfs_doio() to do the - * request. - */ -static int -nfs_strategy(ap) - struct vop_strategy_args *ap; -{ - register struct buf *bp = ap->a_bp; - struct ucred *cr; - struct proc *p; - int error = 0; - - if (ISSET(bp->b_flags, B_PHYS)) - panic("nfs_strategy: physio"); - if (ISSET(bp->b_flags, B_ASYNC)) - p = (struct proc *)0; - else - p = current_proc(); /* XXX */ - if (ISSET(bp->b_flags, B_READ)) - cr = bp->b_rcred; - else - cr = bp->b_wcred; - /* - * If the op is asynchronous and an i/o daemon is waiting - * queue the request, wake it up and wait for completion - * otherwise just do it ourselves. - */ - if (!ISSET(bp->b_flags, B_ASYNC) || nfs_asyncio(bp, NOCRED)) - error = nfs_doio(bp, cr, p); - return (error); -} - -/* * Mmap a file * * NB Currently unsupported. @@ -3264,270 +3419,284 @@ { return (nfs_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_p, 1)); } - -/* - * Flush all the blocks associated with a vnode. - * Walk through the buffer pool and push any dirty pages - * associated with the vnode. 
- */ -static int -nfs_flush(vp, cred, waitfor, p, commit) - register struct vnode *vp; - struct ucred *cred; - int waitfor; - struct proc *p; - int commit; + +int +nfs_flushcommits(struct vnode *vp, struct proc *p) { - register struct nfsnode *np = VTONFS(vp); - register struct buf *bp; - register int i; - struct buf *nbp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos, err; - int passone = 1; + struct nfsnode *np = VTONFS(vp); + struct nfsbuf *bp, *nbp; + int i, s, error = 0, retv, bvecpos, wcred_set; u_quad_t off, endoff, toff; - struct ucred* wcred = NULL; - struct buf **bvec = NULL; -#ifndef NFS_COMMITBVECSIZ + struct ucred* wcred; + struct nfsbuf **bvec = NULL; #define NFS_COMMITBVECSIZ 20 -#endif - struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; - int bvecsize = 0, bveccount; - kern_return_t kret; - upl_t upl; +#define NFS_MAXCOMMITBVECSIZ 1024 + struct nfsbuf *bvec_on_stack[NFS_COMMITBVECSIZ]; + int bvecsize = NFS_MAXCOMMITBVECSIZ; - FSDBG_TOP(517, vp, np, waitfor, commit); - - if (nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - if (!commit) - passone = 0; + FSDBG_TOP(557, vp, np, 0, 0); /* - * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the + * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the * server, but nas not been committed to stable storage on the server - * yet. On the first pass, the byte range is worked out and the commit - * rpc is done. On the second pass, nfs_writebp() is called to do the - * job. + * yet. The byte range is worked out for as many nfsbufs as we can handle + * and the commit rpc is done. */ -again: - FSDBG(518, vp->v_dirtyblkhd.lh_first, np->n_flag, 0, 0); - if (vp->v_dirtyblkhd.lh_first) + if (np->n_dirtyblkhd.lh_first) np->n_flag |= NMODIFIED; + off = (u_quad_t)-1; endoff = 0; bvecpos = 0; - if (NFS_ISV3(vp) && commit) { - s = splbio(); - /* - * Count up how many buffers waiting for a commit. 
- * This is an upper bound - any with dirty pages must be - * written not commited. - */ - bveccount = 0; - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) - == (B_DELWRI | B_NEEDCOMMIT)) - bveccount++; - FSDBG(519, bp, bp->b_flags, bveccount, 0); - } - /* - * Allocate space to remember the list of bufs to commit. It is - * important to use M_NOWAIT here to avoid a race with nfs_write - * If we can't get memory (for whatever reason), we will end up - * committing the buffers one-by-one in the loop below. - */ - if (bvec != NULL && bvec != bvec_on_stack) - _FREE(bvec, M_TEMP); - if (bveccount > NFS_COMMITBVECSIZ) { - MALLOC(bvec, struct buf **, - bveccount * sizeof(struct buf *), M_TEMP, - M_NOWAIT); - if (bvec == NULL) { - bvec = bvec_on_stack; - bvecsize = NFS_COMMITBVECSIZ; - } else - bvecsize = bveccount; - } else { - bvec = bvec_on_stack; - bvecsize = NFS_COMMITBVECSIZ; - } - FSDBG(519, 0, bvecsize, bveccount, 0); + wcred_set = 0; - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; + if (!VFSTONFS(vp->v_mount)) { + error = ENXIO; + goto done; + } + if (!NFS_ISV3(vp)) { + error = EINVAL; + goto done; + } + s = splbio(); - FSDBG(520, bp, bp->b_flags, bvecpos, bp->b_bufsize); - FSDBG(520, bp->b_validoff, bp->b_validend, - bp->b_dirtyoff, bp->b_dirtyend); - if (bvecpos >= bvecsize) - break; - if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) - != (B_DELWRI | B_NEEDCOMMIT)) - continue; + /* + * Allocate space to remember the list of bufs to commit. 
It is + * important to use M_NOWAIT here to avoid a race with nfs_write + */ + MALLOC(bvec, struct nfsbuf **, + bvecsize * sizeof(struct nfsbuf *), M_TEMP, + M_NOWAIT); + if (bvec == NULL) { + bvec = bvec_on_stack; + bvecsize = NFS_COMMITBVECSIZ; + } + for (bp = np->n_dirtyblkhd.lh_first; bp && bvecpos < bvecsize; bp = nbp) { + nbp = bp->nb_vnbufs.le_next; - bremfree(bp); - SET(bp->b_flags, B_BUSY); - /* - * we need a upl to see if the page has been - * dirtied (think mmap) since the unstable write, and - * so to prevent vm from paging during our commit rpc - */ - if (ISSET(bp->b_flags, B_PAGELIST)) { - upl = bp->b_pagelist; - } else { - kret = ubc_create_upl(vp, ubc_blktooff(vp, bp->b_lblkno), - bp->b_bufsize, &upl, - NULL, UPL_PRECIOUS); - if (kret != KERN_SUCCESS) - panic("nfs_flush: create upl %d", kret); -#ifdef UBC_DEBUG - upl_ubc_alias_set(upl, current_act(), 1); -#endif /* UBC_DEBUG */ - } - if (upl_dirty_page(ubc_upl_pageinfo(upl), 0)) { - if (!ISSET(bp->b_flags, B_PAGELIST)) { - err = ubc_upl_abort(upl, NULL); - if (err) - printf("nfs_flush: upl abort %d\n", err); - } - /* - * Any/all of it may be modified... - */ - bp->b_dirtyoff = bp->b_validoff; - bp->b_dirtyend = bp->b_validend; - CLR(bp->b_flags, B_NEEDCOMMIT); - /* blocking calls were made, re-evaluate nbp */ - nbp = bp->b_vnbufs.le_next; - brelse(bp); /* XXX may block. Is using nbp ok??? 
*/ - continue; - } - if (!ISSET(bp->b_flags, B_PAGELIST)) { - bp->b_pagelist = upl; - SET(bp->b_flags, B_PAGELIST); - ubc_upl_map(upl, (vm_address_t *)&bp->b_data); + if (((bp->nb_flags & (NB_BUSY | NB_DELWRI | NB_NEEDCOMMIT)) + != (NB_DELWRI | NB_NEEDCOMMIT))) + continue; + + nfs_buf_remfree(bp); + SET(bp->nb_flags, NB_BUSY); + /* + * we need a upl to see if the page has been + * dirtied (think mmap) since the unstable write, and + * also to prevent vm from paging it during our commit rpc + */ + if (!ISSET(bp->nb_flags, NB_PAGELIST)) { + retv = nfs_buf_upl_setup(bp); + if (retv) { + /* unable to create upl */ + /* vm object must no longer exist */ + /* this could be fatal if we need */ + /* to write the data again, we'll see... */ + printf("nfs_flushcommits: upl create failed %d\n", retv); + bp->nb_valid = bp->nb_dirty = 0; } + } + nfs_buf_upl_check(bp); - /* blocking calls were made, re-evaluate nbp */ - nbp = bp->b_vnbufs.le_next; + FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); + FSDBG(557, bp->nb_validoff, bp->nb_validend, + bp->nb_dirtyoff, bp->nb_dirtyend); - /* - * Work out if all buffers are using the same cred - * so we can deal with them all with one commit. - */ - if (wcred == NULL) - wcred = bp->b_wcred; - else if (wcred != bp->b_wcred) - wcred = NOCRED; - SET(bp->b_flags, B_WRITEINPROG); + /* + * We used to check for dirty pages here; if there were any + * we'd abort the commit and force the entire buffer to be + * written again. + * + * Instead of doing that, we now go ahead and commit the dirty + * range, and then leave the buffer around with dirty pages + * that will be written out later. + */ + + /* in case blocking calls were made, re-evaluate nbp */ + nbp = bp->nb_vnbufs.le_next; - /* - * A list of these buffers is kept so that the - * second loop knows which buffers have actually - * been committed. This is necessary, since there - * may be a race between the commit rpc and new - * uncommitted writes on the file. 
- */ - bvec[bvecpos++] = bp; - toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + - bp->b_dirtyoff; - if (toff < off) - off = toff; - toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); - if (toff > endoff) - endoff = toff; - } - splx(s); - } - if (bvecpos > 0) { /* - * Commit data on the server, as required. - * If all bufs are using the same wcred, then use that with - * one call for all of them, otherwise commit each one - * separately. + * Work out if all buffers are using the same cred + * so we can deal with them all with one commit. */ - if (wcred != NOCRED) - retv = nfs_commit(vp, off, (int)(endoff - off), - wcred, p); - else { - retv = 0; - for (i = 0; i < bvecpos; i++) { - off_t off, size; - bp = bvec[i]; - FSDBG(522, bp, bp->b_blkno * DEV_BSIZE, - bp->b_dirtyoff, bp->b_dirtyend); - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + - bp->b_dirtyoff; - size = (u_quad_t)(bp->b_dirtyend - - bp->b_dirtyoff); - retv = nfs_commit(vp, off, (int)size, - bp->b_wcred, p); - if (retv) break; - } + if (wcred_set == 0) { + wcred = bp->nb_wcred; + if (wcred == NOCRED) + panic("nfs: needcommit w/out wcred"); + wcred_set = 1; + } else if ((wcred_set == 1) && crcmp(wcred, bp->nb_wcred)) { + wcred_set = -1; } - - if (retv == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vp->v_mount); + SET(bp->nb_flags, NB_WRITEINPROG); /* - * Now, either mark the blocks I/O done or mark the - * blocks dirty, depending on whether the commit - * succeeded. + * A list of these buffers is kept so that the + * second loop knows which buffers have actually + * been committed. This is necessary, since there + * may be a race between the commit rpc and new + * uncommitted writes on the file. */ + bvec[bvecpos++] = bp; + toff = NBOFF(bp) + bp->nb_dirtyoff; + if (toff < off) + off = toff; + toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); + if (toff > endoff) + endoff = toff; + } + splx(s); + + if (bvecpos == 0) { + error = ENOBUFS; + goto done; + } + + /* + * Commit data on the server, as required. 
+ * If all bufs are using the same wcred, then use that with + * one call for all of them, otherwise commit each one + * separately. + */ + if (wcred_set == 1) + retv = nfs_commit(vp, off, (int)(endoff - off), wcred, p); + else { + retv = 0; + for (i = 0; i < bvecpos; i++) { + off_t off, size; bp = bvec[i]; - FSDBG(523, bp, retv, bp->b_flags, 0); - CLR(bp->b_flags, (B_NEEDCOMMIT | B_WRITEINPROG)); - if (retv) { - brelse(bp); - } else { - int oldflags = bp->b_flags; + off = NBOFF(bp) + bp->nb_dirtyoff; + size = (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); + retv = nfs_commit(vp, off, (int)size, bp->nb_wcred, p); + if (retv) break; + } + } + if (retv == NFSERR_STALEWRITEVERF) + nfs_clearcommit(vp->v_mount); - s = splbio(); - vp->v_numoutput++; - SET(bp->b_flags, B_ASYNC); - CLR(bp->b_flags, - (B_READ|B_DONE|B_ERROR|B_DELWRI)); - if (ISSET(oldflags, B_DELWRI)) { - extern int nbdwrite; - nbdwrite--; - wakeup((caddr_t)&nbdwrite); - } - bp->b_dirtyoff = bp->b_dirtyend = 0; - reassignbuf(bp, vp); - splx(s); - biodone(bp); + /* + * Now, either mark the blocks I/O done or mark the + * blocks dirty, depending on whether the commit + * succeeded. + */ + for (i = 0; i < bvecpos; i++) { + bp = bvec[i]; + FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); + + CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); + + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); + + if (retv) { + nfs_buf_release(bp); + } else { + s = splbio(); + vp->v_numoutput++; + + if (ISSET(bp->nb_flags, NB_DELWRI)) { + nfs_nbdwrite--; + NFSBUFCNTCHK(); + wakeup((caddr_t)&nfs_nbdwrite); + } + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + /* if block still has dirty pages, we don't want it to */ + /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. 
*/ + if (!bp->nb_dirty) + SET(bp->nb_flags, NB_ASYNC); + + /* move to clean list */ + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); + + bp->nb_dirtyoff = bp->nb_dirtyend = 0; + splx(s); + + nfs_buf_iodone(bp); + if (bp->nb_dirty) { + /* throw it back in as a delayed write buffer */ + CLR(bp->nb_flags, NB_DONE); + nfs_buf_write_delayed(bp); } } + } +done: + if (bvec != NULL && bvec != bvec_on_stack) + _FREE(bvec, M_TEMP); + FSDBG_BOT(557, vp, np, 0, error); + return (error); +} + +/* + * Flush all the blocks associated with a vnode. + * Walk through the buffer pool and push any dirty pages + * associated with the vnode. + */ +static int +nfs_flush(vp, cred, waitfor, p, commit) + register struct vnode *vp; + struct ucred *cred; + int waitfor; + struct proc *p; + int commit; +{ + struct nfsnode *np = VTONFS(vp); + struct nfsbuf *bp, *nbp; + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + int i, s, error = 0, error2, slptimeo = 0, slpflag = 0; + int passone = 1; + + FSDBG_TOP(517, vp, np, waitfor, commit); + + if (!nmp) { + error = ENXIO; + goto done; } + if (nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; + if (!commit) + passone = 0; + /* - * Start/do any write(s) that are required. There is a window here - * where B_BUSY protects the buffer. The vm pages have been freed up, - * yet B_BUSY is set. Don't think you will hit any busy/incore problems - * while we sleep, but not absolutely sure. Keep an eye on it. Otherwise - * we will have to hold vm page across this locked. - EKN + * On the first pass, commit all the bufs that can be. + * On the second pass, nfs_buf_write() is called to do the job. 
*/ -loop: - if (current_thread_aborted()) { - error = EINTR; +again: + FSDBG(518, np->n_dirtyblkhd.lh_first, np->n_flag, 0, 0); + if (np->n_dirtyblkhd.lh_first) + np->n_flag |= NMODIFIED; + if (!VFSTONFS(vp->v_mount)) { + error = ENXIO; goto done; } + if (NFS_ISV3(vp) && commit) { + /* loop while it looks like there are still buffers to be */ + /* commited and nfs_flushcommits() seems to be handling them. */ + while (np->n_needcommitcnt) + if (nfs_flushcommits(vp, p)) + break; + } + + /* Start/do any write(s) that are required. */ +loop: s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if (ISSET(bp->b_flags, B_BUSY)) { - FSDBG(524, bp, waitfor, passone, bp->b_flags); + for (bp = np->n_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->nb_vnbufs.le_next; + if (ISSET(bp->nb_flags, NB_BUSY)) { + FSDBG(524, bp, waitfor, passone, bp->nb_flags); if (waitfor != MNT_WAIT || passone) continue; - SET(bp->b_flags, B_WANTED); + SET(bp->nb_flags, NB_WANTED); error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); splx(s); if (error) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { - error = EINTR; + error2 = nfs_sigintr(VFSTONFS(vp->v_mount), + (struct nfsreq *)0, p); + if (error2) { + error = error2; goto done; } if (slpflag == PCATCH) { @@ -3537,34 +3706,45 @@ } goto loop; } - if (!ISSET(bp->b_flags, B_DELWRI)) + if (!ISSET(bp->nb_flags, NB_DELWRI)) panic("nfs_fsync: not dirty"); - FSDBG(525, bp, passone, commit, bp->b_flags); - if ((passone || !commit) && ISSET(bp->b_flags, B_NEEDCOMMIT)) + FSDBG(525, bp, passone, commit, bp->nb_flags); + if ((passone || !commit) && ISSET(bp->nb_flags, NB_NEEDCOMMIT)) + continue; + nfs_buf_remfree(bp); + if (ISSET(bp->nb_flags, NB_ERROR)) { + np->n_error = bp->nb_error ? 
bp->nb_error : EIO; + np->n_flag |= NWRITEERR; + nfs_buf_release(bp); continue; - bremfree(bp); + } if (passone || !commit) - SET(bp->b_flags, B_BUSY|B_ASYNC); - else - SET(bp->b_flags, - B_BUSY|B_ASYNC|B_WRITEINPROG|B_NEEDCOMMIT); + SET(bp->nb_flags, NB_BUSY|NB_ASYNC); + else { + /* the NB_STABLE forces this to be written FILESYNC */ + SET(bp->nb_flags, NB_BUSY|NB_ASYNC|NB_STABLE); + } splx(s); - VOP_BWRITE(bp); + nfs_buf_write(bp); goto loop; } splx(s); + if (passone) { passone = 0; goto again; } + if (waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); if (error) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { - error = EINTR; + error2 = nfs_sigintr(VFSTONFS(vp->v_mount), + (struct nfsreq *)0, p); + if (error2) { + error = error2; goto done; } if (slpflag == PCATCH) { @@ -3573,7 +3753,7 @@ } } } - if (vp->v_dirtyblkhd.lh_first && commit) { + if (np->n_dirtyblkhd.lh_first && commit) { goto loop; } } @@ -3584,8 +3764,6 @@ } done: FSDBG_BOT(517, vp, np, error, 0); - if (bvec != NULL && bvec != bvec_on_stack) - _FREE(bvec, M_TEMP); return (error); } @@ -3609,8 +3787,7 @@ } /* - * NFS advisory byte-level locks. - * Currently unsupported. + * NFS advisory byte-level locks (client) */ static int nfs_advlock(ap) @@ -3622,21 +3799,7 @@ int a_flags; } */ *ap; { -#ifdef __FreeBSD__ - register struct nfsnode *np = VTONFS(ap->a_vp); - - /* - * The following kludge is to allow diskless support to work - * until a real NFS lockd is implemented. Basically, just pretend - * that this is a local lock. 
- */ - return (lf_advlock(ap, &(np->n_lockf), np->n_size)); -#else -#if DIAGNOSTIC - printf("nfs_advlock: pid %d comm %s\n", current_proc()->p_pid, current_proc()->p_comm); -#endif - return (EOPNOTSUPP); -#endif + return (nfs_dolock(ap)); } /* @@ -3756,187 +3919,74 @@ return (EOPNOTSUPP); } -int nfs_aio_threads = 0; /* 1 per nfd (arbitrary) */ -struct slock nfs_aio_slock; -TAILQ_HEAD(bqueues, buf) nfs_aio_bufq; -int nfs_aio_bufq_len = 0; /* diagnostic only */ - -void -nfs_aio_thread() -{ /* see comment below in nfs_bwrite() for some rationale */ - struct buf *bp; - boolean_t funnel_state; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - for(;;) { - simple_lock(&nfs_aio_slock); - if ((bp = nfs_aio_bufq.tqh_first)) { - TAILQ_REMOVE(&nfs_aio_bufq, bp, b_freelist); - nfs_aio_bufq_len--; - simple_unlock(&nfs_aio_slock); - nfs_writebp(bp, 1); - } else { /* nothing to do - goodnight */ - assert_wait(&nfs_aio_bufq, THREAD_UNINT); - simple_unlock(&nfs_aio_slock); - (void)tsleep((caddr_t)0, PRIBIO+1, "nfs_aio_bufq", 0); - } - } - (void) thread_funnel_set(kernel_flock, FALSE); -} - - -void -nfs_aio_thread_init() -{ - if (nfs_aio_threads++ == 0) { - simple_lock_init(&nfs_aio_slock); - TAILQ_INIT(&nfs_aio_bufq); - } - kernel_thread(kernel_task, nfs_aio_thread); -} - - /* - * Just call nfs_writebp() with the force argument set to 1. - */ -static int -nfs_bwrite(ap) - struct vop_bwrite_args /* { - struct vnode *a_bp; - } */ *ap; -{ - extern void wakeup_one(caddr_t chan); - - /* - * nfs_writebp will issue a synchronous rpc to if B_ASYNC then - * to avoid distributed deadlocks we handoff the write to the - * nfs_aio threads. Doing so allows us to complete the - * current request, rather than blocking on a server which may - * be ourself (or blocked on ourself). - * - * Note the loopback deadlocks happened when the thread - * invoking us was nfsd, and also when it was the pagedaemon. - * - * This solution has one known problem. 
If *ALL* buffers get - * on the nfs_aio queue then no forward progress can be made - * until one of those writes complete. And if the current - * nfs_aio writes-in-progress block due to a non-responsive server we - * are in a deadlock circle. Probably the cure is to limit the - * async write concurrency in getnewbuf as in FreeBSD 3.2. - */ - if (nfs_aio_threads && ISSET(ap->a_bp->b_flags, B_ASYNC)) { - simple_lock(&nfs_aio_slock); - nfs_aio_bufq_len++; - TAILQ_INSERT_TAIL(&nfs_aio_bufq, ap->a_bp, b_freelist); - simple_unlock(&nfs_aio_slock); - wakeup_one((caddr_t)&nfs_aio_bufq); - return (0); - } - return (nfs_writebp(ap->a_bp, 1)); -} - -/* - * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless - * the force flag is one and it also handles the B_NEEDCOMMIT flag. + * write (or commit) the given NFS buffer */ int -nfs_writebp(bp, force) - register struct buf *bp; - int force; +nfs_buf_write(struct nfsbuf *bp) { int s; - register int oldflags = bp->b_flags, retv = 1; + int oldflags = bp->nb_flags, rv = 0; off_t off; - upl_t upl; - kern_return_t kret; - struct vnode *vp = bp->b_vp; - upl_page_info_t *pl; + struct vnode *vp = bp->nb_vp; + struct ucred *cr; + struct proc *p = current_proc(); - if(!ISSET(bp->b_flags, B_BUSY)) - panic("nfs_writebp: buffer is not busy???"); + FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); + + if (!ISSET(bp->nb_flags, NB_BUSY)) + panic("nfs_buf_write: buffer is not busy???"); s = splbio(); - CLR(bp->b_flags, (B_READ|B_DONE|B_ERROR|B_DELWRI)); - if (ISSET(oldflags, B_DELWRI)) { - extern int nbdwrite; - nbdwrite--; - wakeup((caddr_t)&nbdwrite); + CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); + if (ISSET(oldflags, NB_DELWRI)) { + nfs_nbdwrite--; + NFSBUFCNTCHK(); + wakeup((caddr_t)&nfs_nbdwrite); } - if (ISSET(oldflags, (B_ASYNC|B_DELWRI))) { - reassignbuf(bp, vp); + /* move to clean list */ + if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); 
+ LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); } vp->v_numoutput++; - current_proc()->p_stats->p_ru.ru_oublock++; + if (p && p->p_stats) + p->p_stats->p_ru.ru_oublock++; splx(s); - - /* - * Since the B_BUSY flag is set, we need to lock the page before doing - * nfs_commit. Otherwise we may block and get a busy incore pages - * during a vm pageout. Move the existing code up before the commit. - */ - if (!ISSET(bp->b_flags, B_META) && UBCISVALID(vp) && - !ISSET(bp->b_flags, B_PAGELIST)) { - kret = ubc_create_upl(vp, ubc_blktooff(vp, bp->b_lblkno), - bp->b_bufsize, &upl, &pl, UPL_PRECIOUS); - if (kret != KERN_SUCCESS) - panic("nfs_writebp: ubc_create_upl %d", kret); -#ifdef UBC_DEBUG - upl_ubc_alias_set(upl, current_act(), 2); -#endif /* UBC_DEBUG */ - s = splbio(); - bp->b_pagelist = upl; - SET(bp->b_flags, B_PAGELIST); - splx(s); - - kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data)); - if (kret != KERN_SUCCESS) - panic("nfs_writebp: ubc_upl_map %d", kret); - if(bp->b_data == 0) - panic("nfs_writebp: ubc_upl_map mapped 0"); - if (!upl_page_present(pl, 0)) /* even more paranoia */ - panic("nfs_writebp: nopage"); - } /* - * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not - * an actual write will have to be scheduled via. VOP_STRATEGY(). - * If B_WRITEINPROG is already set, then push it with a write anyhow. 
- */ - if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) { - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; - SET(bp->b_flags, B_WRITEINPROG); - retv = nfs_commit(vp, off, bp->b_dirtyend-bp->b_dirtyoff, - bp->b_wcred, bp->b_proc); - CLR(bp->b_flags, B_WRITEINPROG); - if (!retv) { - bp->b_dirtyoff = bp->b_dirtyend = 0; - CLR(bp->b_flags, B_NEEDCOMMIT); - biodone(bp); /* on B_ASYNC will brelse the buffer */ - - } else if (retv == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vp->v_mount); - } - if (retv) { - if (force) - SET(bp->b_flags, B_WRITEINPROG); - VOP_STRATEGY(bp); - } - - if( (oldflags & B_ASYNC) == 0) { - int rtval = biowait(bp); - - if (oldflags & B_DELWRI) { + * For async requests when nfsiod(s) are running, queue the request by + * calling nfs_asyncio(), otherwise just call nfs_doio() to do the request. + */ + if (ISSET(bp->nb_flags, NB_ASYNC)) + p = (struct proc *)0; + if (ISSET(bp->nb_flags, NB_READ)) + cr = bp->nb_rcred; + else + cr = bp->nb_wcred; + if (!ISSET(bp->nb_flags, NB_ASYNC) || nfs_asyncio(bp, NOCRED)) + rv = nfs_doio(bp, cr, p); + + if ((oldflags & NB_ASYNC) == 0) { + rv = nfs_buf_iowait(bp); + /* move to clean list */ + if (oldflags & NB_DELWRI) { s = splbio(); - reassignbuf(bp, vp); + if (bp->nb_vnbufs.le_next != NFSNOLIST) + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&VTONFS(vp)->n_cleanblkhd, bp, nb_vnbufs); splx(s); } - brelse(bp); - return (rtval); + FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, rv); + nfs_buf_release(bp); + return (rv); } - return (0); + FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, rv); + return (rv); } /* @@ -3967,7 +4017,7 @@ * unless the file is a socket, fifo, or a block or character * device resident on the filesystem.
*/ - if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { + if ((mode & VWRITE) && vp->v_mount && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); @@ -4015,13 +4065,15 @@ } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); + struct timeval now; /* * Set access flag. */ np->n_flag |= NACC; - np->n_atim.tv_sec = time.tv_sec; - np->n_atim.tv_nsec = time.tv_usec * 1000; + microtime(&now); + np->n_atim.tv_sec = now.tv_sec; + np->n_atim.tv_nsec = now.tv_usec * 1000; return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); } @@ -4038,13 +4090,15 @@ } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); + struct timeval now; /* * Set update flag. */ np->n_flag |= NUPD; - np->n_mtim.tv_sec = time.tv_sec; - np->n_mtim.tv_nsec = time.tv_usec * 1000; + microtime(&now); + np->n_mtim.tv_sec = now.tv_sec; + np->n_mtim.tv_nsec = now.tv_usec * 1000; return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); } @@ -4068,7 +4122,7 @@ if (np->n_flag & (NACC | NUPD)) { np->n_flag |= NCHG; - if (vp->v_usecount == 1 && + if (vp->v_usecount == 1 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) @@ -4095,13 +4149,15 @@ { extern vop_t **fifo_vnodeop_p; register struct nfsnode *np = VTONFS(ap->a_vp); + struct timeval now; /* * Set access flag. */ np->n_flag |= NACC; - np->n_atim.tv_sec = time.tv_sec; - np->n_atim.tv_nsec = time.tv_usec * 1000; + microtime(&now); + np->n_atim.tv_sec = now.tv_sec; + np->n_atim.tv_nsec = now.tv_usec * 1000; return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); } @@ -4119,13 +4175,15 @@ { extern vop_t **fifo_vnodeop_p; register struct nfsnode *np = VTONFS(ap->a_vp); + struct timeval now; /* * Set update flag. 
*/ np->n_flag |= NUPD; - np->n_mtim.tv_sec = time.tv_sec; - np->n_mtim.tv_nsec = time.tv_usec * 1000; + microtime(&now); + np->n_mtim.tv_sec = now.tv_sec; + np->n_mtim.tv_nsec = now.tv_usec * 1000; return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); } @@ -4146,19 +4204,21 @@ register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; + struct timeval now; extern vop_t **fifo_vnodeop_p; if (np->n_flag & (NACC | NUPD)) { + microtime(&now); if (np->n_flag & NACC) { - np->n_atim.tv_sec = time.tv_sec; - np->n_atim.tv_nsec = time.tv_usec * 1000; + np->n_atim.tv_sec = now.tv_sec; + np->n_atim.tv_nsec = now.tv_usec * 1000; } if (np->n_flag & NUPD) { - np->n_mtim.tv_sec = time.tv_sec; - np->n_mtim.tv_nsec = time.tv_usec * 1000; + np->n_mtim.tv_sec = now.tv_sec; + np->n_mtim.tv_nsec = now.tv_usec * 1000; } np->n_flag |= NCHG; - if (vp->v_usecount == 1 && + if (vp->v_usecount == 1 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) @@ -4194,7 +4254,6 @@ return (1); } -/* XXX Eliminate use of struct bp here */ /* * Vnode op for pagein using getblk_pages * derived from nfs_bioread() @@ -4219,21 +4278,20 @@ vm_offset_t pl_offset = ap->a_pl_offset; int flags = ap->a_flags; struct ucred *cred; - register struct nfsnode *np = VTONFS(vp); - register int biosize; - register int iosize; - register int xsize; + struct nfsnode *np = VTONFS(vp); + int biosize, xsize, iosize; struct vattr vattr; struct proc *p = current_proc(); - struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct nfsmount *nmp; int error = 0; vm_offset_t ioaddr; struct uio auio; struct iovec aiov; struct uio * uio = &auio; int nofreeupl = flags & UPL_NOCOMMIT; + upl_page_info_t *plinfo; - FSDBG(322, f_offset, size, pl, pl_offset); + FSDBG(322, vp, f_offset, size, flags); if (pl == (upl_t)NULL) panic("nfs_pagein: no upl"); @@ -4251,8 +4309,7 @@ (void) ubc_upl_abort(pl, NULL); return (EINVAL); } - if (f_offset < 0 || 
f_offset >= np->n_size || - (f_offset & PAGE_MASK_64)) { + if (f_offset < 0 || f_offset >= np->n_size || (f_offset & PAGE_MASK_64)) { if (!nofreeupl) ubc_upl_abort_range(pl, pl_offset, size, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); @@ -4267,27 +4324,38 @@ auio.uio_rw = UIO_READ; auio.uio_procp = NULL; - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + nmp = VFSTONFS(vp->v_mount); + if (!nmp) { + if (!nofreeupl) + ubc_upl_abort_range(pl, pl_offset, size, + UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY); + return (ENXIO); + } + if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); - biosize = min(vp->v_mount->mnt_stat.f_iosize, size); - - if (biosize & PAGE_MASK) - panic("nfs_pagein(%x): biosize not page aligned", biosize); + biosize = vp->v_mount->mnt_stat.f_iosize; + plinfo = ubc_upl_pageinfo(pl); ubc_upl_map(pl, &ioaddr); ioaddr += pl_offset; xsize = size; do { + /* + * It would be nice to be able to issue all these requests + * in parallel instead of waiting for each one to complete + * before sending the next one. + * XXX Should we align these requests to block boundaries? + */ iosize = min(biosize, xsize); uio->uio_resid = iosize; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; aiov.iov_len = iosize; aiov.iov_base = (caddr_t)ioaddr; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; FSDBG(322, uio->uio_offset, uio->uio_resid, ioaddr, xsize); -#warning our nfs_pagein does not support NQNFS +// XXX #warning our nfs_pagein does not support NQNFS /* * With UBC we get here only when the file data is not in the VM * page cache, so go ahead and read in. 
@@ -4319,7 +4387,8 @@ } else FSDBG(322, uio->uio_offset, uio->uio_resid, error, -1); - if (p && (vp->v_flag & VTEXT) && + nmp = VFSTONFS(vp->v_mount); + if (p && (vp->v_flag & VTEXT) && nmp && ((nmp->nm_flag & NFSMNT_NQNFS && NQNFS_CKINVALID(vp, np, ND_READ) && np->n_lrev != np->n_brev) || @@ -4372,11 +4441,10 @@ vm_offset_t pl_offset = ap->a_pl_offset; int flags = ap->a_flags; int ioflag = ap->a_flags; - register int biosize; struct proc *p = current_proc(); struct nfsnode *np = VTONFS(vp); register struct ucred *cred; - struct buf *bp; + struct nfsbuf *bp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; int n = 0, on, error = 0, iomode, must_commit, s; @@ -4384,10 +4452,8 @@ vm_offset_t ioaddr; struct uio auio; struct iovec aiov; - struct uio * uio = &auio; int nofreeupl = flags & UPL_NOCOMMIT; - int iosize; - int pgsize; + int biosize, iosize, pgsize, xsize; FSDBG(323, f_offset, size, pl, pl_offset); @@ -4397,7 +4463,7 @@ if (UBCINVALID(vp)) { printf("nfs_pageout: invalid vnode 0x%x", (int)vp); if (!nofreeupl) - (void) ubc_upl_abort(pl, NULL); + ubc_upl_abort(pl, 0); return (EIO); } UBCINFOCHECK("nfs_pageout", vp); @@ -4405,42 +4471,90 @@ if (size <= 0) { printf("nfs_pageout: invalid size %d", size); if (!nofreeupl) - (void) ubc_upl_abort(pl, NULL); + ubc_upl_abort(pl, 0); return (EINVAL); } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. 
- */ - biosize = min(vp->v_mount->mnt_stat.f_iosize, size); - - if (biosize & PAGE_MASK) - panic("nfs_pageout(%x): biosize not page aligned", biosize); + if (!nmp) { + if (!nofreeupl) + ubc_upl_abort(pl, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); + return (ENXIO); + } + biosize = vp->v_mount->mnt_stat.f_iosize; /* - * Check to see whether the buffer is incore - * If incore and not busy invalidate it from the cache - * we should not find it BUSY, since we always do a - * vm_fault_list_request in 'getblk' before returning - * which would block on the page busy status + * Check to see whether the buffer is incore. + * If incore and not busy, invalidate it from the cache. */ - lbn = f_offset / PAGE_SIZE; /* to match the size getblk uses */ - - for (iosize = size; iosize > 0; iosize -= PAGE_SIZE, lbn++) { + for (iosize = 0; iosize < size; iosize += xsize) { + off = f_offset + iosize; + /* need to make sure we do things on block boundaries */ + xsize = biosize - (off % biosize); + if (off + xsize > f_offset + size) + xsize = f_offset + size - off; + lbn = ubc_offtoblk(vp, off); s = splbio(); - if (bp = incore(vp, lbn)) { - FSDBG(323, lbn*PAGE_SIZE, 1, bp, bp->b_flags); - if (ISSET(bp->b_flags, B_BUSY)) { + if (bp = nfs_buf_incore(vp, lbn)) { + FSDBG(323, off, 1, bp, bp->nb_flags); + if (ISSET(bp->nb_flags, NB_BUSY)) { /* no panic. just tell vm we are busy */ if (!nofreeupl) - (void) ubc_upl_abort(pl, NULL); - return(EBUSY); + ubc_upl_abort(pl, 0); + return (EBUSY); + } + if (bp->nb_dirtyend > 0) { + /* + * if there's a dirty range in the buffer, check to + * see if it extends beyond the pageout region + * + * if the dirty region lies completely within the + * pageout region, we just invalidate the buffer + * because it's all being written out now anyway. + * + * if any of the dirty region lies outside the + * pageout region, we'll try to clip the dirty + * region to eliminate the portion that's being + * paged out.
If that's not possible, because + * the dirty region extends before and after the + * pageout region, then we'll just return EBUSY. + */ + off_t boff, start, end; + boff = NBOFF(bp); + start = off; + end = off + xsize; + /* clip end to EOF */ + if (end > np->n_size) + end = np->n_size; + start -= boff; + end -= boff; + if ((bp->nb_dirtyoff < start) && + (bp->nb_dirtyend > end)) { + /* not gonna be able to clip the dirty region */ + FSDBG(323, vp, bp, 0xd00deebc, EBUSY); + if (!nofreeupl) + ubc_upl_abort(pl, 0); + return (EBUSY); + } + if ((bp->nb_dirtyoff < start) || + (bp->nb_dirtyend > end)) { + /* clip dirty region, if necessary */ + if (bp->nb_dirtyoff < start) + bp->nb_dirtyend = min(bp->nb_dirtyend, start); + if (bp->nb_dirtyend > end) + bp->nb_dirtyoff = max(bp->nb_dirtyoff, end); + FSDBG(323, bp, bp->nb_dirtyoff, bp->nb_dirtyend, 0xd00dee00); + /* we're leaving this block dirty */ + continue; + } + } + nfs_buf_remfree(bp); + SET(bp->nb_flags, (NB_BUSY | NB_INVAL)); + if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { + CLR(bp->nb_flags, NB_NEEDCOMMIT); + np->n_needcommitcnt--; + CHECK_NEEDCOMMITCNT(np); } - bremfree(bp); - SET(bp->b_flags, (B_BUSY | B_INVAL)); - brelse(bp); + nfs_buf_release(bp); } splx(s); } @@ -4456,11 +4570,12 @@ UPL_ABORT_FREE_ON_EMPTY); return (np->n_error); } - if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) + if ((nmp->nm_flag & NFSMNT_NFSV3) && + !(nmp->nm_state & NFSSTA_GOTFSINFO)) (void)nfs_fsinfo(nmp, vp, cred, p); if (f_offset < 0 || f_offset >= np->n_size || - f_offset & PAGE_MASK_64 || size & PAGE_MASK) { + f_offset & PAGE_MASK_64 || size & PAGE_MASK_64) { if (!nofreeupl) ubc_upl_abort_range(pl, pl_offset, size, UPL_ABORT_FREE_ON_EMPTY); @@ -4468,30 +4583,21 @@ } ubc_upl_map(pl, &ioaddr); + ioaddr += pl_offset; if (f_offset + size > np->n_size) - iosize = np->n_size - f_offset; + xsize = np->n_size - f_offset; else - iosize = size; - - pgsize = (iosize + (PAGE_SIZE - 1)) & ~PAGE_MASK; + xsize = size; + pgsize = 
round_page_64(xsize); if (size > pgsize) { if (!nofreeupl) ubc_upl_abort_range(pl, pl_offset + pgsize, size - pgsize, UPL_ABORT_FREE_ON_EMPTY); } - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = f_offset; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_resid = iosize; - auio.uio_procp = NULL; - aiov.iov_len = iosize; - aiov.iov_base = (caddr_t)ioaddr + pl_offset; /* * check for partial page and clear the * contents past end of the file before @@ -4499,45 +4605,47 @@ */ if (f_offset < np->n_size && f_offset + size > np->n_size) { size_t io = np->n_size - f_offset; - - bzero((caddr_t)(ioaddr + pl_offset + io), size - io); - + bzero((caddr_t)(ioaddr + io), size - io); FSDBG(321, np->n_size, f_offset, f_offset + io, size - io); } + auio.uio_offset = f_offset; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = NULL; + do { -#warning our nfs_pageout does not support NQNFS + /* + * It would be nice to be able to issue all these requests + * in parallel instead of waiting for each one to complete + * before sending the next one. + * XXX Should we align these requests to block boundaries? 
+ */ + iosize = min(biosize, xsize); + auio.uio_resid = iosize; + aiov.iov_len = iosize; + aiov.iov_base = (caddr_t)ioaddr; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + + FSDBG(323, auio.uio_offset, auio.uio_resid, ioaddr, xsize); +// XXX #warning our nfs_pageout does not support NQNFS nfsstats.pageouts++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); - n = min((unsigned)(biosize - on), uio->uio_resid); -again: -#if 0 - /* (removed for UBC) */ - bufsize = biosize; - if ((off_t)(lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - (off_t)lbn * biosize; - bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); - } -#endif + vp->v_numoutput++; /* NMODIFIED would be set here if doing unstable writes */ iomode = NFSV3WRITE_FILESYNC; - error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); + error = nfs_writerpc(vp, &auio, cred, &iomode, &must_commit); if (must_commit) nfs_clearcommit(vp->v_mount); vpwakeup(vp); - if (error) goto cleanup; - - if (n > 0) { - uio->uio_resid -= n; - uio->uio_offset += n; - uio->uio_iov->iov_base += n; - uio->uio_iov->iov_len -= n; - } - } while (uio->uio_resid > 0 && n > 0); + /* Note: no need to check uio_resid, because */ + /* it'll only be set if there was an error. 
*/ + ioaddr += iosize; + xsize -= iosize; + } while (xsize > 0); cleanup: ubc_upl_unmap(pl); @@ -4619,9 +4727,12 @@ int biosize; register struct vnode *vp = ap->a_vp; - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */ + if (!vp->v_mount) + return (ENXIO); + + biosize = vp->v_mount->mnt_stat.f_iosize; - *ap->a_offset = (off_t)ap->a_lblkno * biosize; + *ap->a_offset = (off_t)ap->a_lblkno * biosize; return (0); } @@ -4637,9 +4748,12 @@ int biosize; register struct vnode *vp = ap->a_vp; - biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */ + if (!vp->v_mount) + return (ENXIO); + + biosize = vp->v_mount->mnt_stat.f_iosize; - *ap->a_lblkno = (daddr_t)(ap->a_offset / biosize); + *ap->a_lblkno = (daddr_t)(ap->a_offset / biosize); return (0); } diff -urN xnu-344.49/bsd/nfs/nfsm_subs.h xnu-517/bsd/nfs/nfsm_subs.h --- xnu-344.49/bsd/nfs/nfsm_subs.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfsm_subs.h Sat Oct 25 00:25:55 2003 @@ -334,7 +334,12 @@ */ #define nfsm_request(v, t, p, c, x) \ { \ - int nfsv3 = (VFSTONFS((v)->v_mount))->nm_flag & NFSMNT_NFSV3; \ + int nfsv3; \ + if (!VFSTONFS((v)->v_mount)) { \ + error = ENXIO; \ + goto nfsmout; \ + } \ + nfsv3 = (VFSTONFS((v)->v_mount))->nm_flag & NFSMNT_NFSV3; \ if ((error = nfs_request((v), mreq, (t), (p), \ (c), &mrep, &md, &dpos, (x)))) { \ if (error & NFSERR_RETERR) \ @@ -342,11 +347,6 @@ else \ goto nfsmout; \ } \ - else if ((v)->v_type==VBAD) { \ - error = EINVAL; \ - if (!nfsv3) \ - goto nfsmout; \ - } \ } #define nfsm_strtom(a,s,m) \ @@ -446,7 +446,9 @@ nfsm_srvpostopattr(nfsd, (r), (a), &mb, &bpos) #define nfsm_srvsattr(a) \ - { nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ + { \ + struct timeval now; \ + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ if (*tl == nfs_true) { \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ (a)->va_mode = nfstov_mode(*tl); \ @@ -467,14 +469,15 @@ fxdr_hyper(tl, &(a)->va_size); \ } \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ + microtime(&now); 
\ switch (fxdr_unsigned(int, *tl)) { \ case NFSV3SATTRTIME_TOCLIENT: \ nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); \ fxdr_nfsv3time(tl, &(a)->va_atime); \ break; \ case NFSV3SATTRTIME_TOSERVER: \ - (a)->va_atime.tv_sec = time.tv_sec; \ - (a)->va_atime.tv_nsec = time.tv_usec * 1000; \ + (a)->va_atime.tv_sec = now.tv_sec; \ + (a)->va_atime.tv_nsec = now.tv_usec * 1000; \ break; \ }; \ nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); \ @@ -484,8 +487,8 @@ fxdr_nfsv3time(tl, &(a)->va_mtime); \ break; \ case NFSV3SATTRTIME_TOSERVER: \ - (a)->va_mtime.tv_sec = time.tv_sec; \ - (a)->va_mtime.tv_nsec = time.tv_usec * 1000; \ + (a)->va_mtime.tv_sec = now.tv_sec; \ + (a)->va_mtime.tv_nsec = now.tv_usec * 1000; \ break; \ }; } diff -urN xnu-344.49/bsd/nfs/nfsmount.h xnu-517/bsd/nfs/nfsmount.h --- xnu-344.49/bsd/nfs/nfsmount.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfsmount.h Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -76,6 +76,7 @@ */ struct nfsmount { int nm_flag; /* Flags for soft/hard... */ + int nm_state; /* Internal state flags */ struct mount *nm_mountp; /* Vfs structure for this filesystem */ int nm_numgrps; /* Max. 
size of groupslist */ struct vnode *nm_dvp; /* root directory vnode pointer */ @@ -110,17 +111,28 @@ int nm_numuids; /* Number of nfsuid mappings */ TAILQ_HEAD(, nfsuid) nm_uidlruhead; /* Lists of nfsuid mappings */ LIST_HEAD(, nfsuid) nm_uidhashtbl[NFS_MUIDHASHSIZ]; - TAILQ_HEAD(, buf) nm_bufq; /* async io buffer queue */ + TAILQ_HEAD(, nfsbuf) nm_bufq; /* async io buffer queue */ short nm_bufqlen; /* number of buffers in queue */ short nm_bufqwant; /* process wants to add to the queue */ int nm_bufqiods; /* number of iods processing queue */ + int nm_tprintf_initial_delay; /* delay first "server down" */ + int nm_tprintf_delay; /* delay between "server down" */ }; + #if defined(KERNEL) /* * Convert mount ptr to nfsmount ptr. */ -#define VFSTONFS(mp) ((struct nfsmount *)((mp)->mnt_data)) +#define VFSTONFS(mp) ((mp) ? ((struct nfsmount *)((mp)->mnt_data)) : NULL) + +#ifndef NFS_TPRINTF_INITIAL_DELAY +#define NFS_TPRINTF_INITIAL_DELAY 12 +#endif + +#ifndef NFS_TPRINTF_DELAY +#define NFS_TPRINTF_DELAY 30 +#endif #endif /* KERNEL */ diff -urN xnu-344.49/bsd/nfs/nfsnode.h xnu-517/bsd/nfs/nfsnode.h --- xnu-344.49/bsd/nfs/nfsnode.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfsnode.h Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -103,6 +103,96 @@ }; /* + * The nfsbuf is the nfs equivalent to a struct buf. + */ +struct nfsbuf { + LIST_ENTRY(nfsbuf) nb_hash; /* hash chain */ + LIST_ENTRY(nfsbuf) nb_vnbufs; /* vnode's nfsbuf chain */ + TAILQ_ENTRY(nfsbuf) nb_free; /* free list position if not active. */ + volatile long nb_flags; /* NB_* flags. */ + long nb_bufsize; /* buffer size */ + daddr_t nb_lblkno; /* logical block number. */ + int nb_error; /* errno value. 
*/ + u_int32_t nb_valid; /* valid pages in buf */ + u_int32_t nb_dirty; /* dirty pages in buf */ + int nb_validoff; /* offset in buffer of valid region. */ + int nb_validend; /* offset of end of valid region. */ + int nb_dirtyoff; /* offset in buffer of dirty region. */ + int nb_dirtyend; /* offset of end of dirty region. */ + caddr_t nb_data; /* mapped buffer */ + struct vnode * nb_vp; /* device vnode */ + struct proc * nb_proc; /* associated proc; NULL if kernel. */ + struct ucred * nb_rcred; /* read credentials reference */ + struct ucred * nb_wcred; /* write credentials reference */ + void * nb_pagelist; /* upl */ +}; + +/* + * These flags are kept in nb_flags and they're (purposefully) + * very similar to the B_* flags for struct buf. + */ +#define NB_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ +#define NB_ASYNC 0x00000004 /* Start I/O, do not wait. */ +#define NB_BUSY 0x00000010 /* I/O in progress. */ +#define NB_CACHE 0x00000020 /* Bread found us in the cache. */ +#define NB_STABLE 0x00000040 /* write FILESYNC not UNSTABLE. */ +#define NB_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ +#define NB_DONE 0x00000200 /* I/O completed. */ +#define NB_EINTR 0x00000400 /* I/O was interrupted */ +#define NB_ERROR 0x00000800 /* I/O error occurred. */ +#define NB_WASDIRTY 0x00001000 /* page was found dirty in the VM cache */ +#define NB_INVAL 0x00002000 /* Does not contain valid info. */ +#define NB_NOCACHE 0x00008000 /* Do not cache block after use. */ +#define NB_READ 0x00100000 /* Read buffer. */ +#define NB_PAGELIST 0x00400000 /* Buffer describes pagelist I/O. */ +#define NB_WANTED 0x00800000 /* Process wants this buffer. */ +#define NB_WRITE 0x00000000 /* Write buffer (pseudo flag). */ +#define NB_WRITEINPROG 0x01000000 /* Write in progress. */ +#define NB_META 0x40000000 /* buffer contains meta-data. */ +#define NB_IOD 0x80000000 /* buffer being handled by nfsiod. 
*/ + + +#define NBOFF(BP) ((off_t)(BP)->nb_lblkno * (off_t)(BP)->nb_bufsize) +#define NBPGVALID(BP,P) (((BP)->nb_valid >> (P)) & 0x1) +#define NBPGDIRTY(BP,P) (((BP)->nb_dirty >> (P)) & 0x1) +#define NBPGVALID_SET(BP,P) ((BP)->nb_valid |= (1 << (P))) +#define NBPGDIRTY_SET(BP,P) ((BP)->nb_dirty |= (1 << (P))) + +#define NFS_BUF_MAP(BP) \ + do { \ + if (!(BP)->nb_data && nfs_buf_map(BP)) \ + panic("nfs_buf_map failed"); \ + } while (0) + +LIST_HEAD(nfsbuflists, nfsbuf); +TAILQ_HEAD(nfsbuffreehead, nfsbuf); + +#define NFSNOLIST ((struct nfsbuf *)0xdeadbeef) + +extern int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax; +extern int nfsbuffreecnt, nfsbufdelwricnt, nfsneedbuffer; +extern int nfs_nbdwrite; +extern struct nfsbuffreehead nfsbuffree, nfsbufdelwri; + +#define NFSBUFCNTCHK() \ + do { \ + if ( (nfsbufcnt < 0) || \ + (nfsbufcnt > nfsbufmax) || \ + (nfsbuffreecnt < 0) || \ + (nfsbuffreecnt > nfsbufmax) || \ + (nfsbuffreecnt > nfsbufcnt) || \ + (nfsbufdelwricnt < 0) || \ + (nfsbufdelwricnt > nfsbufmax) || \ + (nfsbufdelwricnt > nfsbufcnt) || \ + (nfs_nbdwrite < 0) || \ + (nfs_nbdwrite > nfsbufcnt) || \ + 0) \ + panic("nfsbuf count error: max %d cnt %d free %d delwr %d bdw %d\n", \ + nfsbufmax, nfsbufcnt, nfsbuffreecnt, \ + nfsbufdelwricnt, nfs_nbdwrite); \ + } while (0) + +/* * The nfsnode is the nfs equivalent to ufs's inode. Any similarity * is purely coincidental. * There is a unique nfsnode allocated for each active file, @@ -131,7 +221,10 @@ time_t n_ctime; /* Prev create time. */ time_t n_expiry; /* Lease expiry time */ nfsfh_t *n_fhp; /* NFS File Handle */ - struct vnode *n_vnode; /* associated vnode */ + union { + struct vnode *n_vp; /* associated vnode */ + struct mount *n_mp; /* associated mount (NINIT) */ + } n_un0; struct lockf *n_lockf; /* Locking record of file */ int n_error; /* Save write error value */ union { @@ -150,8 +243,21 @@ short n_flag; /* Flag for locking.. 
*/ nfsfh_t n_fh; /* Small File Handle */ u_int64_t n_xid; /* last xid to loadattr */ + struct nfsbuflists n_cleanblkhd; /* clean blocklist head */ + struct nfsbuflists n_dirtyblkhd; /* dirty blocklist head */ + int n_needcommitcnt;/* # bufs that need committing */ }; +#define CHECK_NEEDCOMMITCNT(np) \ + do { \ + if ((np)->n_needcommitcnt < 0) { \ + printf("nfs: n_needcommitcnt negative\n"); \ + (np)->n_needcommitcnt = 0; \ + } \ + } while (0) + +#define n_vnode n_un0.n_vp +#define n_mount n_un0.n_mp #define n_atim n_un1.nf_atim #define n_mtim n_un2.nf_mtim #define n_sillyrename n_un3.nf_silly @@ -172,8 +278,9 @@ #define NACC 0x0100 /* Special file accessed */ #define NUPD 0x0200 /* Special file updated */ #define NCHG 0x0400 /* Special file times changed */ -#define NLOCKED 0x0800 /* node is locked */ -#define NWANTED 0x0100 /* someone wants to lock */ +#define NHASHED 0x1000 /* someone wants to lock */ +#define NINIT 0x2000 /* node is being initialized */ +#define NWINIT 0x4000 /* someone waiting for init to complete */ /* * Convert between nfsnode pointers and vnode pointers @@ -204,7 +311,6 @@ int nqnfs_vop_lease_check __P((struct vop_lease_args *)); #define nfs_revoke vop_revoke #define nfs_seek ((int (*) __P((struct vop_seek_args *)))nullop) -int nfs_abortop __P((struct vop_abortop_args *)); int nfs_inactive __P((struct vop_inactive_args *)); int nfs_reclaim __P((struct vop_reclaim_args *)); int nfs_lock __P((struct vop_lock_args *)); @@ -221,6 +327,18 @@ void nfs_invaldir __P((struct vnode *)); #define nqnfs_lease_updatetime lease_updatetime + +/* nfsbuf functions */ +void nfs_nbinit(void); +void nfs_buf_remfree(struct nfsbuf *); +struct nfsbuf * nfs_buf_incore(struct vnode *, daddr_t); +struct nfsbuf * nfs_buf_get(struct vnode *, daddr_t, int, struct proc *, int); +int nfs_buf_upl_setup(struct nfsbuf *bp); +void nfs_buf_upl_check(struct nfsbuf *bp); +void nfs_buf_release(struct nfsbuf *); +int nfs_buf_iowait(struct nfsbuf *); +void nfs_buf_iodone(struct 
nfsbuf *); +void nfs_buf_write_delayed(struct nfsbuf *); #endif /* KERNEL */ diff -urN xnu-344.49/bsd/nfs/nfsproto.h xnu-517/bsd/nfs/nfsproto.h --- xnu-344.49/bsd/nfs/nfsproto.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/nfs/nfsproto.h Sat Oct 25 00:25:55 2003 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -88,12 +88,13 @@ #define NFS_VER3 3 #define NFS_V2MAXDATA 8192 #define NFS_MAXDGRAMDATA 16384 -#define NFS_MAXDATA 32768 +#define NFS_MAXDATA (60*1024) // XXX not ready for 64K-128K #define NFS_MAXPATHLEN 1024 #define NFS_MAXNAMLEN 255 #define NFS_MAXPKTHDR 404 #define NFS_MAXPACKET (NFS_MAXPKTHDR + NFS_MAXDATA) #define NFS_MINPACKET 20 +#define NFS_MAXSOCKBUF (224*1024) #define NFS_FABLKSIZE 512 /* Size in bytes of a block wrt fa_blocks */ /* Stat numbers for rpc returns (version 2 and 3) */ diff -urN xnu-344.49/bsd/nfs/nlminfo.h xnu-517/bsd/nfs/nlminfo.h --- xnu-344.49/bsd/nfs/nlminfo.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/nfs/nlminfo.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI nlminfo.h,v 2.1 1998/03/18 01:30:38 don Exp + * $FreeBSD: src/sys/nfsclient/nlminfo.h,v 1.1 2001/04/17 20:45:22 alfred Exp $ + */ + +#include <sys/appleapiopts.h> + +#ifdef __APPLE_API_PRIVATE + +/* + * Misc NLM information, some needed for the master lockd process, and some + * needed by every process doing nlm based locking.
+ */ +struct nlminfo { + /* these are used by any process doing nlm locking */ + int msg_seq; /* sequence counter for lock requests */ + int retcode; /* return code for lock requests */ + int set_getlk; + int getlk_pid; + off_t getlk_start; + off_t getlk_len; + struct timeval pid_start; /* process starting time */ + struct timeval nlm_lockstart; /* XXX debug */ +}; + +extern void nlminfo_release(struct proc *p); +#endif /* __APPLE_API_PRIVATE */ diff -urN xnu-344.49/bsd/ppc/param.h xnu-517/bsd/ppc/param.h --- xnu-344.49/bsd/ppc/param.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/ppc/param.h Sat Oct 25 00:25:55 2003 @@ -113,7 +113,7 @@ #define bdbtofsb(bn) ((bn) / (BLKDEV_IOSIZE/DEV_BSIZE)) /* from machdep/ppc/proc_reg.h */ -#if __BIG_ENDIAN__ +#ifdef __BIG_ENDIAN__ #define ENDIAN_MASK(val,size) (1 << (size-1 - val)) #else #error code not ported to little endian targets yet diff -urN xnu-344.49/bsd/ppc/ucontext.h xnu-517/bsd/ppc/ucontext.h --- xnu-344.49/bsd/ppc/ucontext.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/ppc/ucontext.h Sat Oct 25 00:25:55 2003 @@ -40,4 +40,14 @@ typedef struct mcontext * mcontext_t; +struct mcontext64 { + ppc_exception_state_t es; + ppc_thread_state64_t ss; + ppc_float_state_t fs; + ppc_vector_state_t vs; +}; +#define PPC_MCONTEXT64_SIZE ((PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int)) /* fully parenthesized: safe as an operand of a larger expression */ + +typedef struct mcontext64 * mcontext64_t; + #endif /* _PPC_UCONTEXT_H_ */ diff -urN xnu-344.49/bsd/ppc/vmparam.h xnu-517/bsd/ppc/vmparam.h --- xnu-344.49/bsd/ppc/vmparam.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/ppc/vmparam.h Sat Oct 25 00:25:55 2003 @@ -40,7 +40,7 @@ #define MAXDSIZ (RLIM_INFINITY) /* max data size */ #endif #ifndef DFLSSIZ -#define DFLSSIZ (512*1024) /* initial stack size limit */ +#define DFLSSIZ (8*1024*1024) /* initial stack size limit */ #endif #ifndef MAXSSIZ #define MAXSSIZ (64*1024*1024) /* max stack size */ diff -urN xnu-344.49/bsd/sys/Makefile
xnu-517/bsd/sys/Makefile --- xnu-344.49/bsd/sys/Makefile Thu Sep 18 03:15:26 2003 +++ xnu-517/bsd/sys/Makefile Tue Oct 21 21:24:55 2003 @@ -20,10 +20,11 @@ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ - appleapiopts.h \ - acct.h attr.h buf.h callout.h cdefs.h clist.h conf.h \ + appleapiopts.h acct.h aio.h attr.h \ + audit.h bsm_kevents.h bsm_token.h bsm_uevents.h \ + buf.h callout.h cdefs.h clist.h conf.h \ dir.h dirent.h disk.h disklabel.h disktab.h dkstat.h dmap.h domain.h \ - errno.h ev.h exec.h fcntl.h file.h filedesc.h filio.h gmon.h ioccom.h ioctl.h \ + errno.h ev.h event.h exec.h fcntl.h file.h filedesc.h filio.h gmon.h ioccom.h ioctl.h \ ioctl_compat.h ipc.h kernel.h kern_event.h ktrace.h loadable_fs.h lock.h lockf.h mach_swapon.h malloc.h \ kdebug.h linker_set.h md5.h kern_control.h \ mbuf.h mman.h mount.h msgbuf.h mtio.h namei.h netport.h param.h paths.h \ @@ -32,7 +33,7 @@ syscall.h sysctl.h syslimits.h syslog.h systm.h sys_domain.h termios.h time.h \ timeb.h times.h tprintf.h trace.h tty.h ttychars.h ttycom.h \ ttydefaults.h ttydev.h types.h ubc.h ucontext.h ucred.h uio.h un.h unistd.h unpcb.h \ - user.h utfconv.h utsname.h ux_exception.h vadvise.h vcmd.h version.h vlimit.h \ + user.h utfconv.h utsname.h ux_exception.h vadvise.h vcmd.h version.h \ vm.h vmmeter.h vmparam.h vnioctl.h vnode.h vnode_if.h vstat.h wait.h INSTALL_MI_LIST = ${DATAFILES} diff -urN xnu-344.49/bsd/sys/aio.h xnu-517/bsd/sys/aio.h --- xnu-344.49/bsd/sys/aio.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/sys/aio.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * File: sys/aio.h + * Author: Umesh Vaishampayan [umeshv@apple.com] + * 05-Feb-2003 umeshv Created. + * + * Header file for POSIX Asynchronous IO APIs + * + */ + +#ifndef _SYS_AIO_H_ +#define _SYS_AIO_H_ + +#include <sys/signal.h> + +struct aiocb { + int aio_fildes; /* File descriptor */ + off_t aio_offset; /* File offset */ + volatile void *aio_buf; /* Location of buffer */ + size_t aio_nbytes; /* Length of transfer */ + int aio_reqprio; /* Request priority offset */ + struct sigevent aio_sigevent; /* Signal number and value */ + int aio_lio_opcode; /* Operation to be performed */ +}; + +/* + * aio_cancel() return values + */ + +/* + * none of the requested operations could be canceled since they are + * already complete.
+ */ +#define AIO_ALLDONE 0x1 + +/* all requested operations have been canceled */ +#define AIO_CANCELED 0x2 + +/* + * some of the requested operations could not be canceled since + * they are in progress + */ +#define AIO_NOTCANCELED 0x4 + + +/* + * lio_listio operation options + */ + +#define LIO_NOP 0x0 /* option indicating that no transfer is requested */ +#define LIO_READ 0x1 /* option requesting a read */ +#define LIO_WRITE 0x2 /* option requesting a write */ + +/* + * lio_listio() modes + */ + +/* + * A lio_listio() synchronization operation indicating + * that the calling thread is to continue execution while + * the lio_listio() operation is being performed, and no + * notification is given when the operation is complete + */ +#define LIO_NOWAIT 0x1 + +/* + * A lio_listio() synchronization operation indicating + * that the calling thread is to suspend until the + * lio_listio() operation is complete. + */ +#define LIO_WAIT 0x2 + +/* + * Maximum number of operations in single lio_listio call + */ +#define AIO_LISTIO_MAX 16 + +/* + * aio_fsync() options (values for the op argument): + * select how queued I/O is forced to the synchronized + * completion state. Only O_SYNC is supported; O_DSYNC + * is defined below but compiled out with #if 0. + */ + +#define O_SYNC 0x0 /* queued IO is completed as if by fsync() */ +#if 0 /* O_DSYNC - NOT SUPPORTED */ +#define O_DSYNC 0x1 /* queued async IO is completed as if by fdatasync() */ +#endif + +#ifndef KERNEL +/* + * Prototypes + */ + +/* + * Attempt to cancel one or more asynchronous I/O requests currently outstanding + * against file descriptor fd. The aiocbp argument points to the asynchronous I/O + * control block for a particular request to be canceled. If aiocbp is NULL, then + * all outstanding cancelable asynchronous I/O requests against fd shall be canceled. + */ +int aio_cancel( int fd, + struct aiocb * aiocbp ); + +/* + * Return the error status associated with the aiocb structure referenced by the + * aiocbp argument.
The error status for an asynchronous I/O operation is the errno + * value that would be set by the corresponding read(), write(), or fsync() + * operation. If the operation has not yet completed, then the error status shall + * be equal to [EINPROGRESS]. + */ +int aio_error( const struct aiocb * aiocbp ); + +/* + * Asynchronously force all I/O operations associated with the file indicated by + * the file descriptor aio_fildes member of the aiocb structure referenced by the + * aiocbp argument and queued at the time of the call to aio_fsync() to the + * synchronized I/O completion state. The function call shall return when the + * synchronization request has been initiated or queued. op O_SYNC is the only + * supported operation at this time. + * The aiocbp argument refers to an asynchronous I/O control block. The aiocbp + * value may be used as an argument to aio_error() and aio_return() in order to + * determine the error status and return status, respectively, of the asynchronous + * operation while it is proceeding. When the request is queued, the error status + * for the operation is [EINPROGRESS]. When all data has been successfully + * transferred, the error status shall be reset to reflect the success or failure + * of the operation. + */ +int aio_fsync( int op, + struct aiocb * aiocbp ); + +/* + * Read aiocbp->aio_nbytes from the file associated with aiocbp->aio_fildes into + * the buffer pointed to by aiocbp->aio_buf. The function call shall return when + * the read request has been initiated or queued. + * The aiocbp value may be used as an argument to aio_error() and aio_return() in + * order to determine the error status and return status, respectively, of the + * asynchronous operation while it is proceeding. If an error condition is + * encountered during queuing, the function call shall return without having + * initiated or queued the request.
The requested operation takes place at the + * absolute position in the file as given by aio_offset, as if lseek() were called + * immediately prior to the operation with an offset equal to aio_offset and a + * whence equal to SEEK_SET. After a successful call to enqueue an asynchronous + * I/O operation, the value of the file offset for the file is unspecified. + */ +int aio_read( struct aiocb * aiocbp ); + +/* + * Return the return status associated with the aiocb structure referenced by + * the aiocbp argument. The return status for an asynchronous I/O operation is + * the value that would be returned by the corresponding read(), write(), or + * fsync() function call. If the error status for the operation is equal to + * [EINPROGRESS], then the return status for the operation is undefined. The + * aio_return() function may be called exactly once to retrieve the return status + * of a given asynchronous operation; thereafter, if the same aiocb structure + * is used in a call to aio_return() or aio_error(), an error may be returned. + * When the aiocb structure referred to by aiocbp is used to submit another + * asynchronous operation, then aio_return() may be successfully used to + * retrieve the return status of that operation. + */ +ssize_t aio_return( struct aiocb * aiocbp ); + +/* + * Suspend the calling thread until at least one of the asynchronous I/O + * operations referenced by the aiocblist argument has completed, until a signal + * interrupts the function, or, if timeout is not NULL, until the time + * interval specified by timeout has passed. If any of the aiocb structures + * in the aiocblist correspond to completed asynchronous I/O operations (that is, + * the error status for the operation is not equal to [EINPROGRESS]) at the + * time of the call, the function shall return without suspending the calling + * thread. The aiocblist argument is an array of pointers to asynchronous I/O + * control blocks. 
The nent argument indicates the number of elements in the + * array. Each aiocb structure pointed to has been used in initiating an + * asynchronous I/O request via aio_read(), aio_write(), or lio_listio(). This + * array may contain NULL pointers, which are ignored. + */ +int aio_suspend( const struct aiocb *const aiocblist[], + int nent, + const struct timespec * timeoutp ); + +/* + * Write aiocbp->aio_nbytes to the file associated with aiocbp->aio_fildes from + * the buffer pointed to by aiocbp->aio_buf. The function shall return when the + * write request has been initiated or, at a minimum, queued. + * The aiocbp argument may be used as an argument to aio_error() and aio_return() + * in order to determine the error status and return status, respectively, of the + * asynchronous operation while it is proceeding. + */ +int aio_write( struct aiocb * aiocbp ); + +/* + * Initiate a list of I/O requests with a single function call. The mode + * argument takes one of the values LIO_WAIT or LIO_NOWAIT and determines whether + * the function returns when the I/O operations have been completed, or as soon + * as the operations have been queued. If the mode argument is LIO_WAIT, the + * function shall wait until all I/O is complete and the sig argument shall be + * ignored. + * If the mode argument is LIO_NOWAIT, the function shall return immediately, and + * asynchronous notification shall occur, according to the sig argument, when all + * the I/O operations complete. If sig is NULL, then no asynchronous notification + * shall occur. + */ +int lio_listio( int mode, + struct aiocb *const aiocblist[], + int nent, + struct sigevent *sigp ); +#endif /* KERNEL */ +#endif /* _SYS_AIO_H_ */ diff -urN xnu-344.49/bsd/sys/aio_kern.h xnu-517/bsd/sys/aio_kern.h --- xnu-344.49/bsd/sys/aio_kern.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/sys/aio_kern.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * File: sys/aio_kern.h + * Author: Jerry Cottingham [jerryc@apple.com] + * + * Header file for kernel only portion of POSIX Asynchronous IO APIs + * + */ + +#include <sys/aio.h> + +#ifndef _SYS_AIO_KERN_H_ +#define _SYS_AIO_KERN_H_ + +#ifdef KERNEL + +struct aio_workq_entry +{ + TAILQ_ENTRY( aio_workq_entry ) aio_workq_link; + struct proc *procp; /* user proc that queued this request */ + struct aiocb *uaiocbp; /* pointer passed in from user land */ + struct aiocb *fsyncp; /* not NULL means this request must complete */ + /* before an aio_fsync call can proceed.
*/ + vm_map_t aio_map; /* user land map we have a reference to */ + ssize_t returnval; /* return value from read / write request */ + int errorval; /* error value from read / write request */ + int flags; + long group_tag; /* identifier used to group IO requests */ + struct aiocb aiocb; /* copy of aiocb from user land */ +}; +typedef struct aio_workq_entry aio_workq_entry; + +/* + * definitions for aio_workq_entry.flags + */ +#define AIO_READ 0x00000001 +#define AIO_WRITE 0x00000002 +#define AIO_FSYNC 0x00000004 /* aio_fsync with op = O_SYNC */ +#define AIO_DSYNC 0x00000008 /* aio_fsync with op = O_DSYNC (not supported yet) */ +#define AIO_LIO 0x00000010 /* lio_listio generated IO */ +#define AIO_DO_FREE 0x00000800 /* entry needs to be freed */ +#define AIO_COMPLETION 0x00001000 /* entry is in completion processing (not freeable yet) */ +#define AIO_DISABLE 0x00002000 /* process is trying to exit or exec and we need */ + /* to disable normal completion notification */ +#define AIO_WAITING 0x00004000 /* process is trying to exit, exec, or close and is */ + /* waiting for one or more active IO requests to */ + /* complete */ + + +__private_extern__ void _aio_close( struct proc *p, int fd ); +__private_extern__ void _aio_exit( struct proc *p ); +__private_extern__ void _aio_exec( struct proc *p ); +__private_extern__ void _aio_create_worker_threads( int num ); + +#endif /* KERNEL */ + +#endif /* _SYS_AIO_KERN_H_ */ diff -urN xnu-344.49/bsd/sys/attr.h xnu-517/bsd/sys/attr.h --- xnu-344.49/bsd/sys/attr.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/sys/attr.h Sat Oct 25 00:25:55 2003 @@ -113,14 +113,129 @@ vol_capabilities_set_t valid; } vol_capabilities_attr_t; +/* + * VOL_CAP_FMT_PERSISTENTOBJECTIDS: When set, the volume has object IDs + * that are persistent (retain their values even when the volume is + * unmounted and remounted), and a file or directory can be looked up + * by ID. 
Volumes that support VolFS and can support Carbon File ID + * references should set this bit. + * + * VOL_CAP_FMT_SYMBOLICLINKS: When set, the volume supports symbolic + * links. The symlink(), readlink(), and lstat() calls all use this + * symbolic link. + * + * VOL_CAP_FMT_HARDLINKS: When set, the volume supports hard links. + * The link() call creates hard links. + * + * VOL_CAP_FMT_JOURNAL: When set, the volume is capable of supporting + * a journal used to speed recovery in case of unplanned shutdown + * (such as a power outage or crash). This bit does not necessarily + * mean the volume is actively using a journal for recovery. + * + * VOL_CAP_FMT_JOURNAL_ACTIVE: When set, the volume is currently using + * a journal for use in speeding recovery after an unplanned shutdown. + * This bit can be set only if VOL_CAP_FMT_JOURNAL is also set. + * + * VOL_CAP_FMT_NO_ROOT_TIMES: When set, the volume format does not + * store reliable times for the root directory, so you should not + * depend on them to detect changes, etc. + * + * VOL_CAP_FMT_SPARSE_FILES: When set, the volume supports sparse files. + * That is, files which can have "holes" that have never been written + * to, and are not allocated on disk. Sparse files may have an + * allocated size that is less than the file's logical length. + * + * VOL_CAP_FMT_ZERO_RUNS: For security reasons, parts of a file (runs) + * that have never been written to must appear to contain zeroes. When + * this bit is set, the volume keeps track of allocated but unwritten + * runs of a file so that it can substitute zeroes without actually + * writing zeroes to the media. This provides performance similar to + * sparse files, but not the space savings. + * + * VOL_CAP_FMT_CASE_SENSITIVE: When set, file and directory names are + * case sensitive (upper and lower case are different). 
When clear, + * an upper case character is equivalent to a lower case character, + * and you can't have two names that differ solely in the case of + * the characters. + * + * VOL_CAP_FMT_CASE_PRESERVING: When set, file and directory names + * preserve the difference between upper and lower case. If clear, + * the volume may change the case of some characters (typically + * making them all upper or all lower case). A volume that sets + * VOL_CAP_FMT_CASE_SENSITIVE should also set VOL_CAP_FMT_CASE_PRESERVING. + * + * VOL_CAP_FMT_FAST_STATFS: This bit is used as a hint to upper layers + * (especially Carbon) that statfs() is fast enough that its results + * need not be cached by those upper layers. A volume that caches + * the statfs information in its in-memory structures should set this bit. + * A volume that must always read from disk or always perform a network + * transaction should not set this bit. + */ #define VOL_CAP_FMT_PERSISTENTOBJECTIDS 0x00000001 #define VOL_CAP_FMT_SYMBOLICLINKS 0x00000002 #define VOL_CAP_FMT_HARDLINKS 0x00000004 - +#define VOL_CAP_FMT_JOURNAL 0x00000008 +#define VOL_CAP_FMT_JOURNAL_ACTIVE 0x00000010 +#define VOL_CAP_FMT_NO_ROOT_TIMES 0x00000020 +#define VOL_CAP_FMT_SPARSE_FILES 0x00000040 +#define VOL_CAP_FMT_ZERO_RUNS 0x00000080 +#define VOL_CAP_FMT_CASE_SENSITIVE 0x00000100 +#define VOL_CAP_FMT_CASE_PRESERVING 0x00000200 +#define VOL_CAP_FMT_FAST_STATFS 0x00000400 + + +/* + * VOL_CAP_INT_SEARCHFS: When set, the volume implements the + * searchfs() system call (the VOP_SEARCHFS vnode operation). + * + * VOL_CAP_INT_ATTRLIST: When set, the volume implements the + * getattrlist() and setattrlist() system calls (VOP_GETATTRLIST + * and VOP_SETATTRLIST vnode operations) for the volume, files, + * and directories. The volume may or may not implement the + * readdirattr() system call. XXX Is there any minimum set + * of attributes that should be supported? 
To determine the + * set of supported attributes, get the ATTR_VOL_ATTRIBUTES + * attribute of the volume. + * + * VOL_CAP_INT_NFSEXPORT: When set, the volume implements exporting + * of NFS volumes. + * + * VOL_CAP_INT_READDIRATTR: When set, the volume implements the + * readdirattr() system call (VOP_READDIRATTR vnode operation). + * + * VOL_CAP_INT_EXCHANGEDATA: When set, the volume implements the + * exchangedata() system call (VOP_EXCHANGE vnode operation). + * + * VOL_CAP_INT_COPYFILE: When set, the volume implements the + * VOP_COPYFILE vnode operation. (XXX There should be a copyfile() + * system call in .) + * + * VOL_CAP_INT_ALLOCATE: When set, the volume implements the + * VOP_ALLOCATE vnode operation, which means it implements the + * F_PREALLOCATE selector of fcntl(2). + * + * VOL_CAP_INT_VOL_RENAME: When set, the volume implements the + * ATTR_VOL_NAME attribute for both getattrlist() and setattrlist(). + * The volume can be renamed by setting ATTR_VOL_NAME with setattrlist(). + * + * VOL_CAP_INT_ADVLOCK: When set, the volume implements POSIX style + * byte range locks via VOP_ADVLOCK (accessible from fcntl(2)). + * + * VOL_CAP_INT_FLOCK: When set, the volume implements whole-file flock(2) + * style locks via VOP_ADVLOCK. This includes the O_EXLOCK and O_SHLOCK + * flags of the open(2) call. 
+ * + */ #define VOL_CAP_INT_SEARCHFS 0x00000001 #define VOL_CAP_INT_ATTRLIST 0x00000002 #define VOL_CAP_INT_NFSEXPORT 0x00000004 #define VOL_CAP_INT_READDIRATTR 0x00000008 +#define VOL_CAP_INT_EXCHANGEDATA 0x00000010 +#define VOL_CAP_INT_COPYFILE 0x00000020 +#define VOL_CAP_INT_ALLOCATE 0x00000040 +#define VOL_CAP_INT_VOL_RENAME 0x00000080 +#define VOL_CAP_INT_ADVLOCK 0x00000100 +#define VOL_CAP_INT_FLOCK 0x00000200 typedef struct vol_attributes_attr { attribute_set_t validattr; @@ -218,8 +333,13 @@ #define SRCHFS_MATCHPARTIALNAMES 0x00000002 #define SRCHFS_MATCHDIRS 0x00000004 #define SRCHFS_MATCHFILES 0x00000008 +#define SRCHFS_SKIPLINKS 0x00000010 +#define SRCHFS_SKIPINVISIBLE 0x00000020 +#define SRCHFS_SKIPPACKAGES 0x00000040 +#define SRCHFS_SKIPINAPPROPRIATE 0x00000080 + #define SRCHFS_NEGATEPARAMS 0x80000000 -#define SRCHFS_VALIDOPTIONSMASK 0x8000000F +#define SRCHFS_VALIDOPTIONSMASK 0x800000FF struct fssearchblock { struct attrlist *returnattrs; diff -urN xnu-344.49/bsd/sys/audit.h xnu-517/bsd/sys/audit.h --- xnu-344.49/bsd/sys/audit.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/sys/audit.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_AUDIT_H +#define _SYS_AUDIT_H + +#include +#include +#include +#include +#include + +#define AUDIT_RECORD_MAGIC 0x828a0f1b +#define MAX_AUDIT_RECORDS 20 +#define MAX_AUDIT_RECORD_SIZE 4096 + +/* + * Define the masks for the classes of audit events. + */ +#define AU_NULL 0x00000000 +#define AU_FREAD 0x00000001 +#define AU_FWRITE 0x00000002 +#define AU_FACCESS 0x00000004 +#define AU_FMODIFY 0x00000008 +#define AU_FCREATE 0x00000010 +#define AU_FDELETE 0x00000020 +#define AU_CLOSE 0x00000040 +#define AU_PROCESS 0x00000080 +#define AU_NET 0x00000100 +#define AU_IPC 0x00000200 +#define AU_NONAT 0x00000400 +#define AU_ADMIN 0x00000800 +#define AU_LOGIN 0x00001000 +#define AU_TFM 0x00002000 +#define AU_APPL 0x00004000 +#define AU_SETL 0x00008000 +#define AU_IFLOAT 0x00010000 +#define AU_PRIV 0x00020000 +#define AU_MAC_RW 0x00040000 +#define AU_XCONN 0x00080000 +#define AU_XCREATE 0x00100000 +#define AU_XDELETE 0x00200000 +#define AU_XIFLOAT 0x00400000 +#define AU_XPRIVS 0x00800000 +#define AU_XPRIVF 0x01000000 +#define AU_XMOVE 0x02000000 +#define AU_XDACF 0x04000000 +#define AU_XMACF 0x08000000 +#define AU_XSECATTR 0x10000000 +#define AU_IOCTL 0x20000000 +#define AU_EXEC 0x40000000 +#define AU_OTHER 0x80000000 +#define AU_ALL 0xffffffff + +/* + * IPC types + */ +#define AT_IPC_MSG ((u_char)1) /* message IPC id */ +#define AT_IPC_SEM ((u_char)2) /* semaphore IPC id */ +#define AT_IPC_SHM ((u_char)3) /* shared mem IPC id */ + +/* + * Audit conditions. 
+ */ +#define AUC_UNSET 0 +#define AUC_AUDITING 1 +#define AUC_NOAUDIT 2 +#define AUC_DISABLED -1 + +/* + * auditon(2) commands. + */ +#define A_GETPOLICY 2 +#define A_SETPOLICY 3 +#define A_GETKMASK 4 +#define A_SETKMASK 5 +#define A_GETQCTRL 6 +#define A_SETQCTRL 7 +#define A_GETCWD 8 +#define A_GETCAR 9 +#define A_GETSTAT 12 +#define A_SETSTAT 13 +#define A_SETUMASK 14 +#define A_SETSMASK 15 +#define A_GETCOND 20 +#define A_SETCOND 21 +#define A_GETCLASS 22 +#define A_SETCLASS 23 +#define A_GETPINFO 24 +#define A_SETPMASK 25 +#define A_SETFSIZE 26 +#define A_GETFSIZE 27 +#define A_GETPINFO_ADDR 28 +#define A_GETKAUDIT 29 +#define A_SETKAUDIT 30 + +/* + * Audit policy controls. + */ +#define AUDIT_CNT 0x0001 +#define AUDIT_AHLT 0x0002 +#define AUDIT_ARGV 0x0004 +#define AUDIT_ARGE 0x0008 +#define AUDIT_PASSWD 0x0010 +#define AUDIT_SEQ 0x0020 +#define AUDIT_WINDATA 0x0040 +#define AUDIT_USER 0x0080 +#define AUDIT_GROUP 0x0100 +#define AUDIT_TRAIL 0x0200 +#define AUDIT_PATH 0x0400 + +typedef uid_t au_id_t; +typedef pid_t au_asid_t; +typedef u_int16_t au_event_t; +typedef u_int16_t au_emod_t; +typedef u_int32_t au_class_t; + +struct au_tid { + dev_t port; + u_int32_t machine; +}; +typedef struct au_tid au_tid_t; + +struct au_tid_addr { + dev_t at_port; + u_int32_t at_type; + u_int32_t at_addr[4]; +}; +typedef struct au_tid_addr au_tid_addr_t; + +struct au_mask { + unsigned int am_success; /* success bits */ + unsigned int am_failure; /* failure bits */ +}; +typedef struct au_mask au_mask_t; + +struct auditinfo { + au_id_t ai_auid; /* Audit user ID */ + au_mask_t ai_mask; /* Audit masks */ + au_tid_t ai_termid; /* Terminal ID */ + au_asid_t ai_asid; /* Audit session ID */ +}; +typedef struct auditinfo auditinfo_t; + +struct auditinfo_addr { + au_id_t ai_auid; /* Audit user ID */ + au_mask_t ai_mask; /* Audit masks */ + au_tid_addr_t ai_termid; /* Terminal ID */ + au_asid_t ai_asid; /* Audit session ID */ +}; +typedef struct auditinfo_addr auditinfo_addr_t; + +/* 
Token and record structures */ + +struct au_token { + u_char *t_data; + size_t len; + TAILQ_ENTRY(au_token) tokens; +}; +typedef struct au_token token_t; + +struct au_record { + char used; /* Is this record currently being used */ + int desc; /* The descriptor associated with this record */ + TAILQ_HEAD(, au_token) token_q; /* queue of BSM tokens */ + u_char *data; + size_t len; + LIST_ENTRY(au_record) au_rec_q; +}; +typedef struct au_record au_record_t; + +#ifndef KERNEL +#include <sys/cdefs.h> + +__BEGIN_DECLS +int audit (const void *, int); +int auditon (int, void *, int); +int auditsvc (int, int); +int auditctl (const char *); +int getauid (au_id_t *); +int setauid (const au_id_t *); +int getaudit (struct auditinfo *); +int setaudit (const struct auditinfo *); +int getaudit_addr (struct auditinfo_addr *, int); +int setaudit_addr (const struct auditinfo_addr *, int); +__END_DECLS +#endif /* !KERNEL */ + +#endif /* !_SYS_AUDIT_H */ diff -urN xnu-344.49/bsd/sys/bsm_kevents.h xnu-517/bsd/sys/bsm_kevents.h --- xnu-344.49/bsd/sys/bsm_kevents.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/sys/bsm_kevents.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file.
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _BSM_KEVENTS_H_ +#define _BSM_KEVENTS_H_ + +/* + * Values marked as AUE_NULL are not required to be audited as per CAPP + * + * The second value within comments is the syscall number in Darwin + * + * Values in the third column are the values assigned by BSM for obsolete + * or old system calls + * + * Values marked as XXX in the third column do not have an + * event number assigned as yet, and have (temporarily) been assigned + * value of AUE_NULL + */ + +#define AUE_NULL 0 +#define AUE_EXIT 1 /*1*/ +#define AUE_FORK 2 /*2*/ +#define AUE_READ AUE_NULL /*3*/ +#define AUE_WRITE AUE_NULL /*4*/ +#define AUE_OPEN_R 72 /*5*/ +#define AUE_OPEN_RC 73 /*5*/ +#define AUE_OPEN_RTC 75 /*5*/ +#define AUE_OPEN_RT 74 /*5*/ +#define AUE_OPEN_RW 80 /*5*/ +#define AUE_OPEN_RWC 81 /*5*/ +#define AUE_OPEN_RWTC 83 /*5*/ +#define AUE_OPEN_RWT 82 /*5*/ +#define AUE_OPEN_W 76 /*5*/ +#define AUE_OPEN_WC 77 /*5*/ +#define AUE_OPEN_WTC 79 /*5*/ +#define AUE_OPEN_WT 78 /*5*/ +#define AUE_CLOSE 112 /*6*/ +#define AU_WAIT4 AUE_NULL /*7*/ +#define AUE_O_CREAT AUE_NULL /*8*/ /*4*/ +#define AUE_LINK 5 /*9*/ +#define AUE_UNLINK 6 /*10*/ +#define AUE_O_EXECV AUE_NULL /*11*/ +#define AUE_CHDIR 8 /*12*/ +#define AUE_FCHDIR 68 /*13*/ +#define AUE_MKNOD 9 /*14*/ +#define AUE_CHMOD 10 /*15*/ +#define AUE_CHOWN 11 /*16*/ +#define AUE_O_SBREAK AUE_NULL /*17*/ +#define AUE_GETFSSTAT 301 /*18*/ +#define AUE_O_LSEEK AUE_NULL /*19*/ +#define AUE_GETPID AUE_NULL /*20*/ +#define AUE_O_MOUNT 
AUE_NULL /*21*/ +#define AUE_O_UMOUNT AUE_NULL /*22*/ +#define AUE_SETUID 200 /*23*/ +#define AUE_GETUID AUE_NULL /*24*/ +#define AUE_GETEUID AUE_NULL /*25*/ +#define AUE_PTRACE 302 /*26*/ +#define AUE_RECVMSG 190 /*27*/ +#define AUE_SENDMSG 188 /*28*/ +#define AUE_RECVFROM 191 /*29*/ +#define AUE_ACCEPT 33 /*30*/ +#define AUE_GETPEERNAME AUE_NULL /*31*/ +#define AUE_GETSOCKNAME AUE_NULL /*32*/ +#define AUE_ACCESS 14 /*33*/ +#define AUE_CHFLAGS 303 /*34*/ +#define AUE_FCHFLAGS 304 /*35*/ +#define AUE_SYNC AUE_NULL /*36*/ +#define AUE_KILL 15 /*37*/ +#define AUE_O_STAT AUE_NULL /*38*/ +#define AUE_GETPPID AUE_NULL /*39*/ +#define AUE_O_LSTAT AUE_NULL /*40*/ +#define AUE_DUP AUE_NULL /*41*/ +#define AUE_PIPE 185 /*42*/ +#define AUE_GETEGID AUE_NULL /*43*/ +#define AUE_PROFILE 305 /*44*/ +#define AUE_KTRACE 306 /*45*/ +#define AUE_REBOOT 308 +#define AUE_SIGACTION AUE_NULL /*46*/ /*XXX*/ +#define AUE_GETGID AUE_NULL /*47*/ +#define AUE_SIGPROCMASK AUE_NULL /*48*/ /*XXX*/ +#define AUE_GETLOGIN AUE_NULL /*49*/ +#define AUE_SETLOGIN 307 /*50*/ +#define AUE_ACCT 18 /*51*/ +#define AUE_SIGPENDING AUE_NULL /*52*/ /*XXX*/ +#define AUE_SIGALTSTACK AUE_NULL /*53*/ /*XXX*/ +#define AUE_IOCTL 158 /*54*/ +#define AUE_SYSTEMBOOT 113 /*55*/ +#define AUE_REVOKE 309 /*56*/ +#define AUE_SYMLINK 21 /*57*/ +#define AUE_READLINK 22 /*58*/ +#define AUE_EXECVE 23 /*59*/ +#define AUE_UMASK 310 /*60*/ +#define AUE_CHROOT 24 /*61*/ +#define AUE_O_FSTAT AUE_NULL /*62*/ + +#define AUE_O_GETPAGESIZE AUE_NULL /*64*/ +#define AUE_MSYNC AUE_NULL /*65*/ +#define AUE_VFORK 25 /*66*/ +#define AUE_O_VREAD AUE_NULL /*67*/ +#define AUE_O_VWRITE AUE_NULL /*68*/ +#define AUE_SBRK AUE_NULL /*69*/ /*EOPNOTSUP*/ +#define AUE_SSTK AUE_NULL /*70*/ /*EOPNOTSUP*/ +#define AUE_O_MMAN AUE_NULL /*71*/ +#define AUE_O_VADVISE AUE_NULL /*72*/ +#define AUE_MUNMAP 213 /*73*/ +#define AUE_MPROTECT 311 /*74*/ +#define AUE_MADVISE AUE_NULL /*75*/ +#define AUE_O_VHANGUP AUE_NULL /*76*/ +#define AUE_O_VLIMIT AUE_NULL /*77*/ 
+#define AUE_MINCORE AUE_NULL /*78*/ +#define AUE_GETGROUPS AUE_NULL /*79*/ +#define AUE_SETGROUPS 26 /*80*/ +#define AUE_GETPGRP AUE_NULL /*81*/ +#define AUE_SETPGRP 27 /*82*/ +#define AUE_SETITIMER AUE_NULL /*83*/ /*XXX*/ +#define AUE_O_WAIT AUE_NULL /*84*/ +#define AUE_SWAPON AUE_NULL /*85*/ /*EOPNOTSUP*/ +#define AUE_GETITIMER AUE_NULL /*86*/ +#define AUE_O_GETHOSTNAME AUE_NULL /*87*/ +#define AUE_O_SETHOSTNAME AUE_NULL /*88*/ +#define AUE_GETDTABLESIZE AUE_NULL /*89*/ +#define AUE_DUP2 AUE_NULL /*90*/ +#define AUE_O_GETDOPT AUE_NULL /*91*/ +#define AUE_FCNTL 30 /*92*/ +#define AUE_SELECT AUE_NULL /*93*/ +#define AUE_O_SETDOPT AUE_NULL /*94*/ +#define AUE_FSYNC AUE_NULL /*95*/ +#define AUE_SETPRIORITY 312 /*96*/ +#define AUE_SOCKET 183 /*97*/ +#define AUE_CONNECT 32 /*98*/ +#define AUE_O_ACCEPT AUE_NULL /*99*/ +#define AUE_GETPRIORITY AUE_NULL /*100*/ +#define AUE_O_SEND AUE_NULL /*101*/ +#define AUE_O_RECV AUE_NULL /*102*/ +#define AUE_SIGRETURN AUE_NULL /*103*/ /*XXX*/ +#define AUE_BIND 34 /*104*/ +#define AUE_SETSOCKOPT 35 /*105*/ +#define AUE_LISTEN AUE_NULL /*106*/ +#define AUE_O_VTIMES AUE_NULL /*107*/ +#define AUE_O_SIGVEC AUE_NULL /*108*/ +#define AUE_O_SIGBLOCK AUE_NULL /*109*/ +#define AUE_O_SIGSETMASK AUE_NULL /*110*/ +#define AUE_SIGSUSPEND AUE_NULL /*111*/ /*XXX*/ +#define AUE_O_SIGSTACK AUE_NULL /*112*/ +#define AUE_O_RECVMSG AUE_NULL /*113*/ +#define AUE_O_SENDMSG AUE_NULL /*114*/ +#define AUE_O_VTRACE AUE_NULL /*115*/ /*36*/ +#define AUE_GETTIMEOFDAY AUE_NULL /*116*/ +#define AUE_GETRUSAGE AUE_NULL /*117*/ +#define AUE_GTSOCKOPT AUE_NULL /*118*/ +#define AUE_O_RESUBA AUE_NULL /*119*/ +#define AUE_READV AUE_NULL /*120*/ +#define AUE_WRITEV AUE_NULL /*121*/ +#define AUE_SETTIMEOFDAY 313 /*122*/ +#define AUE_FCHOWN 38 /*123*/ +#define AUE_FCHMOD 39 /*124*/ +#define AUE_O_RECVFROM AUE_NULL /*125*/ +#define AUE_O_SETREUID AUE_NULL /*126*/ /*40*/ +#define AUE_O_SETREGID AUE_NULL /*127*/ /*41*/ +#define AUE_RENAME 42 /*128*/ +#define AUE_O_TRUNCATE 
AUE_NULL /*129*/ +#define AUE_O_FTRUNCATE AUE_NULL /*130*/ +#define AUE_FLOCK 314 /*131*/ +#define AUE_MKFIFO 315 /*132*/ +#define AUE_SENDTO 184 /*133*/ +#define AUE_SHUTDOWN 46 /*134*/ +#define AUE_SOCKETPAIR 317 /*135*/ +#define AUE_MKDIR 47 /*136*/ +#define AUE_RMDIR 48 /*137*/ +#define AUE_UTIMES 49 /*138*/ +#define AUE_FUTIMES 318 /*139*/ +#define AUE_ADJTIME 50 /*140*/ +#define AUE_O_GETPEERNAME AUE_NULL /*141*/ +#define AUE_O_GETHOSTID AUE_NULL /*142*/ +#define AUE_O_SETHOSTID AUE_NULL /*143*/ +#define AUE_O_GETRLIMIT AUE_NULL /*144*/ +#define AUE_O_SETRLIMIT AUE_NULL /*145*/ +#define AUE_O_KILLPG AUE_NULL /*146*/ +#define AUE_SETSID 319 /*147*/ +#define AUE_O_SETQUOTA AUE_NULL /*148*/ +#define AUE_O_QUOTA AUE_NULL /*149*/ +#define AUE_O_GETSOCKNAME AUE_NULL /*150*/ +#define AUE_GETPGID AUE_NULL /*151*/ +#define AUE_SETPRIVEXEC 320 /*152*/ +#define AUE_PREAD AUE_NULL /*153*/ +#define AUE_PWRITE AUE_NULL /*154*/ +#define AUE_NFSSVC 321 /*155*/ +#define AUE_O_GETDIRENTRIES AUE_NULL /*156*/ +#define AUE_STATFS 54 /*157*/ +#define AUE_FSTATFS 55 /*158*/ +#define AUE_UMOUNT 12 /*159*/ +#define AUE_O_ASYNCDAEMON AUE_NULL /*160*/ +#define AUE_GETFH 322 /*161*/ +#define AUE_O_GETDOMAINNAME AUE_NULL /*162*/ +#define AUE_O_SETDOMAINNAME AUE_NULL /*163*/ +#define AUE_O_PCFS_MOUNT AUE_NULL /*164*/ +#define AUE_QUOTACTL 323 /*165*/ +#define AUE_O_EXPORTFS AUE_NULL /*166*/ +#define AUE_MOUNT 62 /*167*/ +#define AUE_O_USTATE AUE_NULL /*168*/ +#define AUE_TABLE AUE_NULL /*170*/ /*ENOSYS*/ +#define AUE_O_WAIT3 AUE_NULL /*171*/ +#define AUE_O_RPAUSE AUE_NULL /*172*/ +#define AUE_O_GETDENTS AUE_NULL /*174*/ +#define AUE_GCCONTROL AUE_NULL /*175*/ /*ENOSYS*/ +#define AUE_ADDPROFILE 324 /*176*/ + +#define AUE_KDBUGTRACE 325 /*180*/ +#define AUE_SETGID 205 /*181*/ +#define AUE_SETEGID 214 /*182*/ +#define AUE_SETEUID 215 /*183*/ + +#define AUE_STAT 16 /*188*/ +#define AUE_FSTAT 326 /*189*/ +#define AUE_LSTAT 17 /*190*/ +#define AUE_PATHCONF 71 /*191*/ +#define AUE_FPATHCONF 327 
/*192*/ +#define AUE_GETRLIMIT AUE_NULL /*194*/ +#define AUE_SETRLIMIT 51 /*195*/ +#define AUE_GETDIRENTRIES 328 /*196*/ +#define AUE_MMAP 210 /*197*/ +#define AUE_SYSCALL AUE_NULL /*198*/ /*ENOSYS*/ +#define AUE_LSEEK AUE_NULL /*199*/ +#define AUE_TRUNCATE 329 /*200*/ +#define AUE_FTRUNCATE 330 /*201*/ +#define AUE_SYSCTL 331 /*202*/ +#define AUE_MLOCK 332 /*203*/ +#define AUE_MUNLOCK 333 /*204*/ +#define AUE_UNDELETE 334 /*205*/ + +#define AUE_MKCOMPLEX AUE_NULL /*216*/ /*XXX*/ +#define AUE_STATV AUE_NULL /*217*/ /*EOPNOTSUPP*/ +#define AUE_LSTATV AUE_NULL /*218*/ /*EOPNOTSUPP*/ +#define AUE_FSTATV AUE_NULL /*219*/ /*EOPNOTSUPP*/ +#define AUE_GETATTRLIST 335 /*220*/ +#define AUE_SETATTRLIST 336 /*221*/ +#define AUE_GETDIRENTRIESATTR 337 /*222*/ +#define AUE_EXCHANGEDATA 338 /*223*/ +#define AUE_CHECKUSERACCESS AUE_NULL /*224*/ /* To Be Removed */ +#define AUE_SEARCHFS 339 /*225*/ + +#define AUE_DELETE AUE_NULL /*226*/ /* reserved */ +#define AUE_COPYFILE AUE_NULL /*227*/ /* reserved */ +#define AUE_WATCHEVENT AUE_NULL /*231*/ /* reserved */ +#define AUE_WAITEVENT AUE_NULL /*232*/ /* reserved */ +#define AUE_MODWATCH AUE_NULL /*233*/ /* reserved */ +#define AUE_FSCTL AUE_NULL /*242*/ /* reserved */ + +#define AUE_MINHERIT 340 /*250*/ +#define AUE_SEMSYS AUE_NULL /*251*/ /* To Be Removed */ +#define AUE_MSGSYS AUE_NULL /*252*/ /* To Be Removed */ +#define AUE_SHMSYS AUE_NULL /*253*/ +#define AUE_SEMCTL 98 /*254*/ +#define AUE_SEMCTL_GETALL 105 /*254*/ +#define AUE_SEMCTL_GETNCNT 102 /*254*/ +#define AUE_SEMCTL_GETPID 103 /*254*/ +#define AUE_SEMCTL_GETVAL 104 /*254*/ +#define AUE_SEMCTL_GETZCNT 106 /*254*/ +#define AUE_SEMCTL_RMID 99 /*254*/ +#define AUE_SEMCTL_SET 100 /*254*/ +#define AUE_SEMCTL_SETALL 108 /*254*/ +#define AUE_SEMCTL_SETVAL 107 /*254*/ +#define AUE_SEMCTL_STAT 101 /*254*/ +#define AUE_SEMGET 109 /*255*/ +#define AUE_SEMOP 110 /*256*/ +#define AUE_SEMCONFIG 341 /*257*/ +#define AUE_MSGCL AUE_NULL /*258*/ /*EOPNOTSUPP*/ +#define AUE_MSGGET 88 
/*259*/ /*88-EOPNOTSUPP*/ +#define AUE_MSGRCV 89 /*261*/ /*89-EOPNOTSUPP*/ +#define AUE_MSGSND 90 /*260*/ /*90-EOPNOTSUPP*/ +#define AUE_SHMAT 96 /*262*/ +#define AUE_SHMCTL 91 /*263*/ +#define AUE_SHMCTL_RMID 92 /*263*/ +#define AUE_SHMCTL_SET 93 /*263*/ +#define AUE_SHMCTL_STAT 94 /*263*/ +#define AUE_SHMDT 97 /*264*/ +#define AUE_SHMGET 95 /*265*/ +#define AUE_SHMOPEN 345 /*266*/ +#define AUE_SHMUNLINK 346 /*267*/ +#define AUE_SEMOPEN 342 /*268*/ +#define AUE_SEMCLOSE 343 /*269*/ +#define AUE_SEMUNLINK 344 /*270*/ +#define AUE_SEMWAIT AUE_NULL /*271*/ +#define AUE_SEMTRYWAIT AUE_NULL /*272*/ +#define AUE_SEMPOST AUE_NULL /*273*/ +#define AUE_SEMGETVALUE AUE_NULL /*274*/ /*ENOSYS*/ +#define AUE_SEMINIT AUE_NULL /*275*/ /*ENOSYS*/ +#define AUE_SEMDESTROY AUE_NULL /*276*/ /*ENOSYS*/ + +#define AUE_LOADSHFILE 347 /*296*/ +#define AUE_RESETSHFILE 348 /*297*/ +#define AUE_NEWSYSTEMSHREG 349 /*298*/ + +#define AUE_GETSID AUE_NULL /*310*/ + +#define AUE_MLOCKALL AUE_NULL /*324*/ /*ENOSYS*/ +#define AUE_MUNLOCKALL AUE_NULL /*325*/ /*ENOSYS*/ + +#define AUE_ISSETUGID AUE_NULL /*327*/ +#define AUE_PTHREADKILL 350 /*328*/ +#define AUE_PTHREADSIGMASK 351 /*329*/ +#define AUE_SIGWAIT AUE_NULL /*330*/ /*XXX*/ + + + +/* BSM events - Have to identify which ones are relevant to MacOSX */ +#define AUE_ACLSET 251 +#define AUE_AUDIT 211 +#define AUE_AUDITON_GETCAR 224 +#define AUE_AUDITON_GETCLASS 231 +#define AUE_AUDITON_GETCOND 229 +#define AUE_AUDITON_GETCWD 223 +#define AUE_AUDITON_GETKMASK 221 +#define AUE_AUDITON_GETSTAT 225 +#define AUE_AUDITON_GPOLICY 114 +#define AUE_AUDITON_GQCTRL 145 +#define AUE_AUDITON_SETCLASS 232 +#define AUE_AUDITON_SETCOND 230 +#define AUE_AUDITON_SETKMASK 222 +#define AUE_AUDITON_SESKMASK 228 +#define AUE_AUDITON_SETSTAT 226 +#define AUE_AUDITON_SETUMASK 227 +#define AUE_AUDITON_SPOLICY 147 +#define AUE_AUDITON_SQCTRL 146 +#define AUE_AUDITSVC 136 +#define AUE_DOORFS_DOOR_BIND 260 +#define AUE_DOORFS_DOOR_CALL 254 +#define AUE_DOORFS_DOOR_CREATE 256 
+#define AUE_DOORFS_DOOR_CRED 259 +#define AUE_DOORFS_DOOR_INFO 258 +#define AUE_DOORFS_DOOR_RETURN 255 +#define AUE_DOORFS_DOOR_REVOKE 257 +#define AUE_DOORFS_DOOR_UNBIND 261 +#define AUE_ENTERPROM 153 +#define AUE_EXEC 7 +#define AUE_EXITPROM 154 +#define AUE_FACLSET 252 +#define AUE_FCHROOT 69 +#define AUE_FORK1 241 +#define AUE_GETAUDIT 132 +#define AUE_GETAUDIT_ADDR 267 +#define AUE_GETAUID 130 +#define AUE_GETMSG 217 +#define AUE_SOCKACCEPT 247 +#define AUE_SOCKRECEIVE 250 +#define AUE_GETPMSG 219 +#define AUE_GETPORTAUDIT 149 +#define AUE_INST_SYNC 264 +#define AUE_LCHOWN 237 +#define AUE_LXSTAT 236 +#define AUE_MEMCNTL 238 +#define AUE_MODADDMAJ 246 +#define AUE_MODCONFIG 245 +#define AUE_MODLOAD 243 +#define AUE_MODUNLOAD 244 +#define AUE_MSGCTL 84 +#define AUE_MSGCTL_RMID 85 +#define AUE_MSGCTL_SET 86 +#define AUE_MSGCTL_STAT 87 +#define AUE_NICE 203 +#define AUE_P_ONLINE 262 +#define AUE_PRIOCNTLSYS 212 +#define AUE_CORE 111 +#define AUE_PROCESSOR_BIND 263 +#define AUE_PUTMSG 216 +#define AUE_SOCKCONNECT 248 +#define AUE_SOCKSEND 249 +#define AUE_PUTPMSG 218 +#define AUE_SETAUDIT 133 +#define AUE_SETAUDIT_ADDR 266 +#define AUE_SETAUID 131 +#define AUE_SOCKCONFIG 183 +#define AUE_STATVFS 234 +#define AUE_STIME 201 +#define AUE_SYSINFO 39 +#define AUE_UTIME 202 +#define AUE_UTSYS 233 +#define AUE_XMKNOD 240 +#define AUE_XSTAT 235 + +#endif /* !_BSM_KEVENTS_H_ */ diff -urN xnu-344.49/bsd/sys/bsm_klib.h xnu-517/bsd/sys/bsm_klib.h --- xnu-344.49/bsd/sys/bsm_klib.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/sys/bsm_klib.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _BSM_KLIB_H_ +#define _BSM_KLIB_H_ + +#define AU_PRS_SUCCESS 1 +#define AU_PRS_FAILURE 2 +#define AU_PRS_BOTH (AU_PRS_SUCCESS|AU_PRS_FAILURE) + +#ifdef KERNEL +int au_preselect(au_event_t event, au_mask_t *mask_p, int sorf); +au_event_t flags_to_openevent(int oflags); +void fill_vattr(struct vattr *v, struct vnode_au_info *vn_info); +void canon_path(struct proc *p, char *path, char *cpath); +/* + * Define a system call to audit event mapping table. + */ +extern au_event_t sys_au_event[]; +extern int nsys_au_event; /* number of entries in this table */ + +#endif /*KERNEL*/ + +#endif /* ! _BSM_KLIB_H_ */ diff -urN xnu-344.49/bsd/sys/bsm_token.h xnu-517/bsd/sys/bsm_token.h --- xnu-344.49/bsd/sys/bsm_token.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/sys/bsm_token.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _BSM_TOKEN_H_ +#define _BSM_TOKEN_H_ + +#include +#include +#include +#include +#include +#include + +/* We could determine the header and trailer sizes by + * defining appropriate structures. We hold off that approach + * till we have a consistent way of using structures for all tokens. + * This is not straightforward since these token structures may + * contain pointers to contents whose size we don't know + * (e.g. text tokens) + */ +#define HEADER_SIZE 18 +#define TRAILER_SIZE 7 + +#define ADD_U_CHAR(loc, val) \ + do {\ + *(loc) = (val);\ + (loc) += sizeof(u_char);\ + }while(0) + + +#define ADD_U_INT16(loc, val) \ + do { \ + memcpy((loc), (u_char *)&(val), sizeof(u_int16_t));\ + (loc) += sizeof(u_int16_t); \ + }while(0) + +#define ADD_U_INT32(loc, val) \ + do { \ + memcpy((loc), (u_char *)&(val), sizeof(u_int32_t));\ + (loc) += sizeof(u_int32_t); \ + }while(0) + +#define ADD_U_INT64(loc, val)\ + do {\ + memcpy((loc), (u_char *)&(val), sizeof(u_int64_t));\ + (loc) += sizeof(u_int64_t); \ + }while(0) + +#define ADD_MEM(loc, data, size) \ + do { \ + memcpy((loc), (data), (size));\ + (loc) += (size);\ + }while(0) + +#define ADD_STRING(loc, data, size) ADD_MEM(loc, data, size) + + +/* Various token id types */ + +/* + * Values inside the comments are not documented in the BSM pages and + * have been picked up from the header files + */ + +/* + * Values marked 
as XXX do not have a value defined in the BSM header files + */ + +/* + * Control token types + +#define AUT_OTHER_FILE ((char)0x11) +#define AUT_OTHER_FILE32 AUT_OTHER_FILE +#define AUT_OHEADER ((char)0x12) + + */ + +#define AUT_INVALID 0x00 +#define AU_FILE_TOKEN 0x11 +#define AU_TRAILER_TOKEN 0x13 +#define AU_HEADER_32_TOKEN 0x14 +#define AU_HEADER_EX_32_TOKEN 0x15 + + +/* + * Data token types +#define AUT_SERVER ((char)0x25) +#define AUT_SERVER32 AUT_SERVER + */ + +#define AU_DATA_TOKEN 0x21 +#define AU_ARB_TOKEN AU_DATA_TOKEN +#define AU_IPC_TOKEN 0x22 +#define AU_PATH_TOKEN 0x23 +#define AU_SUBJECT_32_TOKEN 0x24 +#define AU_PROCESS_32_TOKEN 0x26 +#define AU_RETURN_32_TOKEN 0x27 +#define AU_TEXT_TOKEN 0x28 +#define AU_OPAQUE_TOKEN 0x29 +#define AU_IN_ADDR_TOKEN 0x2A +#define AU_IP_TOKEN 0x2B +#define AU_IPORT_TOKEN 0x2C +#define AU_ARG32_TOKEN 0x2D +#define AU_SOCK_TOKEN 0x2E +#define AU_SEQ_TOKEN 0x2F + +/* + * Modifier token types + +#define AUT_ACL ((char)0x30) +#define AUT_LABEL ((char)0x33) +#define AUT_GROUPS ((char)0x34) +#define AUT_ILABEL ((char)0x35) +#define AUT_SLABEL ((char)0x36) +#define AUT_CLEAR ((char)0x37) +#define AUT_PRIV ((char)0x38) +#define AUT_UPRIV ((char)0x39) +#define AUT_LIAISON ((char)0x3A) + + */ + +#define AU_ATTR_TOKEN 0x31 +#define AU_IPCPERM_TOKEN 0x32 +#define AU_NEWGROUPS_TOKEN 0x3B +#define AU_EXEC_ARG_TOKEN 0x3C +#define AU_EXEC_ENV_TOKEN 0x3D +#define AU_ATTR32_TOKEN 0x3E + + +/* + * Command token types + */ + +#define AU_CMD_TOKEN 0x51 +#define AU_EXIT_TOKEN 0x52 + + + +/* + * Miscellaneous token types + +#define AUT_HOST ((char)0x70) + + */ + +/* + * 64bit token types + +#define AUT_SERVER64 ((char)0x76) +#define AUT_OTHER_FILE64 ((char)0x78) + + */ + +#define AU_ARG64_TOKEN 0x71 +#define AU_RETURN_64_TOKEN 0x72 +#define AU_ATTR64_TOKEN 0x73 +#define AU_HEADER_64_TOKEN 0x74 +#define AU_SUBJECT_64_TOKEN 0x75 +#define AU_PROCESS_64_TOKEN 0x77 + + + +/* + * Extended network address token types + */ + +#define 
AU_HEADER_EX_64_TOKEN 0x79 +#define AU_SUBJECT_32_EX_TOKEN 0x7a +#define AU_PROCESS_32_EX_TOKEN 0x7b +#define AU_SUBJECT_64_EX_TOKEN 0x7c +#define AU_PROCESS_64_EX_TOKEN 0x7d +#define AU_IN_ADDR_EX_TOKEN 0x7e +#define AU_SOCK_EX32_TOKEN 0x7f +#define AU_SOCK_EX128_TOKEN AUT_INVALID /*XXX*/ +#define AU_IP_EX_TOKEN AUT_INVALID /*XXX*/ + + +/* + * The values for the following token ids is not + * defined by BSM + */ +#define AU_SOCK_INET_32_TOKEN 0x80 /*XXX*/ +#define AU_SOCK_INET_128_TOKEN 0x81 /*XXX*/ +#define AU_SOCK_UNIX_TOKEN 0x82 /*XXX*/ + +/* print values for the arbitrary token */ +#define AUP_BINARY 0 +#define AUP_OCTAL 1 +#define AUP_DECIMAL 2 +#define AUP_HEX 3 +#define AUP_STRING 4 + + +/* data-types for the arbitrary token */ +#define AUR_BYTE 0 +#define AUR_SHORT 1 +#define AUR_LONG 2 + +/* ... and their sizes */ +#define AUR_BYTE_SIZE sizeof(u_char) +#define AUR_SHORT_SIZE sizeof(u_int16_t) +#define AUR_LONG_SIZE sizeof(u_int32_t) + +/* Modifiers for the header token */ +#define PAD_NOTATTR 0x4000 /* nonattributable event */ +#define PAD_FAILURE 0x8000 /* fail audit event */ + + +#define MAX_GROUPS 16 +#define HEADER_VERSION 1 +#define TRAILER_PAD_MAGIC 0xB105 + +/* BSM library calls */ + +int au_open(void); +int au_write(int d, token_t *m); +int au_close(int d, int keep, short event); +token_t *au_to_file(char *file); +token_t *au_to_header(int rec_size, au_event_t e_type, + au_emod_t e_mod); +token_t *au_to_header32(int rec_size, au_event_t e_type, + au_emod_t e_mod); +token_t *au_to_header64(int rec_size, au_event_t e_type, + au_emod_t e_mod); +token_t *au_to_me(void); + +token_t *au_to_arg(char n, char *text, u_int32_t v); +token_t *au_to_arg32(char n, char *text, u_int32_t v); +token_t *au_to_arg64(char n, char *text, u_int64_t v); +token_t *au_to_attr(struct vattr *attr); +token_t *au_to_attr32(struct vattr *attr); +token_t *au_to_attr64(struct vattr *attr); +token_t *au_to_data(char unit_print, char unit_type, + char unit_count, char *p); 
+token_t *au_to_exit(int retval, int err); +token_t *au_to_groups(int *groups); +token_t *au_to_newgroups(u_int16_t n, gid_t *groups); +token_t *au_to_in_addr(struct in_addr *internet_addr); +token_t *au_to_in_addr_ex(struct in6_addr *internet_addr); +token_t *au_to_ip(struct ip *ip); +token_t *au_to_ipc(char type, int id); +token_t *au_to_ipc_perm(struct ipc_perm *perm); +token_t *au_to_iport(u_int16_t iport); +token_t *au_to_opaque(char *data, u_int16_t bytes); +token_t *au_to_path(char *path); +token_t *au_to_process(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_process32(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_process64(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_process_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_process32_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_process64_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_return(char status, u_int32_t ret); +token_t *au_to_return32(char status, u_int32_t ret); +token_t *au_to_return64(char status, u_int64_t ret); +token_t *au_to_seq(long audit_count); +token_t *au_to_socket(struct socket *so); +token_t *au_to_socket_ex_32(struct socket *so); +token_t *au_to_socket_ex_128(struct socket *so); +token_t *au_to_sock_inet(struct sockaddr_in *so); +token_t *au_to_sock_inet32(struct sockaddr_in *so); +token_t *au_to_sock_inet128(struct sockaddr_in6 *so); +token_t *au_to_sock_unix(struct sockaddr_un *so); +token_t *au_to_subject(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, 
au_tid_t *tid); +token_t *au_to_subject32(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_subject64(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_subject_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_subject32_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_subject64_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_exec_args(const char **); +token_t *au_to_exec_env(const char **); +token_t *au_to_text(char *text); +token_t *au_to_trailer(int rec_size); + +#endif /* ! _BSM_TOKEN_H_ */ diff -urN xnu-344.49/bsd/sys/bsm_token.save.h xnu-517/bsd/sys/bsm_token.save.h --- xnu-344.49/bsd/sys/bsm_token.save.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/sys/bsm_token.save.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _BSM_TOKEN_H_ +#define _BSM_TOKEN_H_ + +#include +#include +#include +#include +#include + +/* We could determine the header and trailer sizes by + * defining appropriate structures. We hold off that approach + * till we have a consistent way of using structures for all tokens. + * This is not straightforward since these token structures may + * contain pointers to contents whose size we don't know + * (e.g. text tokens) + */ +#define HEADER_SIZE 18 +#define TRAILER_SIZE 7 + +#define ADD_U_CHAR(loc, val) \ + do {\ + *(loc) = (val);\ + (loc) += sizeof(u_char);\ + }while(0) + + +#define ADD_U_INT16(loc, val) \ + do { \ + memcpy((loc), (u_char *)&(val), sizeof(u_int16_t));\ + (loc) += sizeof(u_int16_t); \ + }while(0) + +#define ADD_U_INT32(loc, val) \ + do { \ + memcpy((loc), (u_char *)&(val), sizeof(u_int32_t));\ + (loc) += sizeof(u_int32_t); \ + }while(0) + +#define ADD_U_INT64(loc, val)\ + do {\ + memcpy((loc), (u_char *)&(val), sizeof(u_int64_t));\ + (loc) += sizeof(u_int64_t); \ + }while(0) + +#define ADD_MEM(loc, data, size) \ + do { \ + memcpy((loc), (data), (size));\ + (loc) += (size);\ + }while(0) + +#define ADD_STRING(loc, data, size) ADD_MEM(loc, data, size) + + +/* Various token id types */ + +/* + * Values inside the comments are not documented in the BSM pages and + * have been picked up from the header files + */ + +/* + * Values marked as XXX do not have a value defined in the BSM header files + */ + +/* + * Control token types + +#define AUT_OTHER_FILE ((char)0x11) 
+#define AUT_OTHER_FILE32 AUT_OTHER_FILE +#define AUT_OHEADER ((char)0x12) + + */ + +#define AUT_INVALID 0x00 +#define AU_FILE_TOKEN 0x11 +#define AU_TRAILER_TOKEN 0x13 +#define AU_HEADER_32_TOKEN 0x14 +#define AU_HEADER_EX_32_TOKEN 0x15 + + +/* + * Data token types +#define AUT_SERVER ((char)0x25) +#define AUT_SERVER32 AUT_SERVER + */ + +#define AU_DATA_TOKEN 0x21 +#define AU_ARB_TOKEN AU_DATA_TOKEN +#define AU_IPC_TOKEN 0x22 +#define AU_PATH_TOKEN 0x23 +#define AU_SUBJECT_32_TOKEN 0x24 +#define AU_PROCESS_32_TOKEN 0x26 +#define AU_RETURN_32_TOKEN 0x27 +#define AU_TEXT_TOKEN 0x28 +#define AU_OPAQUE_TOKEN 0x29 +#define AU_IN_ADDR_TOKEN 0x2A +#define AU_IP_TOKEN 0x2B +#define AU_IPORT_TOKEN 0x2C +#define AU_ARG32_TOKEN 0x2D +#define AU_SOCK_TOKEN 0x2E +#define AU_SEQ_TOKEN 0x2F + +/* + * Modifier token types + +#define AUT_ACL ((char)0x30) +#define AUT_LABEL ((char)0x33) +#define AUT_GROUPS ((char)0x34) +#define AUT_ILABEL ((char)0x35) +#define AUT_SLABEL ((char)0x36) +#define AUT_CLEAR ((char)0x37) +#define AUT_PRIV ((char)0x38) +#define AUT_UPRIV ((char)0x39) +#define AUT_LIAISON ((char)0x3A) + + */ + +#define AU_ATTR_TOKEN 0x31 +#define AU_IPCPERM_TOKEN 0x32 +#define AU_NEWGROUPS_TOKEN 0x3B +#define AU_EXEC_ARG_TOKEN 0x3C +#define AU_EXEC_ENV_TOKEN 0x3D +#define AU_ATTR32_TOKEN 0x3E + + +/* + * Command token types + */ + +#define AU_CMD_TOKEN 0x51 +#define AU_EXIT_TOKEN 0x52 + + + +/* + * Miscellaneous token types + +#define AUT_HOST ((char)0x70) + + */ + +/* + * 64bit token types + +#define AUT_SERVER64 ((char)0x76) +#define AUT_OTHER_FILE64 ((char)0x78) + + */ + +#define AU_ARG64_TOKEN 0x71 +#define AU_RETURN_64_TOKEN 0x72 +#define AU_ATTR64_TOKEN 0x73 +#define AU_HEADER_64_TOKEN 0x74 +#define AU_SUBJECT_64_TOKEN 0x75 +#define AU_PROCESS_64_TOKEN 0x77 + + + +/* + * Extended network address token types + */ + +#define AU_HEADER_EX_64_TOKEN 0x79 +#define AU_SUBJECT_32_EX_TOKEN 0x7a +#define AU_PROCESS_32_EX_TOKEN 0x7b +#define AU_SUBJECT_64_EX_TOKEN 0x7c +#define 
AU_PROCESS_64_EX_TOKEN 0x7d +#define AU_IN_ADDR_EX_TOKEN 0x7e +#define AU_SOCK_EX32_TOKEN 0x7f +#define AU_SOCK_EX128_TOKEN AUT_INVALID /*XXX*/ +#define AU_IP_EX_TOKEN AUT_INVALID /*XXX*/ + + +/* + * The values for the following token ids is not + * defined by BSM + */ +#define AU_SOCK_INET_32_TOKEN 0x80 /*XXX*/ +#define AU_SOCK_INET_128_TOKEN 0x81 /*XXX*/ + +/* print values for the arbitrary token */ +#define AUP_BINARY 0 +#define AUP_OCTAL 1 +#define AUP_DECIMAL 2 +#define AUP_HEX 3 +#define AUP_STRING 4 + + +/* data-types for the arbitrary token */ +#define AUR_BYTE 0 +#define AUR_SHORT 1 +#define AUR_LONG 2 + +/* ... and their sizes */ +#define AUR_BYTE_SIZE sizeof(u_char) +#define AUR_SHORT_SIZE sizeof(u_int16_t) +#define AUR_LONG_SIZE sizeof(u_int32_t) + +/* Modifiers for the header token */ +#define PAD_NOTATTR 0x4000 /* nonattributable event */ +#define PAD_FAILURE 0x8000 /* fail audit event */ + + +#define MAX_GROUPS 16 +#define HEADER_VERSION 1 +#define TRAILER_PAD_MAGIC 0xB105 + +/* BSM system calls */ + +#ifdef KERNEL +#else +int au_open(void); +int au_write(int d, token_t *m); +int au_close(int d, int keep, short event); +token_t *au_to_file(char *file); +token_t *au_to_header(int rec_size, au_event_t e_type, + au_emod_t e_mod); +token_t *au_to_header32(int rec_size, au_event_t e_type, + au_emod_t e_mod); +token_t *au_to_header64(int rec_size, au_event_t e_type, + au_emod_t e_mod); +token_t *au_to_me(void); +#endif /* !KERNEL */ + +token_t *au_to_arg(char n, char *text, u_int32_t v); +token_t *au_to_arg32(char n, char *text, u_int32_t v); +token_t *au_to_arg64(char n, char *text, u_int64_t v); +token_t *au_to_attr(struct vattr *attr); +token_t *au_to_attr32(struct vattr *attr); +token_t *au_to_attr64(struct vattr *attr); +token_t *au_to_data(char unit_print, char unit_type, + char unit_count, char *p); +token_t *au_to_exit(int retval, int err); +token_t *au_to_groups(int *groups); +token_t *au_to_newgroups(u_int16_t n, gid_t *groups); +token_t 
*au_to_in_addr(struct in_addr *internet_addr); +token_t *au_to_in_addr_ex(struct in6_addr *internet_addr); +token_t *au_to_ip(struct ip *ip); +token_t *au_to_ipc(char type, int id); +token_t *au_to_ipc_perm(struct ipc_perm *perm); +token_t *au_to_iport(u_int16_t iport); +token_t *au_to_opaque(char *data, u_int16_t bytes); +token_t *au_to_path(char *path); +token_t *au_to_process(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_process32(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_process64(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_process_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_process32_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_process64_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_return(char status, u_int32_t ret); +token_t *au_to_return32(char status, u_int32_t ret); +token_t *au_to_return64(char status, u_int64_t ret); +token_t *au_to_seq(long audit_count); +token_t *au_to_socket(struct socket *so); +token_t *au_to_socket_ex_32(struct socket *so); +token_t *au_to_socket_ex_128(struct socket *so); +token_t *au_to_sock_inet(struct sockaddr_in *so); +token_t *au_to_sock_inet32(struct sockaddr_in *so); +token_t *au_to_sock_inet128(struct sockaddr_in6 *so); +token_t *au_to_subject(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_subject32(au_id_t auid, uid_t euid, gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_subject64(au_id_t auid, uid_t euid, 
gid_t egid, + uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_t *tid); +token_t *au_to_subject_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_subject32_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_subject64_ex(au_id_t auid, uid_t euid, + gid_t egid, uid_t ruid, gid_t rgid, pid_t pid, + au_asid_t sid, au_tid_addr_t *tid); +token_t *au_to_exec_args(const char **); +token_t *au_to_exec_env(const char **); +token_t *au_to_text(char *text); +token_t *au_to_trailer(int rec_size); + +#endif /* ! _BSM_TOKEN_H_ */ diff -urN xnu-344.49/bsd/sys/bsm_uevents.h xnu-517/bsd/sys/bsm_uevents.h --- xnu-344.49/bsd/sys/bsm_uevents.h Thu Jan 1 01:00:00 1970 +++ xnu-517/bsd/sys/bsm_uevents.h Sat Oct 25 00:25:55 2003 @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _BSM_UEVENTS_H_ +#define _BSM_UEVENTS_H_ + +/* + * User level audit event numbers + * + * Range of audit event numbers: + * 0 Reserved, invalid + * 1 - 2047 Reserved for kernel events + * 2048 - 32767 Defined by BSM for user events + * 32768 - 36864 Reserved for Mac OS-X applications + * 36865 - 65535 Reserved for applications + * + */ +#define AUE_at_create 6144 +#define AUE_at_delete 6145 +#define AUE_at_perm 6146 +#define AUE_cron_invoke 6147 +#define AUE_crontab_create 6148 +#define AUE_crontab_delete 6149 +#define AUE_crontab_perm 6150 +#define AUE_inetd_connect 6151 +#define AUE_login 6152 +#define AUE_logout 6153 +#define AUE_telnet 6154 +#define AUE_rlogin 6155 +#define AUE_mountd_mount 6156 +#define AUE_mountd_umount 6157 +#define AUE_rshd 6158 +#define AUE_su 6159 +#define AUE_halt 6160 +#define AUE_reboot 6161 +#define AUE_rexecd 6162 +#define AUE_passwd 6163 +#define AUE_rexd 6164 +#define AUE_ftpd 6165 +#define AUE_init 6166 +#define AUE_uadmin 6167 +#define AUE_shutdown 6168 +#define AUE_poweroff 6169 +#define AUE_crontab_mod 6170 +#define AUE_allocate_succ 6200 +#define AUE_allocate_fail 6201 +#define AUE_deallocate_succ 6202 +#define AUE_deallocate_fail 6203 +#define AUE_listdevice_succ 6205 +#define AUE_listdevice_fail 6206 +#define AUE_create_user 6207 +#define AUE_modify_user 6208 +#define AUE_delete_user 6209 +#define AUE_disable_user 6210 +#define AUE_enable_user 6211 + +#endif /* !_BSM_UEVENTS_H_ */ diff -urN xnu-344.49/bsd/sys/buf.h xnu-517/bsd/sys/buf.h --- xnu-344.49/bsd/sys/buf.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/sys/buf.h Sat Oct 25 00:25:55 2003 @@ -251,6 +251,8 @@ struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); +int meta_breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, + struct ucred *, struct buf **)); void brelse __P((struct buf *)); void bremfree __P((struct buf *)); void 
bufinit __P((void)); diff -urN xnu-344.49/bsd/sys/cdefs.h xnu-517/bsd/sys/cdefs.h --- xnu-344.49/bsd/sys/cdefs.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/sys/cdefs.h Sat Oct 25 00:25:55 2003 @@ -133,10 +133,18 @@ */ #if defined(__MWERKS__) && (__MWERKS__ > 0x2400) /* newer Metrowerks compilers support __attribute__() */ -#elif !defined(__GNUC__) || __GNUC__ < 2 || \ - (__GNUC__ == 2 && __GNUC_MINOR__ < 5) +#elif __GNUC__ > 2 || __GNUC__ == 2 && __GNUC_MINOR__ >= 5 +#define __dead2 __attribute__((__noreturn__)) +#define __pure2 __attribute__((__const__)) +#if __GNUC__ == 2 && __GNUC_MINOR__ >= 5 && __GNUC_MINOR__ < 7 +#define __unused /* no attribute */ +#else +#define __unused __attribute__((__unused__)) +#endif +#else #define __attribute__(x) /* delete __attribute__ if non-gcc or gcc1 */ #if defined(__GNUC__) && !defined(__STRICT_ANSI__) +/* __dead and __pure are deprecated. Use __dead2 and __pure2 instead */ #define __dead __volatile #define __pure __const #endif @@ -147,9 +155,13 @@ #define __dead #define __pure #endif +#ifndef __dead2 +#define __dead2 +#define __pure2 +#define __unused +#endif -#define __IDSTRING(name,string) \ - static const char name[] __attribute__((__unused__)) = string +#define __IDSTRING(name,string) static const char name[] __unused = string #ifndef __COPYRIGHT #define __COPYRIGHT(s) __IDSTRING(copyright,s) diff -urN xnu-344.49/bsd/sys/conf.h xnu-517/bsd/sys/conf.h --- xnu-344.49/bsd/sys/conf.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/sys/conf.h Sat Oct 25 00:25:55 2003 @@ -67,6 +67,7 @@ #define _SYS_CONF_H_ 1 #include +#include /* * Definitions of device driver entry switches @@ -106,6 +107,13 @@ #define d_read_t read_write_fcn_t #define d_write_t read_write_fcn_t #define d_ioctl_t ioctl_fcn_t +#define d_stop_t stop_fcn_t +#define d_reset_t reset_fcn_t +#define d_select_t select_fcn_t +#define d_mmap_t mmap_fcn_t +#define d_strategy_t strategy_fcn_t +#define d_getc_t getc_fcn_t +#define d_putc_t putc_fcn_t __BEGIN_DECLS int enodev (); 
/* avoid actual prototype for multiple use */ @@ -201,7 +209,7 @@ { \ eno_opcl, eno_opcl, eno_rdwrt, eno_rdwrt, \ eno_ioctl, eno_stop, eno_reset, 0, \ - seltrue, eno_mmap, eno_strat, eno_getc, \ + (select_fcn_t *)seltrue, eno_mmap, eno_strat, eno_getc, \ eno_putc, 0 \ } #endif /* KERNEL */ diff -urN xnu-344.49/bsd/sys/disk.h xnu-517/bsd/sys/disk.h --- xnu-344.49/bsd/sys/disk.h Thu Sep 18 21:01:31 2003 +++ xnu-517/bsd/sys/disk.h Sat Oct 25 00:25:55 2003 @@ -29,33 +29,82 @@ #include #include +/* + * Definitions + * + * ioctl description + * -------------------------------- -------------------------------------------- + * DKIOCEJECT eject media + * DKIOCSYNCHRONIZECACHE flush media + * + * DKIOCFORMAT format media + * DKIOCGETFORMATCAPACITIES get media's formattable capacities + * + * DKIOCGETBLOCKSIZE get media's block size + * DKIOCGETBLOCKCOUNT get media's block count + * DKIOCGETFIRMWAREPATH get media's firmware path + * + * DKIOCISFORMATTED is media formatted? + * DKIOCISWRITABLE is media writable? 
+ * + * DKIOCGETMAXBLOCKCOUNTREAD get maximum block count for reads + * DKIOCGETMAXBLOCKCOUNTWRITE get maximum block count for writes + * DKIOCGETMAXBYTECOUNTREAD get maximum byte count for reads + * DKIOCGETMAXBYTECOUNTWRITE get maximum byte count for writes + * DKIOCGETMAXSEGMENTCOUNTREAD get maximum segment count for reads + * DKIOCGETMAXSEGMENTCOUNTWRITE get maximum segment count for writes + * DKIOCGETMAXSEGMENTBYTECOUNTREAD get maximum segment byte count for reads + * DKIOCGETMAXSEGMENTBYTECOUNTWRITE get maximum segment byte count for writes + */ + typedef struct { char path[128]; } dk_firmware_path_t; -#define DKIOCEJECT _IO('d', 21) -#define DKIOCSYNCHRONIZECACHE _IO('d', 22) +typedef struct +{ + u_int64_t blockCount; + u_int32_t blockSize; + + u_int8_t reserved0096[4]; /* reserved, clear to zero */ +} dk_format_capacity_t; + +typedef struct +{ + dk_format_capacity_t * capacities; + u_int32_t capacitiesCount; /* use zero to probe count */ + + u_int8_t reserved0064[8]; /* reserved, clear to zero */ +} dk_format_capacities_t; + +#define DKIOCEJECT _IO('d', 21) +#define DKIOCSYNCHRONIZECACHE _IO('d', 22) + +#define DKIOCFORMAT _IOW('d', 26, dk_format_capacity_t) +#define DKIOCGETFORMATCAPACITIES _IOWR('d', 26, dk_format_capacities_t) -#define DKIOCGETBLOCKSIZE _IOR('d', 24, u_int32_t) -#define DKIOCGETBLOCKCOUNT _IOR('d', 25, u_int64_t) -#define DKIOCGETBLOCKCOUNT32 _IOR('d', 25, u_int32_t) -#define DKIOCGETFIRMWAREPATH _IOR('d', 28, dk_firmware_path_t) +#define DKIOCGETBLOCKSIZE _IOR('d', 24, u_int32_t) +#define DKIOCGETBLOCKCOUNT _IOR('d', 25, u_int64_t) +#define DKIOCGETFIRMWAREPATH _IOR('d', 28, dk_firmware_path_t) -#define DKIOCISFORMATTED _IOR('d', 23, u_int32_t) -#define DKIOCISWRITABLE _IOR('d', 29, u_int32_t) +#define DKIOCISFORMATTED _IOR('d', 23, u_int32_t) +#define DKIOCISWRITABLE _IOR('d', 29, u_int32_t) -#define DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, u_int64_t) -#define DKIOCGETMAXBLOCKCOUNTWRITE _IOR('d', 65, u_int64_t) +#define 
DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, u_int64_t) +#define DKIOCGETMAXBLOCKCOUNTWRITE _IOR('d', 65, u_int64_t) #define DKIOCGETMAXBYTECOUNTREAD _IOR('d', 70, u_int64_t) #define DKIOCGETMAXBYTECOUNTWRITE _IOR('d', 71, u_int64_t) -#define DKIOCGETMAXSEGMENTCOUNTREAD _IOR('d', 66, u_int64_t) -#define DKIOCGETMAXSEGMENTCOUNTWRITE _IOR('d', 67, u_int64_t) +#define DKIOCGETMAXSEGMENTCOUNTREAD _IOR('d', 66, u_int64_t) +#define DKIOCGETMAXSEGMENTCOUNTWRITE _IOR('d', 67, u_int64_t) #define DKIOCGETMAXSEGMENTBYTECOUNTRE