Added the new version for dummynet.
marta [Wed, 6 Jan 2010 19:18:48 +0000 (19:18 +0000)]
The new code is located in the dummynet2 directory and the spec file
was changed to use this latest version.

Major changes related to PlanetLab are the new table lookup support,
a small fix to accept packets after reinjection, and code cleanup.
The new table lookup support will allow a PlanetLab user to jump
directly to their own rule section, without scanning the whole
ruleset list.

41 files changed:
Makefile
Makefile.openwrt
README
dummynet/Makefile
dummynet/bsd_compat.c
dummynet/include/sys/kernel.h
dummynet/include/sys/mbuf.h
dummynet/include/sys/module.h
dummynet/ip_dummynet.c
dummynet/ip_fw2.c
dummynet/ip_fw_pfil.c
dummynet/ipfw2_mod.c
dummynet/missing.h
dummynet/radix.c
dummynet2/Makefile [new file with mode: 0644]
dummynet2/bsd_compat.c [new file with mode: 0644]
dummynet2/in_cksum.c [new file with mode: 0644]
dummynet2/include/netgraph/ng_ipfw.h [new file with mode: 0644]
dummynet2/include/netinet/ip_dummynet.h [new file with mode: 0644]
dummynet2/include/netinet/ip_fw.h [new file with mode: 0644]
dummynet2/include/netinet/ipfw/ip_fw_private.h [new file with mode: 0644]
dummynet2/ip_dummynet.c [new file with mode: 0644]
dummynet2/ip_fw2.c [new file with mode: 0644]
dummynet2/ip_fw_dynamic.c [new file with mode: 0644]
dummynet2/ip_fw_log.c [new file with mode: 0644]
dummynet2/ip_fw_lookup.c [new file with mode: 0644]
dummynet2/ip_fw_nat.c [new file with mode: 0644]
dummynet2/ip_fw_pfil.c [new file with mode: 0644]
dummynet2/ip_fw_sockopt.c [new file with mode: 0644]
dummynet2/ip_fw_table.c [new file with mode: 0644]
dummynet2/ipfw2_mod.c [new file with mode: 0644]
dummynet2/missing.h [new file with mode: 0644]
dummynet2/radix.c [new file with mode: 0644]
glue.h
ipfw/Makefile
ipfw/dummynet.c
ipfw/glue.c
ipfw/include_e/libutil.h [new file with mode: 0644]
ipfw/include_e/sys/sockio.h [new file with mode: 0644]
ipfw/ipfw2.c
planetlab/ipfwroot.spec

index f863838..51a00a9 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,7 @@ all clean distclean:
        echo target is $(@)
        (cd ipfw && $(MAKE) $(@) )
        (cd dummynet && $(MAKE) $(@) )
+       (cd dummynet2 && $(MAKE) $(@) )
 
 snapshot:
        (cd ..; tar cvzhf /tmp/$(SNAPSHOT_NAME).tgz --exclude .svn \
index 50dae83..b618a52 100644 (file)
@@ -44,7 +44,9 @@ define Build/Prepare
   # $(warning Preparing ipfw sources)
        mkdir -p $(PKG_BUILD_DIR)
        $(CP) -Rp $(IPFW_DIR)/* $(PKG_BUILD_DIR)/
+       (cd $(PKG_BUILD_DIR)/ipfw && $(MAKE) include_e )
        (cd $(PKG_BUILD_DIR)/dummynet && $(MAKE) include_e )
+       (cd $(PKG_BUILD_DIR)/dummynet2 && $(MAKE) include_e )
 endef
 
 define Build/Compile
@@ -54,10 +56,15 @@ define Build/Compile
                ARCH="$(LINUX_KARCH)" \
                SUBDIRS="$(PKG_BUILD_DIR)/dummynet" \
                VER=openwrt modules
+       $(MAKE) -C "$(LINUX_DIR)" \
+               CROSS_COMPILE="$(TARGET_CROSS)" \
+               ARCH="$(LINUX_KARCH)" \
+               SUBDIRS="$(PKG_BUILD_DIR)/dummynet2" \
+               VER=openwrt modules
        # compile the userland part for openwrt
        $(MAKE) -C $(PKG_BUILD_DIR)/ipfw \
                $(TARGET_CONFIGURE_OPTS) \
-               CFLAGS="$(TARGET_CFLAGS) -I./include -include ../glue.h" \
+               CFLAGS="$(TARGET_CFLAGS) -I./include_e -I./include -include ../glue.h" \
                VER=openwrt all
 endef
 
diff --git a/README b/README
index 7ab66bf..0c3b4e8 100644 (file)
--- a/README
+++ b/README
@@ -14,6 +14,9 @@ version in RELENG_7 and HEAD as of December 2009), plus some glue code
 and headers written from scratch.
 Unless specified otherwise, all the code here is under a BSD license.
 
+Note:
+       - the linux version lacks the "one_pass" feature
+
 =================== BUILD INSTRUCTIONS ==========================
 
 ***** Linux 2.6.x ******
@@ -35,6 +38,10 @@ Unless specified otherwise, all the code here is under a BSD license.
            Networking options  --->
               [*] Network packet filtering framework (Netfilter)
 
+       If you have not yet compiled your kernel source, you need to
+       prepare the build environment:
+
+       (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)
 
 ***** Linux 2.4.x *****
 
@@ -114,6 +121,10 @@ Unless specified otherwise, all the code here is under a BSD license.
     rmmod ipfw_mod.o                            # remove the module
 
 ***** PLANETLAB BUILD (within a slice) *****
+These instructions can be used by PlanetLab developers to compile the dummynet module
+on a node. To install the module on the node users need root access in root context.
+PlanetLab users that want to use the dummynet package should ask PlanetLab support
+for nodes with dummynet emulation capabilities.
 
     Follow the instructions below. You can just cut&paste
 
index cac1958..6c6d9f6 100644 (file)
@@ -5,7 +5,6 @@
 #
 # The defaults are set to build without modifications on PlanetLab
 # and possibly 2.6 versions.
-#
 
 # Some variables need to have specific names, because they are used
 # by the build infrastructure on Linux and OpenWrt. They are:
@@ -33,27 +32,40 @@ $(warning including dummynet/Makefile)
 # lets default for 2.6 for planetlab builds
 VER ?= 2.6
 
-# General values
+#--- General values for all types of build ---
+# obj-m is the target module
 obj-m := ipfw_mod.o
 
+#-- the list of source files. IPFW_SRCS is our own name.
+# Original ipfw and dummynet sources + FreeBSD stuff,
+IPFW_SRCS = ip_fw2.c ip_dummynet.c ip_fw_pfil.c in_cksum.c
+IPFW_SRCS += radix.c 
+# Module glue and functions missing in linux
+IPFW_SRCS += ipfw2_mod.c bsd_compat.c
+
 # generic cflags used on all systems
 #ipfw-cflags += -DIPFW_HASHTABLES
-ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT -DTRACE
+ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT
 # _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix)
 ipfw-cflags += -D_BSD_SOURCE
 ipfw-cflags += -DKERNEL_MODULE # build linux kernel module
 # the two header trees for empty and override files
-ipfw-cflags += -I $(M)/include_e -I $(M)/include
+ipfw-cflags += -I $(M)/include_e
+ipfw-cflags += -I $(M)/include
 ipfw-cflags += -include $(M)/../glue.h # headers
+ipfw-cflags += -include $(M)/missing.h # headers
 
 $(warning "---- Building dummynet kernel module for Version $(VER)")
+
 # We have three sections for OpenWrt, Linux 2.4 and Linux 2.6
-#
+
 ifeq ($(VER),openwrt)
+  #--- The Makefile section for openwrt ---
+  # We do not include a dependency on include_e as it is called
+  # by Makefile.openwrt in Build/Prepare
   M=.
-  obj-y := ipfw2_mod.o bsd_compat.o \
-       in_cksum.o ip_dummynet.o ip_fw2.o ip_fw_pfil.o radix.o
-  O_TARGET := ipfw_mod.o
+  obj-y := $(IPFW_SRCS:%.c=%.o)
+  O_TARGET := $(obj-m)
 
   # xcflags-y is a temporary variable where we store build options
   xcflags-y += -O1 -DLINUX_24
@@ -72,22 +84,22 @@ else        # !openwrt, below we do linux builds for 2.4 and 2.6
   # We can override it from the command line, or let the system guess.
 
 ifneq ($(shell echo $(VER)|grep '2.4'),)
-  # The linux 2.4 version
+  # Makefile section for the linux 2.4 version
+  # tested on linux-2.4.35.4, does not work with 2.4.37
+  #
   # guess the kernel path -- or is it under /lib/modules ?
-  KERNELPATH ?= /usr/src/`uname -r`/build
-
-  # Guess the gcc include directory
-  # The gcc version is in the last line returned by gcc -v
-  # gcc version 4.3.2 (Debian 4.3.2-1.1)
-  MYGCC_VER ?= $(shell gcc -v 2>&1 |tail -n 1 | cut -d " " -f 3)
-  # We don't know the exact directory unde /usr/lib/gcc so we guess
+  KERNELPATH ?= /usr/src/`uname -r`
+
+  # We need to figure out the gcc include directory, if not
+  # set by the user through MYGCC_INCLUDE
+  # Find compiler version (3rd field in last line returned by gcc -v)
+  # e.g.       gcc version 4.3.2 (Debian 4.3.2-1.1)
+  MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3)
+  # We don't know the exact directory under /usr/lib/gcc so we guess
   MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include
   $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)")
 
   # additional warning
-  #WARN = -Wp,-MD,/home/luigi/ports-luigi/dummynet-branches/ipfw_mod/dummynet/.ipfw2_mod.o.d
-  #WARN += -Iinclude  -include include/linux/autoconf.h
-
   WARN += -Wall -Wundef
   WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
   WARN += -fno-common -Werror-implicit-function-declaration
@@ -96,22 +108,29 @@ ifneq ($(shell echo $(VER)|grep '2.4'),)
   WARN += -m32 -msoft-float # -mregparm=3
   #WARN += -freg-struct-return -mpreferred-stack-boundary=2
   WARN += -Wno-sign-compare
-  WARN += -Wdeclaration-after-statement -Wno-pointer-sign
+  WARN += -Wdeclaration-after-statement
+  ifneq ($(MYGCC_VER),3.4.6)
+       WARN += -Wno-pointer-sign
+  endif
 
   ccflags-y += -O1 -DLINUX_24
   CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \
-       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) ${ccflags-y}
+       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \
+       ${ccflags-y}
   # The Main target
 all: mod24
 
-else
+else # !2.4 --
+
+  # This is the Makefile section for Linux 2.6.x including planetlab
+
 ifeq ($(IPFW_PLANETLAB),1)
   $(warning "---- Building for PlanetLab")
   ipfw-cflags += -DIPFW_PLANETLAB        # PlanetLab compilation
 endif
   # if not set, use the version from the installed system
   KERNELPATH ?= /lib/modules/`uname -r`/build
-  # the latest kernel
+  # Otherwise, if you have kernel sources, try something like this:
   #KERNELPATH = /usr/src/linux-2.6.22
   $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)")
   WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES
@@ -119,34 +138,39 @@ endif
 
   # Required by kernel <= 2.6.22, ccflags-y is used on newer version
   LINUX_VERSION_CODE := $(shell grep LINUX_VERSION_CODE $(KERNELPATH)/include/linux/version.h|cut -d " " -f3)
-  ifeq ($(LINUX_VERSION_CODE),132630)
+  ifeq ($(shell if [ -z $(LINUX_VERSION_CODE) ] ; then echo "true"; fi),true)
+    $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)");
+  endif
+  ifeq ($(shell if [ $(LINUX_VERSION_CODE) -le 132630 ] ; then echo "true"; fi),true)
     EXTRA_CFLAGS += $(ccflags-y)
   endif
 
 all: include_e
        $(MAKE) -C $(KERNELPATH) V=1 M=`pwd` modules
-endif
+endif # !2.4
 
-#-- back to the common section of code
+#-- back to the common section of code for Linux 2.4 and 2.6
 
 # the list of objects used to build the module
 ipfw_mod-y = $(IPFW_SRCS:%.c=%.o)
 
-# Original ipfw and dummynet sources + FreeBSD stuff,
-IPFW_SRCS = ip_fw2.c ip_dummynet.c ip_fw_pfil.c in_cksum.c
-IPFW_SRCS += radix.c 
-# Module glue and functions missing in linux
-IPFW_SRCS += ipfw2_mod.c bsd_compat.c hashtable.c
-
 # additional $(CC) flags
 ccflags-y += $(WARN)
 ccflags-y += $(ipfw-cflags)
+# if we really want debug symbols...
 ccflags-y += -g
 
 mod24: include_e $(obj-m)
 
 $(obj-m): $(ipfw_mod-y)
        $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^
+
+# M is the current directory, used in recursive builds
+# so we allow it to be overridden
+M ?= $(shell pwd)
+endif # !openwrt
+
+#--- various common targets
 clean:
        -rm -f *.o *.ko Module.symvers *.mod.c
        -rm -rf include_e
@@ -172,6 +196,7 @@ EFILES += net/vnet.h
 
 EFILES += netinet/ether.h netinet/icmp6.h netinet/if_ether.h
 EFILES += netinet/in.h netinet/in_pcb.h netinet/in_var.h
+EFILES += netinet/in_systm.h
 EFILES += netinet/ip_carp.h netinet/ip_var.h netinet/pim.h
 EFILES += netinet/sctp.h netinet/tcp_timer.h netinet/tcpip.h
 EFILES += netinet/udp_var.h
@@ -184,14 +209,13 @@ EFILES += sys/limits.h sys/lock.h sys/mutex.h sys/priv.h
 EFILES += sys/proc.h sys/rwlock.h sys/socket.h sys/socketvar.h
 EFILES += sys/sysctl.h sys/time.h sys/ucred.h
 
-M ?= $(shell pwd)
 include_e:
        echo "running in $M"
        -@rm -rf $(M)/include_e opt_*
        -@mkdir -p $(M)/include_e
        -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
 
-endif # !openwrt
 
+#--- some other targets for testing purposes
 test_radix: test_radix.o radix.o
-test_radix: CFLAGS=-Wall -Werror -O1
+test_radix: CFLAGS=-Wall -Werror -O2
index 995d60c..cad3c5d 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * $Id$
+ * $Id: bsd_compat.c 4508 2009-12-15 21:54:14Z luigi $
  *
  * kernel variables and functions that are not available in linux.
  */
@@ -32,7 +32,6 @@
 #include <sys/cdefs.h>
 #include <asm/div64.h> /* do_div on 2.4 */
 #include <linux/random.h>      /* get_random_bytes on 2.4 */
-#include "missing.h"
 
 /*
  * gettimeofday would be in sys/time.h but it is not
index 61b3bec..fbc9581 100644 (file)
@@ -5,7 +5,13 @@
 #define _SYS_KERNEL_H_
 
 #define SYSINIT(a, b, c, d, e)  \
-        void *dummy_ ## d = d
+        void *sysinit_ ## d = d
+#define VNET_SYSINIT(a, b, c, d, e)  \
+        void *sysinit_ ## d = d
+#define SYSUNINIT(a, b, c, d, e)  \
+        void *sysuninit_ ## d = d
+#define VNET_SYSUNINIT(a, b, c, d, e)  \
+        void *sysuninit_ ## d = d
 
 /*
  * Some enumerated orders; "ANY" sorts last.
index ed3d3a1..12837bf 100644 (file)
@@ -108,10 +108,20 @@ m_tag_prepend(struct mbuf *m, struct m_tag *t)
 }
 
 /*
+ * Return the next tag in the list of tags associated with an mbuf.
+ */
+static __inline struct m_tag *
+m_tag_next(struct mbuf *m, struct m_tag *t)
+{
+        return (SLIST_NEXT(t, m_tag_link));
+}
+
+/*
  * Create an mtag of the given type
  */
 static __inline struct m_tag *
-m_tag_get(int type, int length, int wait)
+m_tag_alloc(uint32_t cookie, int type, int length, int wait)
 {
        int l = length + sizeof(struct m_tag);
        struct m_tag *m = malloc(l, 0, M_NOWAIT);
@@ -119,11 +129,18 @@ m_tag_get(int type, int length, int wait)
                memset(m, 0, l);
                m->m_tag_id = type;
                m->m_tag_len = length;
+               m->m_tag_cookie = cookie;
        }
        return m;
 };
 
 static __inline struct m_tag *
+m_tag_get(int type, int length, int wait)
+{
+       return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
+}
+
+static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
        return SLIST_FIRST(&m->m_pkthdr.tags);
@@ -140,6 +157,7 @@ m_tag_locate(struct mbuf *m, u_int32_t n, int x, struct m_tag *t)
        return NULL;
 };
 
+#define M_SETFIB(_m, _fib)     /* nothing on linux */
 static __inline void
 m_freem(struct mbuf *m)
 {
@@ -156,7 +174,7 @@ m_freem(struct mbuf *m)
 };
 
 /* we cannot pullup */
-#define m_pullup(__m, __i)     (m)
+//#define m_pullup(__m, __i)   (m)
 
 #define M_GETFIB(_m)   0
 
index 5296517..85bf220 100644 (file)
@@ -19,7 +19,6 @@ typedef struct moduledata {
         void            *priv;          /* extra data */
 } moduledata_t;
 
-int my_mod_register(struct moduledata *mod, const char *name, int order);
 /*
  * Hook the module descriptor, md, into our list of things to do.
  * We should in principle respect the order of loading.
index 0b23881..9fd70e2 100644 (file)
@@ -56,8 +56,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.110.2.4 2008/10/31 12:58:1
  * include files marked with XXX are probably not needed
  */
 
-#include "missing.h"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
index 4e46566..21d1b41 100644 (file)
@@ -70,11 +70,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw2.c,v 1.175.2.13 2008/10/30 16:29:04 bz
 #include <net/pf_mtag.h>
 #include <net/vnet.h>
 
-#ifdef linux
-#define INP_LOCK_ASSERT                /* define before missing.h otherwise ? */
-#include "missing.h"
-#endif
-
 #define        IPFW_INTERNAL   /* Access to protected data structures in ip_fw.h. */
 
 #include <netinet/in.h>
@@ -104,10 +99,6 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw2.c,v 1.175.2.13 2008/10/30 16:29:04 bz
 
 #include <machine/in_cksum.h>  /* XXX for in_cksum */
 
-#ifdef IPFW_HASHTABLES
-#include "hashtable.h"
-#endif
-
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
@@ -183,18 +174,14 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
 SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
     CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
     "Set upper limit of matches of ipfw rules logged");
-static unsigned int dummy_default_rule = IPFW_DEFAULT_RULE;
+unsigned int dummy_default_rule = IPFW_DEFAULT_RULE;
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
     &dummy_default_rule, IPFW_DEFAULT_RULE,
     "The default/max possible rule number.");
-static unsigned int dummy_tables_max = IPFW_TABLES_MAX;
+unsigned int dummy_tables_max = IPFW_TABLES_MAX;
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
     &dummy_tables_max, IPFW_TABLES_MAX,
     "The maximum number of tables.");
-static unsigned int skipto_entries = 256;
-SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, skipto_entries,
-    CTLFLAG_RW, &skipto_entries, 0,
-    "Number of entries in the skipto cache");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
     &default_to_accept, 0,
     "Make the default rule accept all packets.");
@@ -1886,61 +1873,6 @@ send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
        args->m = NULL;
 }
 
-static void
-set_skipto_table(struct ip_fw_chain *ch)
-{
-       int i, n, sh;
-       struct ip_fw *f, **t, **oldt;
-
-       for (sh = 15; sh > 0; sh--)
-               if (skipto_entries > 1<<sh)
-                       break;
-       sh++;
-       skipto_entries = 1<< (16 - sh) ;
-       /* XXX unsafe and too long */
-       t = malloc(skipto_entries * sizeof(*t), M_IPFW_TBL, M_WAITOK | M_ZERO);
-       if (t == NULL)
-               return;
-       IPFW_RLOCK(ch);
-       /* Store pointers in the table. In the loop i is the next
-        * free slot, n is the slot where the current rule goes.
-        */
-       for (i = 0, f = ch->rules; f; f = f->next) {
-               n = f->rulenum >> sh ;
-               while (i <= n)
-                       t[i++] = f;
-       }
-       V_layer3_chain.skipto_shift = sh;
-       V_layer3_chain.skipto_size = skipto_entries;
-       oldt = V_layer3_chain.skipto_ptrs;
-       V_layer3_chain.skipto_ptrs = t;
-       IPFW_RUNLOCK(ch);
-       if (oldt) {
-               IPFW_WLOCK(ch);
-               IPFW_WUNLOCK(ch);
-               /* now can free oldt */
-               free(oldt, M_IPFW_TBL);
-       }
-}
-#if 0
-/*
- * Map a rule number to a rule pointer, using the skipto table.
- * First lookup the slot, then follow the chain until we find a
- * non-null entry with rulenum >= num. Return default_rule on error.
- */
-static struct ip_fw *
-rule2ptr(struct ip_fw_chain *ch, int num)
-{
-       struct ip_fw *r = NULL;
-       int ix = (num & 0xffff) >> ch->skipto_shift;
-
-       while (ix < ch->skipto_size && (r = ch->skipto_ptrs[ix]) == NULL)
-               ix++;
-       while (r && num < r->rulenum)
-               r = r->next;
-       return (r ? r : ch->default_rule);
-}
-#endif
 /**
  *
  * Given an ip_fw *, lookup_next_rule will return a pointer
@@ -1957,10 +1889,11 @@ rule2ptr(struct ip_fw_chain *ch, int num)
  */
 
 static struct ip_fw *
-lookup_next_rule(struct ip_fw_chain *ch, struct ip_fw *me, uint32_t tablearg)
+lookup_next_rule(struct ip_fw *me, u_int32_t tablearg)
 {
        struct ip_fw *rule = NULL;
        ipfw_insn *cmd;
+       u_int16_t       rulenum;
 
        /* look for action, in case it is a skipto */
        cmd = ACTION_PTR(me);
@@ -1970,19 +1903,21 @@ lookup_next_rule(struct ip_fw_chain *ch, struct ip_fw *me, uint32_t tablearg)
                cmd += F_LEN(cmd);
        if (cmd->opcode == O_TAG)
                cmd += F_LEN(cmd);
-       if (cmd->opcode != O_SKIPTO ) {
-               rule = me->next;
-       } else {
-               tablearg = tablearg ? tablearg : cmd->arg1;
+       if (cmd->opcode == O_SKIPTO ) {
+               if (tablearg != 0) {
+                       rulenum = (u_int16_t)tablearg;
+               } else {
+                       rulenum = cmd->arg1;
+               }
                for (rule = me->next; rule ; rule = rule->next) {
-                       if (rule->rulenum >= tablearg) {
+                       if (rule->rulenum >= rulenum) {
                                break;
                        }
                }
-
-//             rule = rule2ptr(ch, tablearg ? tablearg : cmd->arg1);
        }
-       me->next_rule = rule; /* XXX perhaps unnecessary ? */
+       if (rule == NULL)               /* failure or not a skipto */
+               rule = me->next;
+       me->next_rule = rule;
        return rule;
 }
 
@@ -1994,11 +1929,6 @@ add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
        struct table_entry *ent;
        struct radix_node *rn;
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2037,11 +1967,6 @@ del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
        struct table_entry *ent;
        struct sockaddr_in sa, mask;
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2085,11 +2010,6 @@ flush_table(struct ip_fw_chain *ch, uint16_t tbl)
 
        IPFW_WLOCK_ASSERT(ch);
 
-#ifdef IPFW_HASHTABLES
-       if (tbl >= 2*IPFW_TABLES_MAX)
-               return EINVAL;
-       return EINVAL; // XXX to be completed
-#endif
        if (tbl >= IPFW_TABLES_MAX)
                return (EINVAL);
        rnh = ch->tables[tbl];
@@ -2107,10 +2027,6 @@ flush_tables(struct ip_fw_chain *ch)
 
        for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
                flush_table(ch, tbl);
-#ifdef IPFW_HASHTABLES
-       for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
-               ch->hashtab[tbl] = ipfw_ht_destroy(ch->hashtab[tbl]);
-#endif
 }
 
 static int
@@ -2127,10 +2043,6 @@ init_tables(struct ip_fw_chain *ch)
                        return (ENOMEM);
                }
        }
-#ifdef IPFW_HASHTABLES
-        for (i = 0; i < IPFW_TABLES_MAX; i++)
-               ch->hashtab[i] = ipfw_ht_destroy(ch->hashtab[i]);
-#endif
        return (0);
 }
 
@@ -2767,7 +2679,7 @@ do {                                                                      \
                        f = args->rule->next_rule;
 
                if (f == NULL)
-                       f = lookup_next_rule(chain, args->rule, 0);
+                       f = lookup_next_rule(args->rule, 0);
        } else {
                /*
                 * Find the starting rule. It can be either the first
@@ -2984,7 +2896,7 @@ do {                                                                      \
                                            a = dst_port;
                                        else if (v == 3)
                                            a = src_port;
-                                       else if (v >= 4 && v <= 6) {
+                                       else if (v == 4 || v == 5) {
                                            check_uidgid(
                                                    (ipfw_insn_u32 *)cmd,
                                                    proto, oif,
@@ -2994,16 +2906,12 @@ do {                                                                    \
 #ifdef linux
                                            if (v ==4 /* O_UID */)
                                                a = ucred_cache.uid;
-                                           else if (v == 5 /* O_GID */)
-                                               a = ucred_cache.gid;
-                                           else if (v == 6 /* O_JAIL */)
+                                           else if (v == 5 /* O_JAIL */)
                                                a = ucred_cache.xid;
 #else
                                            if (v ==4 /* O_UID */)
                                                a = (*uc)->cr_uid;
-                                           else if (v == 5 /* O_GID */)
-                                               ; // a = groupmember((gid_t)insn->d[0], *uc);
-                                           else if (v == 6 /* O_JAIL */)
+                                           else if (v == 5 /* O_JAIL */)
                                                a = (*uc)->cr_prison->pr_id;
 #endif
                                        } else
@@ -3590,10 +3498,10 @@ do {                                                                    \
                                }
                                /* handle skipto */
                                if (cmd->arg1 == IP_FW_TABLEARG) {
-                                       f = lookup_next_rule(chain, f, tablearg);
-                               } else {
+                                       f = lookup_next_rule(f, tablearg);
+                               } else { // XXX ?
                                        if (f->next_rule == NULL)
-                                               lookup_next_rule(chain, f, 0);
+                                               lookup_next_rule(f, 0);
                                        f = f->next_rule;
                                }
                                /*
@@ -3883,15 +3791,17 @@ add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
                goto done;
         }
 
+       /*
+        * If rulenum is 0, find highest numbered rule before the
+        * default rule, and add autoinc_step
+        */
        if (V_autoinc_step < 1)
                V_autoinc_step = 1;
        else if (V_autoinc_step > 1000)
                V_autoinc_step = 1000;
        if (rule->rulenum == 0) {
                /*
-                * If rulenum is 0, use highest numbered rule before
-                * the default, adding autoinc_step if room.
-                * Also set the number in the caller.
+                * locate the highest numbered rule before default
                 */
                for (f = chain->rules; f; f = f->next) {
                        if (f->rulenum == IPFW_DEFAULT_RULE)
@@ -3905,7 +3815,6 @@ add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 
        /*
         * Now insert the new rule in the right place in the sorted list.
-        * XXX TODO also put in the skipto table.
         */
        for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
                if (f->rulenum > rule->rulenum) { /* found the location */
@@ -3958,7 +3867,6 @@ remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
                prev->next = n;
        V_static_count--;
        V_static_len -= l;
-       // XXX remove from the skipto table
 
        rule->next = chain->reap;
        chain->reap = rule;
@@ -5089,17 +4997,12 @@ ipfw_destroy(void)
        IPFW_WUNLOCK(&V_layer3_chain);
        if (reap != NULL)
                reap_rules(reap);
-       IPFW_DYN_LOCK_DESTROY();
        uma_zdestroy(ipfw_dyn_rule_zone);
+       IPFW_DYN_LOCK_DESTROY();
        if (V_ipfw_dyn_v != NULL)
                free(V_ipfw_dyn_v, M_IPFW);
        IPFW_LOCK_DESTROY(&V_layer3_chain);
 
-#ifdef INET6
-       /* Free IPv6 fw sysctl tree. */
-       sysctl_ctx_free(&ip6_fw_sysctl_ctx);
-#endif
-
        printf("IP firewall unloaded\n");
 }
 
@@ -5154,8 +5057,6 @@ vnet_ipfw_init(const void *unused)
        IPFW_LOCK_INIT(&V_layer3_chain);
        callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
 
-       set_skipto_table(&V_layer3_chain);
-
        bzero(&default_rule, sizeof default_rule);
        default_rule.act_ofs = 0;
        default_rule.rulenum = IPFW_DEFAULT_RULE;
index 368192a..b3fcba6 100644 (file)
@@ -43,17 +43,20 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_fw_pfil.c,v 1.25.2.2 2008/04/25 10:26:30
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
+#include <sys/ucred.h>
 
 #include <net/if.h>
+#include <net/route.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
-#include "missing.h"
-
 #include <netinet/in.h>
+#include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
index 667d487..4b7edd1 100644 (file)
@@ -49,8 +49,6 @@
 #include <sys/mbuf.h>                  /* sizeof struct mbuf */
 #include <sys/param.h>                 /* NGROUPS */
 
-#include "missing.h"
-
 #ifdef __linux__
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -407,7 +405,7 @@ ipfw2_queue_handler(QH_ARGS)
        }
 
        if (m != NULL) {        /* Accept. reinject and free the mbuf */
-               REINJECT(info, NF_STOP);
+               REINJECT(info, NF_ACCEPT);
                m_freem(m);
        } else if (ret == 0) {
                /* dummynet has kept the packet, will reinject later. */
@@ -502,7 +500,7 @@ linux_lookup(const int proto, const __be32 saddr, const __be16 sport,
        if (proto != IPPROTO_TCP)       /* XXX extend for UDP */
                return -1;
 
-       if ((dir ? (void *)skb->dst : (void *)skb->dev) == NULL) {
+       if ((dir ? (void *)skb_dst(skb) : (void *)skb->dev) == NULL) {
                panic(" -- this should not happen\n");
                return -1;
        }
index d18f503..5b04dce 100644 (file)
@@ -33,6 +33,8 @@
 #ifndef _MISSING_H_
 #define _MISSING_H_
 
+#include <sys/cdefs.h>
+
 #ifdef _WIN32
 
 #ifndef DEFINE_SPINLOCK
@@ -50,6 +52,7 @@
 
 #else  /* __linux__ */
 
+#define MALLOC_DECLARE(x)      /* nothing */
 #include <linux/time.h>                /* do_gettimeofday */
 #include <netinet/ip.h>                /* local version */
 struct inpcb;
@@ -122,7 +125,11 @@ struct malloc_type {
 
 #define CTASSERT(x)
 
-#define log(_level, fmt, arg...)  printk(KERN_ERR fmt, ##arg)
+/* log... does not use the first argument */
+#define        LOG_ERR         0x100
+#define        LOG_INFO        0x200
+#define log(_level, fmt, arg...)  do {                 \
+       int __unused x=_level;printk(KERN_ERR fmt, ##arg); } while (0)
 
 /*
  * gettimeofday would be in sys/time.h but it is not
@@ -263,6 +270,10 @@ int in_cksum(struct mbuf *m, int len);
 #define INADDR_TO_IFP(a, b) b = NULL
 #define pf_find_mtag(a) NULL
 #define pf_get_mtag(a) NULL
+/* we don't pullup, fail */
+#define m_pullup(m, x)                                 \
+       ((m)->m_len >= x ? (m) : (netisr_dispatch(-1, m), NULL))
+
 #ifndef _WIN32
 #define AF_LINK AF_ASH /* ? our sys/socket.h */
 #endif
@@ -389,7 +400,6 @@ struct sock *inet_lookup(
         const __be32 saddr, const __be16 sport,
         const __be32 daddr, const __be16 dport,
         const int dif);
-static int inet_iif(const struct sk_buff *skb);
 struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
 #endif /* Linux < 2.6 */
 
@@ -504,4 +514,6 @@ extern  ip_fw_chk_t     *ip_fw_chk_ptr;
 #define SYSCTL_VNET_PROC       SYSCTL_PROC
 #define SYSCTL_VNET_INT                SYSCTL_INT
 
+int my_mod_register(struct moduledata *mod, const char *name, int order);
+
 #endif /* !_MISSING_H_ */
index 575c47c..639a561 100644 (file)
@@ -36,7 +36,6 @@
 #include <sys/param.h>
 #ifdef _KERNEL
 #include <sys/cdefs.h>
-#include "missing.h"
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
@@ -382,7 +381,7 @@ int rn_debug =  1;
  * the leaf (see RNTORT() in route.c), the second one is the parent.
  * This routine initializes the relevant fields of the nodes, so that
  * the leaf is the left child of the parent node, and both nodes have
- * (almost) all all fields filled as appropriate.
+ * (almost) all fields filled as appropriate.
  * (XXX some fields are left unset, see the '#if 0' section).
  * The function returns a pointer to the parent node.
  */
diff --git a/dummynet2/Makefile b/dummynet2/Makefile
new file mode 100644 (file)
index 0000000..2fe1d7b
--- /dev/null
@@ -0,0 +1,226 @@
+#
+# $Id: Makefile 4657 2010-01-04 11:20:53Z marta $
+#
+# gnu Makefile to build linux module for ipfw+dummynet.
+#
+# The defaults are set to build without modifications on PlanetLab
+# and possibly 2.6 versions.
+
+# Some variables need to have specific names, because they are used
+# by the build infrastructure on Linux and OpenWrt. They are:
+# 
+#   ccflags-y  additional $(CC) flags
+#   M          used by Kbuild, we must set it to `pwd`
+#   obj-m      list of .o modules to build
+#   $(MOD)-y   for each $MOD in obj-m, the list of objects
+#   obj-y      same as above, for openwrt
+#   O_TARGET   the link target, for openwrt
+#   EXTRA_CFLAGS as the name says... in openwrt
+#   EXTRA_CFLAGS is used in 2.6.22 module kernel compilation too
+#   KERNELPATH the path to the kernel sources or headers
+#
+# Not sure about this (the name might be reserved)
+#   ipfw-cflags                our flags for building the module
+#
+# Other variables are only private and can be renamed. They include:
+#
+#   VER                linux version we are building for (2.4 2.6 or openwrt)
+#---
+
+$(warning including dummynet2/Makefile)
+
+# default to 2.6, which is what PlanetLab builds use
+VER ?= 2.6
+
+#--- General values for all types of build ---
+# obj-m is the target module
+obj-m := ipfw_mod.o
+
+#-- the list of source files. IPFW_SRCS is our own name.
+# Original ipfw and dummynet sources + FreeBSD stuff,
+IPFW_SRCS := ip_fw2.c ip_dummynet.c ip_fw_pfil.c ip_fw_sockopt.c
+IPFW_SRCS += ip_fw_dynamic.c ip_fw_table.c ip_fw_log.c
+IPFW_SRCS += radix.c in_cksum.c
+# Module glue and functions missing in linux
+IPFW_SRCS += ipfw2_mod.c bsd_compat.c
+
+# generic cflags used on all systems
+# NOTE: ipfw-cflags is deliberately a recursive (late-expanded) variable,
+# because $(M) is only assigned further down in this file.
+#ipfw-cflags += -DIPFW_HASHTABLES
+ipfw-cflags += -DIPFIREWALL_DEFAULT_TO_ACCEPT
+# _BSD_SOURCE enables __FAVOR_BSD (udp/tcp bsd structs instead of posix)
+ipfw-cflags += -D_BSD_SOURCE
+ipfw-cflags += -DKERNEL_MODULE # build linux kernel module
+# the two header trees for empty and override files
+ipfw-cflags += -I $(M)/include_e
+ipfw-cflags += -I $(M)/include
+# XXX eventually ../dummynet/include will go away
+ipfw-cflags += -I $(M)/../dummynet/include
+ipfw-cflags += -include $(M)/../glue.h # headers
+ipfw-cflags += -include $(M)/missing.h # headers
+
+$(warning "---- Building dummynet kernel module for Version $(VER)")
+
+# We have three sections for OpenWrt, Linux 2.4 and Linux 2.6
+
+ifeq ($(VER),openwrt)
+  #--- The Makefile section for openwrt ---
+  # We do not include a dependency on include_e as it is called
+  # by Makefile.openwrt in Build/Prepare
+  M=.
+  obj-y := $(IPFW_SRCS:%.c=%.o)
+  O_TARGET := $(obj-m)
+
+  # xcflags-y is a temporary variable where we store build options
+  xcflags-y += -O1 -DLINUX_24
+  xcflags-y += -g
+
+  EXTRA_CFLAGS := $(xcflags-y) $(ipfw-cflags)
+
+  # we should not export anything
+  #export-objs := ipfw2_mod.o
+-include $(TOPDIR)/Rules.make
+
+else   # !openwrt, below we do linux builds for 2.4 and 2.6
+
+  # KERNELPATH is where the kernel headers reside. On PlanetLab
+  # it is set already by the build system.
+  # We can override it from the command line, or let the system guess.
+
+# Literal substring test: any VER containing "2.4" selects the 2.4 section.
+# (No shell is spawned and '.' is not treated as a regexp wildcard.)
+ifneq ($(findstring 2.4,$(VER)),)
+  # Makefile section for the linux 2.4 version
+  # tested on linux-2.4.35.4, does not work with 2.4.37
+  #
+  # guess the kernel path -- or is it under /lib/modules ?
+  KERNELPATH ?= /usr/src/`uname -r`
+
+  # We need to figure out the gcc include directory, if not
+  # set by the user through MYGCC_INCLUDE
+  # Find compiler version (3rd field in last line returned by gcc -v)
+  # e.g.       gcc version 4.3.2 (Debian 4.3.2-1.1)
+  MYGCC_VER ?= $(shell $(CC) -v 2>&1 |tail -n 1 | cut -d " " -f 3)
+  # We don't know the exact directory under /usr/lib/gcc so we guess
+  MYGCC_INCLUDE ?= $(shell echo /usr/lib/gcc/*/$(MYGCC_VER) | cut -d " " -f 1)/include
+  $(warning "---- gcc includes guessed to $(MYGCC_INCLUDE)")
+
+  # additional warning
+  WARN += -Wall -Wundef
+  WARN += -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
+  WARN += -fno-common -Werror-implicit-function-declaration
+  # WARN += -O2  -fno-stack-protector -m32 -msoft-float -mregparm=3
+  # -mregparm=3 gives a printk error
+  WARN += -m32 -msoft-float # -mregparm=3
+  #WARN += -freg-struct-return -mpreferred-stack-boundary=2
+  WARN += -Wno-sign-compare
+  WARN += -Wdeclaration-after-statement
+  ifneq ($(MYGCC_VER),3.4.6)
+        WARN += -Wno-pointer-sign
+  endif
+
+  ccflags-y += -O1 -DLINUX_24
+  CFLAGS = -DMODULE -D__KERNEL__ -nostdinc \
+       -isystem ${KERNELPATH}/include -isystem $(MYGCC_INCLUDE) \
+       ${ccflags-y}
+  # The Main target
+all: mod24
+
+else # !2.4 --
+
+  # This is the Makefile section for Linux 2.6.x including planetlab
+
+ifeq ($(IPFW_PLANETLAB),1)
+  $(warning "---- Building for PlanetLab")
+  ipfw-cflags += -DIPFW_PLANETLAB        # PlanetLab compilation
+endif
+  # if not set, use the version from the installed system
+  KERNELPATH ?= /lib/modules/`uname -r`/build
+  # Otherwise, if you have kernel sources, try something like this:
+  #KERNELPATH = /usr/src/linux-2.6.22
+  $(warning "---- Building Version 2.6 $(VER) in $(KERNELPATH)")
+  WARN := -O1 -Wall -Werror -DDEBUG_SPINLOCK -DDEBUG_MUTEXES
+  # The main target
+
+  # Required by kernel <= 2.6.22, ccflags-y is used on newer version
+  # (132630 is the LINUX_VERSION_CODE of 2.6.22).
+  LINUX_VERSION_CODE := $(shell grep LINUX_VERSION_CODE $(KERNELPATH)/include/linux/version.h|cut -d " " -f3)
+  ifeq ($(strip $(LINUX_VERSION_CODE)),)
+    # version.h missing or unreadable: warn, and skip the numeric test
+    # below, which would otherwise fail with a shell syntax error.
+    $(warning "---- Perhaps you miss a (cd $(KERNELPATH); make oldconfig; make prepare; make scripts)")
+  else
+  ifeq ($(shell if [ "$(LINUX_VERSION_CODE)" -le 132630 ] ; then echo "true"; fi),true)
+    EXTRA_CFLAGS += $(ccflags-y)
+  endif
+  endif
+
+all: include_e
+       $(MAKE) -C $(KERNELPATH) V=1 M=`pwd` modules
+endif # !2.4
+
+#-- back to the common section of code for Linux 2.4 and 2.6
+
+# the list of objects used to build the module
+ipfw_mod-y = $(IPFW_SRCS:%.c=%.o)
+
+# additional $(CC) flags
+ccflags-y += $(WARN)
+ccflags-y += $(ipfw-cflags)
+# if we really want debug symbols...
+ccflags-y += -g
+
+mod24: include_e $(obj-m)
+
+$(obj-m): $(ipfw_mod-y)
+       $(LD) $(LDFLAGS) -m elf_i386 -r -o $@ $^
+
+# M is the current directory, used in recursive builds
+# so we allow it to be overridden
+M ?= $(shell pwd)
+endif # !openwrt
+
+#--- various common targets
+# clean and distclean are commands, not files: declare them phony so a
+# stray file with the same name cannot make them look up to date.
+.PHONY: clean distclean
+
+# clean: remove module build products and the generated include_e tree
+clean:
+       -rm -f *.o *.ko Module.symvers *.mod.c
+       -rm -rf include_e
+
+# distclean: clean, plus kbuild leftovers and the generated opt_* files
+distclean: clean
+       -rm -f .*cmd modules.order opt_*
+       -rm -rf .tmp_versions include_e
+       -rm -rf .*.o.d
+
+# support to create empty dirs and files in include_e/
+# EDIRS is the list of directories, EFILES is the list of files.
+
+EDIRS= altq arpa machine net netinet netinet6 sys
+
+EFILES += opt_inet6.h opt_ipfw.h opt_ipsec.h opt_mpath.h
+EFILES += opt_mbuf_stress_test.h opt_param.h
+
+EFILES += altq/if_altq.h
+EFILES += arpa/inet.h
+EFILES += machine/in_cksum.h
+EFILES += net/ethernet.h net/netisr.h net/pf_mtag.h
+EFILES += net/bpf.h net/if_types.h
+EFILES += net/vnet.h
+
+EFILES += netinet/ether.h netinet/icmp6.h netinet/if_ether.h
+EFILES += netinet/in.h netinet/in_pcb.h netinet/in_var.h
+EFILES += netinet/in_systm.h
+EFILES += netinet/ip_carp.h netinet/ip_var.h netinet/pim.h
+EFILES += netinet/sctp.h netinet/tcp_timer.h netinet/tcpip.h
+EFILES += netinet/udp_var.h
+
+EFILES += netinet6/ip6_var.h
+
+EFILES += sys/_lock.h sys/_rwlock.h sys/_mutex.h sys/jail.h
+EFILES += sys/condvar.h sys/eventhandler.h sys/domain.h
+EFILES += sys/limits.h sys/lock.h sys/mutex.h sys/priv.h
+EFILES += sys/proc.h sys/rwlock.h sys/socket.h sys/socketvar.h
+EFILES += sys/sysctl.h sys/time.h sys/ucred.h
+
+# (Re)create $(M)/include_e from scratch: remove the old tree (and any
+# opt_* entries in the current directory), then make one directory per
+# EDIRS entry and touch one empty file per EFILES entry.
+# Every step is prefixed with '-', so a failure here does not stop make.
+include_e:
+       echo "running in $M"
+       -@rm -rf $(M)/include_e opt_*
+       -@mkdir -p $(M)/include_e
+       -@(cd $(M)/include_e; mkdir -p $(EDIRS); touch $(EFILES) )
+
+
+#--- some other targets for testing purposes
+# No recipes are given for these targets -- presumably they are built by
+# make's implicit rules (TODO confirm). CFLAGS is overridden per target.
+test_radix: test_radix.o radix.o
+test_lookup: ip_fw_lookup.o
+test_radix test_lookup: CFLAGS=-Wall -Werror -O1
diff --git a/dummynet2/bsd_compat.c b/dummynet2/bsd_compat.c
new file mode 100644 (file)
index 0000000..70268bb
--- /dev/null
@@ -0,0 +1,370 @@
+/*
+ * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id: bsd_compat.c 4665 2010-01-04 12:35:39Z luigi $
+ *
+ * kernel variables and functions that are not available in linux.
+ */
+
+#include <sys/cdefs.h>
+#include <asm/div64.h> /* do_div on 2.4 */
+#include <linux/random.h>      /* get_random_bytes on 2.4 */
+
+/*
+ * gettimeofday would be in sys/time.h but it is not
+ * visible if _KERNEL is defined
+ */
+int gettimeofday(struct timeval *, struct timezone *);
+
+int ticks;             /* kernel ticks counter */
+int hz = 1000;         /* default clock time */
+long tick = 1000;      /* XXX is this 1000000/hz ? */
+int bootverbose = 0;
+time_t time_uptime = 0;
+struct timeval boottime;
+
+int     ip_defttl;
+int fw_one_pass = 1;
+u_long  in_ifaddrhmask;                         /* mask for hash table */
+struct  in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table  */
+
+u_int rt_numfibs = RT_NUMFIBS;
+
+/*
+ * pfil hook support.
+ * We make pfil_head_get return a non-null pointer, which is then ignored
+ * in our 'add-hook' routines.
+ */
+struct pfil_head;
+typedef int (pfil_hook_t)
+       (void *, struct mbuf **, struct ifnet *, int, struct inpcb *);
+
+/* Return a fixed non-NULL dummy pointer; proto and flags are ignored. */
+struct pfil_head *
+pfil_head_get(int proto, u_long flags)
+{
+       static int dummy;
+       return (struct pfil_head *)&dummy;
+}
+/* No-op: nothing is registered, always returns 0. */
+int
+pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
+{
+       return 0;
+}
+
+/* No-op: nothing is unregistered, always returns 0. */
+int
+pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h)
+{
+       return 0;
+}
+
+/*
+ * Empty bodies for kernel functions: the stubs below ignore their
+ * arguments and return a constant (or nothing).
+ */
+
+/* always returns 0 */
+int
+priv_check(struct thread *td, int priv)
+{
+       return 0;
+}
+
+/* always returns 0 */
+int
+securelevel_ge(struct ucred *cr, int level)
+{
+       return 0;
+}
+
+/* always returns 0, no value is read or written */
+int
+sysctl_handle_int(SYSCTL_HANDLER_ARGS)
+{
+       return 0;
+}
+
+/* always returns 0, no value is read or written */
+int
+sysctl_handle_long(SYSCTL_HANDLER_ARGS)
+{
+       return 0;
+}
+
+/* does nothing with the mbuf */
+void
+ether_demux(struct ifnet *ifp, struct mbuf *m)
+{
+       return;
+}
+
+/* does nothing with the mbuf, always returns 0 */
+int
+ether_output_frame(struct ifnet *ifp, struct mbuf *m)
+{
+       return 0;
+}
+
+/* does nothing, the route is left untouched */
+void
+in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
+{
+       return;
+}
+
+/* does nothing, no ICMP error is generated here */
+void
+icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
+{
+       return;
+}
+
+/* no checksum is computed, always returns 0 */
+u_short
+in_cksum_skip(struct mbuf *m, int len, int skip)
+{
+       return 0;
+}
+
+/* no checksum is computed, always returns 0 */
+u_short
+in_cksum_hdr(struct ip *ip)
+{
+       return 0;
+}
+
+/*
+ * we don't really reassemble, just return whatever we had.
+ */
+struct mbuf *
+ip_reass(struct mbuf *clone)
+{
+       return clone;
+}
+#ifdef INP_LOCK_ASSERT
+#undef INP_LOCK_ASSERT
+#define INP_LOCK_ASSERT(a)
+#endif
+
+/* credentials check */
+#include <netinet/ip_fw.h>
+/*
+ * Match a uid/jail/gid instruction against the credentials of the socket
+ * that owns the packet.
+ *
+ * *ugid_lookupp caches the lookup state: 0 means "not looked up yet", in
+ * which case linux_lookup() is called (passing u) and its result stored;
+ * a negative value means nothing can match.
+ * Returns 1 when the id selected by the opcode equals insn->d[0], else 0.
+ */
+int
+cred_check(void *_insn,  int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp,
+    struct sk_buff *skb)
+{
+       ipfw_insn_u32 *cmd = (ipfw_insn_u32 *)_insn;
+
+       /* first use for this packet: do the lookup and remember the result */
+       if (*ugid_lookupp == 0)
+               *ugid_lookupp = linux_lookup(proto,
+                       src_ip.s_addr, htons(src_port),
+                       dst_ip.s_addr, htons(dst_port),
+                       skb, oif ? 1 : 0, u);
+
+       if (*ugid_lookupp < 0)
+               return 0;
+
+       switch (cmd->o.opcode) {
+       case O_UID:
+               return (u->uid == (uid_t)cmd->d[0]);
+       case O_JAIL:
+               return (u->xid == (uid_t)cmd->d[0]);
+       case O_GID:
+               return (u->gid == (uid_t)cmd->d[0]);
+       default:
+               return 0;       /* any other opcode never matches */
+       }
+}
+
+/* stub: always returns 0, the credential is not inspected */
+int
+jailed(struct ucred *cred)
+{
+       return 0;
+}
+
+/*
+* Return 1 if an internet address is for a ``local'' host
+* (one to which we have a connection).  If subnetsarelocal
+* is true, this includes other subnets of the local net.
+* Otherwise, it includes only the directly-connected (sub)nets.
+*
+* NOTE: the text above describes the BSD function; this version does
+* not look at the address and always returns 1.
+*/
+int
+in_localaddr(struct in_addr in)
+{
+       return 1;
+}
+
+/*
+ * copy data from buf into sopt->sopt_val.
+ * At most sopt->sopt_valsize bytes are copied; when len is smaller,
+ * sopt_valsize is reduced to len. Always returns 0.
+ */
+int
+sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
+{
+       size_t nbytes = sopt->sopt_valsize;
+
+       if (nbytes > len) {
+               nbytes = len;
+               sopt->sopt_valsize = nbytes;
+       }
+       bcopy(buf, sopt->sopt_val, nbytes);
+       return 0;
+}
+
+/*
+ * copy data from sopt->sopt_val into buf.
+ * Returns EINVAL when fewer than minlen bytes are available; otherwise
+ * copies at most len bytes (clamping sopt_valsize to len) and returns 0.
+ */
+int
+sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
+{
+       size_t avail = sopt->sopt_valsize;
+
+       if (avail < minlen)
+               return EINVAL;
+       if (avail > len) {
+               avail = len;
+               sopt->sopt_valsize = avail;
+       }
+       bcopy(sopt->sopt_val, buf, avail);
+       return 0;
+}
+
+/*
+ * Fill *tv through do_gettimeofday(); when _WIN32 is defined nothing is
+ * done and *tv is left untouched.
+ */
+void
+getmicrouptime(struct timeval *tv)
+{
+#ifndef _WIN32
+       do_gettimeofday(tv);
+#endif
+}
+
+
+#include <arpa/inet.h>
+
+/*
+ * Format the four bytes of ina, in memory order, as dotted decimal into
+ * buf and return buf. The caller must supply room for the longest form
+ * ("255.255.255.255" plus the terminator). When _WIN32 is defined buf is
+ * returned unmodified.
+ */
+char *
+inet_ntoa_r(struct in_addr ina, char *buf)
+{
+#ifndef _WIN32
+       const unsigned char *octet = (const unsigned char *)&ina;
+
+       sprintf(buf, "%d.%d.%d.%d",
+           octet[0] & 0xff, octet[1] & 0xff,
+           octet[2] & 0xff, octet[3] & 0xff);
+#endif
+       return buf;
+}
+
+/*
+ * Same as inet_ntoa_r() but formats into a function-static 16-byte
+ * buffer, so the returned string is overwritten by the next call.
+ */
+char *
+inet_ntoa(struct in_addr ina)
+{
+       static char buf[16];
+       return inet_ntoa_r(ina, buf);
+}
+
+/*
+ * Return a non-negative pseudo-random int: get_random_bytes() output with
+ * the sign bit masked off. With _WIN32 a fixed constant is returned.
+ */
+int
+random(void)
+{
+#ifndef _WIN32
+       int bits;
+
+       get_random_bytes(&bits, sizeof(bits));
+       return bits & 0x7fffffff;
+#else
+       return 0x123456;
+#endif
+}
+
+
+/*
+ * Signed 64-bit division a / b.
+ * do_div really does a u64 / u32 bit division.
+ * We save the sign and convert to unsigned before calling it.
+ * We are safe just because we always call it with small operands.
+ * NOTE(review): with _WIN32 both operands are first truncated to int.
+ * b == 0 is not checked in either branch.
+ */
+int64_t
+div64(int64_t a, int64_t b)
+{
+#ifdef _WIN32
+        int a1 = a, b1 = b;
+       return a1/b1;
+#else
+       uint64_t ua, ub;
+       /* sign of the result; a zero operand counts as negative here,
+        * which is harmless because the quotient is then 0 */
+       int sign = ((a>0)?1:-1) * ((b>0)?1:-1);
+
+       ua = ((a>0)?a:-a);
+       ub = ((b>0)?b:-b);
+       /* ua is used as the quotient after this call */
+        do_div(ua, ub);
+       return sign*ua;
+#endif
+}
+
+/*
+ * compact version of fnmatch.
+ * Returns 0 on match, 1 otherwise (also when either argument is NULL).
+ * In the pattern, '.' matches any single character and '*' matches the
+ * whole rest of the string; every other character must match exactly.
+ * The flags argument is not used.
+ */
+int
+fnmatch(const char *pattern, const char *string, int flags)
+{
+       if (pattern == NULL || string == NULL)
+               return 1;       /* no match */
+
+       for (; *string != '\0'; string++, pattern++) {
+               if (*pattern == '\0')   /* pattern is over, no match */
+                       return 1;
+               if (*pattern == '*')    /* wildcard, match */
+                       return 0;
+               if (*pattern != '.' && *pattern != *string)
+                       return 1;       /* no match */
+       }
+       /* end of string, make sure the pattern is over too */
+       return (*pattern == '\0' || *pattern == '*') ? 0 : 1;
+}
+
+#ifdef _WIN32
+/*
+ * as good as anywhere, place here the missing calls
+ */
+
+/*
+ * Allocate size bytes with ExAllocatePoolWithTag (pool type 0, tag
+ * 'wfpi') and zero them. Returns the allocator's result unchanged when
+ * it is NULL.
+ */
+void *
+my_alloc(int size)
+{
+       void *_ret = ExAllocatePoolWithTag(0, size, 'wfpi');
+       if (_ret)
+               memset(_ret, 0, size);
+       return _ret;
+}
+
+/*
+ * Print the format string itself (the variable arguments are not
+ * expanded) and then spin forever; this function never returns.
+ */
+void
+panic(const char *fmt, ...)
+{
+       printf("%s", fmt);
+       for (;;);
+}
+
+#include <stdarg.h>
+
+extern int _vsnprintf(char *buf, int buf_size, char * fmt, va_list ap);
+
+/*
+ * Windows' _vsnprintf does not terminate the buffer with zero when the
+ * output fills it exactly or is truncated, so we always write the
+ * terminator in the last byte ourselves.
+ * Nothing is written when buf is NULL or buf_size is not positive.
+ * NOTE: unlike the C99 function, this wrapper always returns 0.
+ */
+int
+snprintf(char *buf, int buf_size, char *fmt, ...)
+{
+    va_list ap;
+
+    if (buf == NULL || buf_size <= 0)
+        return 0;
+
+    va_start(ap, fmt);
+    (void)_vsnprintf(buf, buf_size, fmt, ap);
+    va_end(ap);
+    /* covers both truncation (< 0) and an exact fit (== buf_size) */
+    buf[buf_size - 1] = '\0';
+
+    return 0;
+}
+#endif
diff --git a/dummynet2/in_cksum.c b/dummynet2/in_cksum.c
new file mode 100644 (file)
index 0000000..8972cef
--- /dev/null
@@ -0,0 +1,150 @@
+/*-
+ * Copyright (c) 1988, 1992, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)in_cksum.c  8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/in_cksum.c,v 1.10 2007/10/07 20:44:22 silby Exp $");
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+
+/*
+ * Checksum routine for Internet Protocol family headers (Portable Version).
+ *
+ * This routine is very heavily used in the network
+ * code and should be modified for each CPU to be as fast as possible.
+ */
+
+#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
+#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
+
+/*
+ * Compute the Internet checksum over the first len bytes of the mbuf
+ * chain m. Returns the complemented, folded 16-bit sum (~sum & 0xffff).
+ * If the chain ends before len bytes were consumed, "cksum: out of data"
+ * is printed and the sum of the available bytes is used.
+ *
+ * mlen == -1 on exit from an iteration means that mbuf ended in the
+ * middle of a 16-bit word; the dangling byte is kept in s_util.c[0].
+ */
+int
+in_cksum(struct mbuf *m, int len)
+{
+       register u_short *w;
+       register int sum = 0;
+       register int mlen = 0;
+       int byte_swapped = 0;
+
+       /* s_util assembles a 16-bit word out of two separate bytes */
+       union {
+               char    c[2];
+               u_short s;
+       } s_util;
+       /* l_util is used by REDUCE to fold sum into 16 bits */
+       union {
+               u_short s[2];
+               long    l;
+       } l_util;
+
+       for (;m && len; m = m->m_next) {
+               if (m->m_len == 0)
+                       continue;
+               w = mtod(m, u_short *);
+               if (mlen == -1) {
+                       /*
+                        * The first byte of this mbuf is the continuation
+                        * of a word spanning between this mbuf and the
+                        * last mbuf.
+                        *
+                        * s_util.c[0] is already saved when scanning previous
+                        * mbuf.
+                        */
+                       s_util.c[1] = *(char *)w;
+                       sum += s_util.s;
+                       w = (u_short *)((char *)w + 1);
+                       mlen = m->m_len - 1;
+                       len--;
+               } else
+                       mlen = m->m_len;
+               if (len < mlen)
+                       mlen = len;
+               len -= mlen;
+               /*
+                * Force to even boundary.
+                */
+#if defined(CONFIG_X86_64)
+               if ((1 & (long) w) && (mlen > 0)) {
+#else
+               if ((1 & (int) w) && (mlen > 0)) {
+#endif
+                       REDUCE;
+                       sum <<= 8;
+                       s_util.c[0] = *(u_char *)w;
+                       w = (u_short *)((char *)w + 1);
+                       mlen--;
+                       byte_swapped = 1;
+               }
+               /*
+                * Unroll the loop to make overhead from
+                * branches &c small.
+                */
+               while ((mlen -= 32) >= 0) {
+                       sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+                       sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
+                       sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
+                       sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
+                       w += 16;
+               }
+               mlen += 32;
+               while ((mlen -= 8) >= 0) {
+                       sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+                       w += 4;
+               }
+               mlen += 8;
+               if (mlen == 0 && byte_swapped == 0)
+                       continue;
+               REDUCE;
+               while ((mlen -= 2) >= 0) {
+                       sum += *w++;
+               }
+               if (byte_swapped) {
+                       /* undo the 8-bit shift applied at the odd start */
+                       REDUCE;
+                       sum <<= 8;
+                       byte_swapped = 0;
+                       if (mlen == -1) {
+                               s_util.c[1] = *(char *)w;
+                               sum += s_util.s;
+                               mlen = 0;
+                       } else
+                               mlen = -1;
+               } else if (mlen == -1)
+                       s_util.c[0] = *(char *)w;
+       }
+       if (len)
+               printf("cksum: out of data\n");
+       if (mlen == -1) {
+               /* The last mbuf has odd # of bytes. Follow the
+                  standard (the odd byte may be shifted left by 8 bits
+                  or not as determined by endian-ness of the machine) */
+               s_util.c[1] = 0;
+               sum += s_util.s;
+       }
+       REDUCE;
+       return (~sum & 0xffff);
+}
diff --git a/dummynet2/include/netgraph/ng_ipfw.h b/dummynet2/include/netgraph/ng_ipfw.h
new file mode 100644 (file)
index 0000000..55fd890
--- /dev/null
@@ -0,0 +1,33 @@
+/*-
+ * Copyright 2005, Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/netgraph/ng_ipfw.h,v 1.2 2006/02/17 09:42:49 glebius Exp $
+ */
+#ifndef __NG_IPFW_H
+#define __NG_IPFW_H
+
+#define NG_IPFW_NODE_TYPE    "ipfw"
+#define NGM_IPFW_COOKIE      1105988990
+#endif /* __NG_IPFW_H */
diff --git a/dummynet2/include/netinet/ip_dummynet.h b/dummynet2/include/netinet/ip_dummynet.h
new file mode 100644 (file)
index 0000000..f01bfe2
--- /dev/null
@@ -0,0 +1,374 @@
+/*-
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/netinet/ip_dummynet.h,v 1.40.2.1 2008/04/25 10:26:30 oleg Exp $
+ */
+
+#ifndef _IP_DUMMYNET_H
+#define _IP_DUMMYNET_H
+
+/*
+ * Definition of dummynet data structures. In the structures, I decided
+ * not to use the macros in <sys/queue.h> in the hope of making the code
+ * easier to port to other architectures. The type of lists and queue we
+ * use here is pretty simple anyways.
+ */
+
+/*
+ * We start with a heap, which is used in the scheduler to decide when
+ * to transmit packets etc.
+ *
+ * The key for the heap is used for two different values:
+ *
+ * 1. timer ticks- max 10K/second, so 32 bits are enough;
+ *
+ * 2. virtual times. These increase in steps of len/x, where len is the
+ *    packet length, and x is either the weight of the flow, or the
+ *    sum of all weights.
+ *    If we limit to max 1000 flows and a max weight of 100, then
+ *    x needs 17 bits. The packet size is 16 bits, so we can easily
+ *    overflow if we do not allow errors.
+ * So we use a key "dn_key" which is 64 bits. Some macros are used to
+ * compare key values and handle wraparounds.
+ * MAX64 returns the largest of two key values.
+ * MY_M is used as a shift count when doing fixed point arithmetic
+ * (a better name would be useful...).
+ */
+typedef u_int64_t dn_key ;      /* sorting key (timer ticks or virtual time) */
+#define DN_KEY_LT(a,b)     ((int64_t)((a)-(b)) < 0)    /* a <  b, wrap-safe */
+#define DN_KEY_LEQ(a,b)    ((int64_t)((a)-(b)) <= 0)   /* a <= b, wrap-safe */
+#define DN_KEY_GT(a,b)     ((int64_t)((a)-(b)) > 0)    /* a >  b, wrap-safe */
+#define DN_KEY_GEQ(a,b)    ((int64_t)((a)-(b)) >= 0)   /* a >= b, wrap-safe */
+#define MAX64(x,y)  ((( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)) /* larger key */
+#define MY_M   16 /* number of left shift to obtain a larger precision */
+
+/*
+ * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
+ * virtual time wraps every 15 days.
+ */
+
+
+/*
+ * The maximum hash table size for queues.  This value must be a power
+ * of 2.
+ */
+#define DN_MAX_HASH_SIZE 65536
+
+/*
+ * A heap entry is made of a key and a pointer to the actual
+ * object stored in the heap.
+ * The heap is an array of dn_heap_entry entries, dynamically allocated.
+ * Current size is "size", with "elements" actually in use.
+ * The heap normally supports only ordered insert and extract from the top.
+ * If we want to extract an object from the middle of the heap, we
+ * have to know where the object itself is located in the heap (or we
+ * need to scan the whole array). To this purpose, an object has a
+ * field (int) which contains the index of the object itself into the
+ * heap. When the object is moved, the field must also be updated.
+ * The offset of the index in the object is stored in the 'offset'
+ * field in the heap descriptor. The assumption is that this offset
+ * is non-zero if we want to support extract from the middle.
+ */
+struct dn_heap_entry {
+    dn_key key ;       /* sorting key. Topmost element is smallest one */
+    void *object ;     /* object pointer */
+} ;
+
+struct dn_heap {
+    int size ;         /* current size of the array p[] */
+    int elements ;     /* entries of p[] actually in use */
+    int offset ; /* XXX if > 0, offset in the object of its heap-index field */
+    struct dn_heap_entry *p ;  /* really an array of "size" entries */
+} ;
+
+#ifdef _KERNEL
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.  This is used within
+ * the dummynet code as well as outside when checking for special
+ * processing requirements.
+ * Note that the first part is the reinject info and is common to
+ * other forms of packet reinjection.
+ */
+struct dn_pkt_tag {
+       struct ipfw_rule_ref rule;      /* matching rule */
+
+    /* second part, dummynet specific */
+    int dn_dir;                        /* action when packet comes out. */
+                               /* see ip_fw_private.h */
+
+    dn_key output_time;                /* when the pkt is due for delivery     */
+    struct ifnet *ifp;         /* interface, for ip_output             */
+    struct _ip6dn_args ip6opt; /* XXX ipv6 options                     */
+};
+#endif /* _KERNEL */
+
+/*
+ * Overall structure of dummynet (with WF2Q+):
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE.
+
+A QUEUE is just a queue with configurable size and queue management
+policy. It is also associated with a mask (to discriminate among
+different flows), a weight (used to give different shares of the
+bandwidth to different flows) and a "pipe", which essentially
+supplies the transmit clock for all queues associated with that
+pipe.
+
+A PIPE emulates a fixed-bandwidth link, whose bandwidth is
+configurable.  The "clock" for a pipe can come from either an
+internal timer, or from the transmit interrupt of an interface.
+A pipe is also associated with one (or more, if masks are used)
+queue, where all packets for that pipe are stored.
+
+The bandwidth available on the pipe is shared by the queues
+associated with that pipe (only one in case the packet is sent
+to a PIPE) according to the WF2Q+ scheduling algorithm and the
+configured weights.
+
+In general, incoming packets are stored in the appropriate queue,
+which is then placed into one of a few heaps managed by a scheduler
+to decide when the packet should be extracted.
+The scheduler (a function called dummynet()) is run at every timer
+tick, and grabs queues from the head of the heaps when they are
+ready for processing.
+
+There are three data structures defining a pipe and associated queues:
+
+ + dn_pipe, which contains the main configuration parameters related
+   to delay and bandwidth;
+ + dn_flow_set, which contains WF2Q+ configuration, flow
+   masks, plr and RED configuration;
+ + dn_flow_queue, which is the per-flow queue (containing the packets)
+
+Multiple dn_flow_set can be linked to the same pipe, and multiple
+dn_flow_queue can be linked to the same dn_flow_set.
+All data structures are linked in a linear list which is used for
+housekeeping purposes.
+
+During configuration, we create and initialize the dn_flow_set
+and dn_pipe structures (a dn_pipe also contains a dn_flow_set).
+
+At runtime: packets are sent to the appropriate dn_flow_set (either
+WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows),
+which in turn dispatches them to the appropriate dn_flow_queue
+(created dynamically according to the masks).
+
+The transmit clock for fixed rate flows (ready_event()) selects the
+dn_flow_queue to be used to transmit the next packet. For WF2Q,
+wfq_ready_event() extract a pipe which in turn selects the right
+flow using a number of heaps defined into the pipe itself.
+
+ *
+ */
+
+/*
+ * per flow queue. This contains the flow identifier, the queue
+ * of packets, counters, and parameters used to support both RED and
+ * WF2Q+.
+ *
+ * A dn_flow_queue is created and initialized whenever a packet for
+ * a new flow arrives.
+ */
+struct dn_flow_queue {
+    struct dn_flow_queue *next ;
+    struct ipfw_flow_id id ;
+
+    struct mbuf *head, *tail ; /* queue of packets */
+    u_int len ;
+    u_int len_bytes ;
+
+    /*
+     * When we emulate MAC overheads, or channel unavailability due
+     * to other traffic on a shared medium, we augment the packet at
+     * the head of the queue with an 'extra_bits' field representing
+     * the additional delay the packet will be subject to:
+     *         extra_bits = bw*unavailable_time.
+     * With large bandwidth and large delays, extra_bits (and also numbytes)
+     * can become very large, so we play safe and use 64 bits for both.
+     */
+    uint64_t numbytes ;                /* credit for transmission (dynamic queues) */
+    int64_t extra_bits;                /* extra bits simulating unavailable channel */
+
+    u_int64_t tot_pkts ;       /* statistics counters  */
+    u_int64_t tot_bytes ;
+    u_int32_t drops ;
+
+    int hash_slot ;            /* debugging/diagnostic */
+
+    /* RED parameters */
+    int avg ;                   /* average queue length est. (scaled) */
+    int count ;                 /* arrivals since last RED drop */
+    int random ;                /* random value (scaled) */
+    dn_key idle_time;          /* start of queue idle time */
+
+    /* WF2Q+ support */
+    struct dn_flow_set *fs ;   /* parent flow set */
+    int heap_pos ;             /* position (index) of struct in heap */
+    dn_key sched_time ;                /* current time when queue enters ready_heap */
+
+    dn_key S,F ;               /* start time, finish time */
+    /*
+     * Setting F < S means the timestamp is invalid. We only need
+     * to test this when the queue is empty.
+     */
+} ;
+
+/*
+ * flow_set descriptor. Contains the "template" parameters for the
+ * queue configuration, and pointers to the hash table of dn_flow_queue's.
+ *
+ * The hash table is an array of lists -- we identify the slot by
+ * hashing the flow-id, then scan the list looking for a match.
+ * The size of the hash table (buckets) is configurable on a per-queue
+ * basis.
+ *
+ * A dn_flow_set is created whenever a new queue or pipe is created (in the
+ * latter case, the structure is located inside the struct dn_pipe).
+ */
+struct dn_flow_set {
+    SLIST_ENTRY(dn_flow_set)   next;   /* linked list in a hash slot */
+
+    u_short fs_nr ;             /* flow_set number       */
+    u_short flags_fs;          /* bitmask of the DN_* flags below */
+#define DN_HAVE_FLOW_MASK      0x0001
+#define DN_IS_RED              0x0002
+#define DN_IS_GENTLE_RED       0x0004
+#define DN_QSIZE_IS_BYTES      0x0008  /* queue size is measured in bytes */
+#define DN_NOERROR             0x0010  /* do not report ENOBUFS on drops  */
+#define        DN_HAS_PROFILE          0x0020  /* the pipe has a delay profile. */
+#define DN_IS_PIPE             0x4000
+#define DN_IS_QUEUE            0x8000
+
+    struct dn_pipe *pipe ;     /* pointer to parent pipe */
+    u_short parent_nr ;                /* parent pipe#, 0 if local to a pipe */
+
+    int weight ;               /* WFQ queue weight */
+    int qsize ;                        /* queue size in slots or bytes */
+    int plr ;                  /* pkt loss rate (2^31-1 means 100%) */
+
+    struct ipfw_flow_id flow_mask ;    /* mask used to discriminate among flows */
+
+    /* hash table of the queues belonging to this flow_set */
+    int rq_size ;              /* number of slots */
+    int rq_elements ;          /* active elements */
+    struct dn_flow_queue **rq; /* array of rq_size entries */
+
+    u_int32_t last_expired ;   /* do not expire too frequently */
+    int backlogged ;           /* #active queues for this flowset */
+
+        /* RED parameters; "scaled" values are fixed point, see SCALE() */
+#define SCALE_RED               16
+#define SCALE(x)                ( (x) << SCALE_RED )
+#define SCALE_VAL(x)            ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y)          ( ( (x) * (y) ) >> SCALE_RED )
+    int w_q ;                  /* queue weight (scaled) */
+    int max_th ;               /* maximum threshold for queue (scaled) */
+    int min_th ;               /* minimum threshold for queue (scaled) */
+    int max_p ;                        /* maximum value for p_b (scaled) */
+    u_int c_1 ;                        /* max_p/(max_th-min_th) (scaled) */
+    u_int c_2 ;                        /* max_p*min_th/(max_th-min_th) (scaled) */
+    u_int c_3 ;                        /* for GRED, (1-max_p)/max_th (scaled) */
+    u_int c_4 ;                        /* for GRED, 1 - 2*max_p (scaled) */
+    u_int * w_q_lookup ;       /* lookup table for computing (1-w_q)^t */
+    u_int lookup_depth ;       /* depth of lookup table */
+    int lookup_step ;          /* granularity inside the lookup table */
+    int lookup_weight ;                /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+    int avg_pkt_size ;         /* average packet size */
+    int max_pkt_size ;         /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+/*
+ * Pipe descriptor. Contains global parameters, delay-line queue,
+ * and the flow_set used for fixed-rate queues.
+ *
+ * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
+ *   not_eligible_heap, for queues whose start time is higher
+ *     than the virtual time. Sorted by start time.
+ *   scheduler_heap, for queues eligible for scheduling. Sorted by
+ *     finish time.
+ *   idle_heap, all flows that are idle and can be removed. We
+ *     do that on each tick so we do not slow down too much
+ *     operations during forwarding.
+ *
+ */
+struct dn_pipe {               /* a pipe */
+    SLIST_ENTRY(dn_pipe)       next;   /* linked list in a hash slot */
+
+    int        pipe_nr ;               /* pipe number  */
+    int bandwidth;             /* really, bytes/tick.  */
+    int        delay ;                 /* really, ticks        */
+
+    struct     mbuf *head, *tail ;     /* packets in delay line */
+
+    /* WF2Q+ */
+    struct dn_heap scheduler_heap ; /* top extract - key Finish time*/
+    struct dn_heap not_eligible_heap; /* top extract- key Start time */
+    struct dn_heap idle_heap ; /* random extract - key Start=Finish time */
+
+    dn_key V ;                 /* virtual time */
+    int sum;                   /* sum of weights of all active sessions */
+
+    /* Same as in dn_flow_queue, numbytes can become large */
+    int64_t numbytes;          /* bits I can transmit (more or less). */
+    uint64_t burst;            /* burst size, scaled: bits * hz */
+
+    dn_key sched_time ;                /* time pipe was scheduled in ready_heap */
+    dn_key idle_time;          /* start of pipe idle time */
+
+    /*
+     * When the tx clock comes from an interface (if_name[0] != '\0'), its
+     * name is stored below, whereas the ifp is filled when the rule is configured.
+     */
+    char if_name[IFNAMSIZ];
+    struct ifnet *ifp ;
+    int ready ; /* set if ifp != NULL and we got a signal from it */
+
+    struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+    /* fields to simulate a delay profile (see DN_HAS_PROFILE) */
+
+#define ED_MAX_NAME_LEN                32
+    char name[ED_MAX_NAME_LEN];        /* presumably the profile name -- confirm */
+    int loss_level;
+    int samples_no;            /* presumably # of entries in samples -- confirm */
+    int *samples;              /* NOTE(review): ownership/lifetime not visible here */
+};
+
+/* dn_pipe_max is used to pass pipe configuration from userland onto
+ * kernel space and back
+ */
+#define ED_MAX_SAMPLES_NO      1024
+struct dn_pipe_max {
+       struct dn_pipe pipe;
+       int samples[ED_MAX_SAMPLES_NO];
+};
+
+SLIST_HEAD(dn_pipe_head, dn_pipe);
+
+#endif /* _IP_DUMMYNET_H */
diff --git a/dummynet2/include/netinet/ip_fw.h b/dummynet2/include/netinet/ip_fw.h
new file mode 100644 (file)
index 0000000..238601f
--- /dev/null
@@ -0,0 +1,574 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/netinet/ip_fw.h 200580 2009-12-15 16:15:14Z luigi $
+ */
+
+#ifndef _IPFW2_H
+#define _IPFW2_H
+
+/*
+ * The default rule number.  By the design of ip_fw, the default rule
+ * is the last one, so its number can also serve as the highest number
+ * allowed for a rule.  The ip_fw code relies on both meanings of this
+ * constant. 
+ */
+#define        IPFW_DEFAULT_RULE       65535
+
+/*
+ * The number of ipfw tables.  The maximum allowed table number is the
+ * (IPFW_TABLES_MAX - 1).
+ */
+#define        IPFW_TABLES_MAX         128
+
+/*
+ * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
+ * argument between 1 and 65534 (IPFW_ARG_MIN..IPFW_ARG_MAX). The value
+ * 0 is unused, and the value 65535 (IP_FW_TABLEARG) is used to
+ * represent 'tablearg', i.e. the result of the most recent table()
+ * lookup.
+ * Note that 16 bits is only a historical limit, resulting from
+ * the use of a 16-bit field for that value. In reality, we can have
+ * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
+ */
+#define        IPFW_ARG_MIN            1
+#define        IPFW_ARG_MAX            65534
+#define IP_FW_TABLEARG         65535   /* XXX should use 0 */
+
+/*
+ * The kernel representation of ipfw rules is made of a list of
+ * 'instructions' (for all practical purposes equivalent to BPF
+ * instructions), which specify which fields of the packet
+ * (or its metadata) should be analysed.
+ *
+ * Each instruction is stored in a structure which begins with
+ * "ipfw_insn", and can contain extra fields depending on the
+ * instruction type (listed below).
+ * Note that the code is written so that individual instructions
+ * have a size which is a multiple of 32 bits. This means that, if
+ * such structures contain pointers or other 64-bit entities,
+ * (there is just one instance now) they may end up unaligned on
+ * 64-bit architectures, so they must be handled with care.
+ *
+ * "enum ipfw_opcodes" are the opcodes supported. We can have up
+ * to 256 different opcodes. When adding new opcodes, they should
+ * be appended to the end of the opcode list before O_LAST_OPCODE,
+ * this will prevent the ABI from being broken, otherwise users
+ * will have to recompile ipfw(8) when they update the kernel.
+ */
+
+enum ipfw_opcodes {            /* arguments (4 byte each)      */
+       O_NOP,
+
+       O_IP_SRC,               /* u32 = IP                     */
+       O_IP_SRC_MASK,          /* ip = IP/mask                 */
+       O_IP_SRC_ME,            /* none                         */
+       O_IP_SRC_SET,           /* u32=base, arg1=len, bitmap   */
+
+       O_IP_DST,               /* u32 = IP                     */
+       O_IP_DST_MASK,          /* ip = IP/mask                 */
+       O_IP_DST_ME,            /* none                         */
+       O_IP_DST_SET,           /* u32=base, arg1=len, bitmap   */
+
+       O_IP_SRCPORT,           /* (n)port list:mask 4 byte ea  */
+       O_IP_DSTPORT,           /* (n)port list:mask 4 byte ea  */
+       O_PROTO,                /* arg1=protocol                */
+
+       O_MACADDR2,             /* 2 mac addr:mask              */
+       O_MAC_TYPE,             /* same as srcport              */
+
+       O_LAYER2,               /* none                         */
+       O_IN,                   /* none                         */
+       O_FRAG,                 /* none                         */
+
+       O_RECV,                 /* none                         */
+       O_XMIT,                 /* none                         */
+       O_VIA,                  /* none                         */
+
+       O_IPOPT,                /* arg1 = 2*u8 bitmap           */
+       O_IPLEN,                /* arg1 = len                   */
+       O_IPID,                 /* arg1 = id                    */
+
+       O_IPTOS,                /* arg1 = id                    */
+       O_IPPRECEDENCE,         /* arg1 = precedence << 5       */
+       O_IPTTL,                /* arg1 = TTL                   */
+
+       O_IPVER,                /* arg1 = version               */
+       O_UID,                  /* u32 = id                     */
+       O_GID,                  /* u32 = id                     */
+       O_ESTAB,                /* none (tcp established)       */
+       O_TCPFLAGS,             /* arg1 = 2*u8 bitmap           */
+       O_TCPWIN,               /* arg1 = desired win           */
+       O_TCPSEQ,               /* u32 = desired seq.           */
+       O_TCPACK,               /* u32 = desired seq.           */
+       O_ICMPTYPE,             /* u32 = icmp bitmap            */
+       O_TCPOPTS,              /* arg1 = 2*u8 bitmap           */
+
+       O_VERREVPATH,           /* none                         */
+       O_VERSRCREACH,          /* none                         */
+
+       O_PROBE_STATE,          /* none                         */
+       O_KEEP_STATE,           /* none                         */
+       O_LIMIT,                /* ipfw_insn_limit              */
+       O_LIMIT_PARENT,         /* dyn_type, not an opcode.     */
+
+       /*
+        * These are really 'actions'.
+        */
+
+       O_LOG,                  /* ipfw_insn_log                */
+       O_PROB,                 /* u32 = match probability      */
+
+       O_CHECK_STATE,          /* none                         */
+       O_ACCEPT,               /* none                         */
+       O_DENY,                 /* none                         */
+       O_REJECT,               /* arg1=icmp arg (same as deny) */
+       O_COUNT,                /* none                         */
+       O_SKIPTO,               /* arg1=next rule number        */
+       O_PIPE,                 /* arg1=pipe number             */
+       O_QUEUE,                /* arg1=queue number            */
+       O_DIVERT,               /* arg1=port number             */
+       O_TEE,                  /* arg1=port number             */
+       O_FORWARD_IP,           /* fwd sockaddr                 */
+       O_FORWARD_MAC,          /* fwd mac                      */
+       O_NAT,                  /* nope                         */
+       O_REASS,                /* none                         */
+       
+       /*
+        * More opcodes.
+        */
+       O_IPSEC,                /* has ipsec history            */
+       O_IP_SRC_LOOKUP,        /* arg1=table number, u32=value */
+       O_IP_DST_LOOKUP,        /* arg1=table number, u32=value */
+       O_ANTISPOOF,            /* none                         */
+       O_JAIL,                 /* u32 = id                     */
+       O_ALTQ,                 /* u32 = altq classif. qid      */
+       O_DIVERTED,             /* arg1=bitmap (1:loop, 2:out)  */
+       O_TCPDATALEN,           /* arg1 = tcp data len          */
+       O_IP6_SRC,              /* address without mask         */
+       O_IP6_SRC_ME,           /* my addresses                 */
+       O_IP6_SRC_MASK,         /* address with the mask        */
+       O_IP6_DST,
+       O_IP6_DST_ME,
+       O_IP6_DST_MASK,
+       O_FLOW6ID,              /* for flow id tag in the ipv6 pkt */
+       O_ICMP6TYPE,            /* icmp6 packet type filtering  */
+       O_EXT_HDR,              /* filtering for ipv6 extension header */
+       O_IP6,
+
+       /*
+        * actions for ng_ipfw
+        */
+       O_NETGRAPH,             /* send to ng_ipfw              */
+       O_NGTEE,                /* copy to ng_ipfw              */
+
+       O_IP4,
+
+       O_UNREACH6,             /* arg1=icmpv6 code arg (deny)  */
+
+       O_TAG,                  /* arg1=tag number */
+       O_TAGGED,               /* arg1=tag number */
+
+       O_SETFIB,               /* arg1=FIB number */
+       O_FIB,                  /* arg1=FIB desired fib number */
+
+       O_LAST_OPCODE           /* not an opcode!               */
+};
+
+/*
+ * Extension headers are filtered only for presence, using a bit
+ * vector with a flag for each header.
+ */
+#define EXT_FRAGMENT   0x1
+#define EXT_HOPOPTS    0x2
+#define EXT_ROUTING    0x4
+#define EXT_AH         0x8
+#define EXT_ESP                0x10
+#define EXT_DSTOPTS    0x20
+#define EXT_RTHDR0             0x40
+#define EXT_RTHDR2             0x80
+
+/*
+ * Template for instructions.
+ *
+ * ipfw_insn is used for all instructions which require no operands,
+ * a single 16-bit value (arg1), or a couple of 8-bit values.
+ *
+ * For other instructions which require different/larger arguments
+ * we have derived structures, ipfw_insn_*.
+ *
+ * The size of the instruction (in 32-bit words) is in the low
+ * 6 bits of "len". The 2 remaining bits are used to implement
+ * NOT and OR on individual instructions. Given a type, you can
+ * compute the length to be put in "len" using F_INSN_SIZE(t)
+ *
+ * F_NOT       negates the match result of the instruction.
+ *
+ * F_OR                is used to build or blocks. By default, instructions
+ *             are evaluated as part of a logical AND. An "or" block
+ *             { X or Y or Z } contains F_OR set in all but the last
+ *             instruction of the block. A match will cause the code
+ *             to skip past the last instruction of the block.
+ *
+ * NOTA BENE: in a couple of places we assume that
+ *     sizeof(ipfw_insn) == sizeof(u_int32_t)
+ * this needs to be fixed.
+ *
+ */
+typedef struct _ipfw_insn {    /* template for instructions */
+       u_int8_t        opcode; /* one of enum ipfw_opcodes */
+       u_int8_t        len;    /* F_NOT, F_OR and size in 32-bit words */
+#define        F_NOT           0x80    /* negate the match result */
+#define        F_OR            0x40    /* set in all but the last insn of an or block */
+#define        F_LEN_MASK      0x3f    /* low 6 bits: size in 32-bit words */
+#define        F_LEN(cmd)      ((cmd)->len & F_LEN_MASK)
+
+       u_int16_t       arg1;   /* 16-bit argument, meaning depends on opcode */
+} ipfw_insn;
+
+/*
+ * The F_INSN_SIZE(type) computes the size, in 4-byte words, of
+ * a given type.
+ */
+#define        F_INSN_SIZE(t)  ((sizeof (t))/sizeof(u_int32_t))
+
+/*
+ * This is used to store an array of 16-bit entries (ports etc.)
+ */
+typedef struct _ipfw_insn_u16 {
+       ipfw_insn o;
+       u_int16_t ports[2];     /* there may be more */
+} ipfw_insn_u16;
+
+/*
+ * This is used to store an array of 32-bit entries
+ * (uid, single IPv4 addresses etc.)
+ */
+typedef struct _ipfw_insn_u32 {
+       ipfw_insn o;
+       u_int32_t d[1]; /* one or more */
+} ipfw_insn_u32;
+
+/*
+ * This is used to store IP addr-mask pairs.
+ */
+typedef struct _ipfw_insn_ip {
+       ipfw_insn o;
+       struct in_addr  addr;
+       struct in_addr  mask;
+} ipfw_insn_ip;
+
+/*
+ * This is used to forward to a given address (ip).
+ */
+typedef struct  _ipfw_insn_sa {
+       ipfw_insn o;
+       struct sockaddr_in sa;
+} ipfw_insn_sa;
+
+/*
+ * This is used for MAC addr-mask pairs.
+ */
+typedef struct _ipfw_insn_mac {
+       ipfw_insn o;
+       u_char addr[12];        /* dst[6] + src[6] */
+       u_char mask[12];        /* dst[6] + src[6] */
+} ipfw_insn_mac;
+
+/*
+ * This is used for interface match rules (recv xx, xmit xx).
+ */
+typedef struct _ipfw_insn_if {
+       ipfw_insn o;
+       union {
+               struct in_addr ip;
+               int glob;
+       } p;
+       char name[IFNAMSIZ];
+} ipfw_insn_if;
+
+/*
+ * This is used for storing an altq queue id number.
+ */
+typedef struct _ipfw_insn_altq {
+       ipfw_insn       o;
+       u_int32_t       qid;
+} ipfw_insn_altq;
+
+/*
+ * This is used for limit rules.
+ */
+typedef struct _ipfw_insn_limit {
+       ipfw_insn o;
+       u_int8_t _pad;
+       u_int8_t limit_mask;    /* combination of DYN_* below   */
+#define        DYN_SRC_ADDR    0x1
+#define        DYN_SRC_PORT    0x2
+#define        DYN_DST_ADDR    0x4
+#define        DYN_DST_PORT    0x8
+
+       u_int16_t conn_limit;
+} ipfw_insn_limit;
+
+/*
+ * This is used for log instructions.
+ */
+typedef struct  _ipfw_insn_log {
+        ipfw_insn o;
+       u_int32_t max_log;      /* how many do we log -- 0 = all */
+       u_int32_t log_left;     /* how many left to log         */
+} ipfw_insn_log;
+
+/*
+ * Data structures required by both ipfw(8) and ipfw(4) but not part of the
+ * management API are protected by IPFW_INTERNAL.
+ */
+#ifdef IPFW_INTERNAL
+/* Server pool support (LSNAT). */
+struct cfg_spool {
+       LIST_ENTRY(cfg_spool)   _next;          /* chain of spool instances */
+       struct in_addr          addr;
+       u_short                 port;
+};
+#endif
+
+/* Redirect modes id. */
+#define REDIR_ADDR      0x01
+#define REDIR_PORT      0x02
+#define REDIR_PROTO     0x04
+
+#ifdef IPFW_INTERNAL
+/* Nat redirect configuration. */
+struct cfg_redir {
+       LIST_ENTRY(cfg_redir)   _next;          /* chain of redir instances */
+       u_int16_t               mode;           /* type of redirect mode */
+       struct in_addr          laddr;          /* local ip address */
+       struct in_addr          paddr;          /* public ip address */
+       struct in_addr          raddr;          /* remote ip address */
+       u_short                 lport;          /* local port */
+       u_short                 pport;          /* public port */
+       u_short                 rport;          /* remote port  */
+       u_short                 pport_cnt;      /* number of public ports */
+       u_short                 rport_cnt;      /* number of remote ports */
+       int                     proto;          /* protocol: tcp/udp */
+       struct alias_link       **alink;        
+       /* number of entries in the spool chain */
+       u_int16_t               spool_cnt;      
+       /* chain of spool instances */
+       LIST_HEAD(spool_chain, cfg_spool) spool_chain;
+};
+#endif
+
+#define NAT_BUF_LEN     1024
+
+#ifdef IPFW_INTERNAL
+/* Nat configuration data struct. */
+struct cfg_nat {
+       /* chain of nat instances */
+       LIST_ENTRY(cfg_nat)     _next;
+       int                     id;                     /* nat id */
+       struct in_addr          ip;                     /* nat ip address */
+       char                    if_name[IF_NAMESIZE];   /* interface name */
+       int                     mode;                   /* aliasing mode */
+       struct libalias         *lib;                   /* libalias instance */
+       /* number of entries in the redir chain */
+       int                     redir_cnt;              
+       /* chain of redir instances */
+       LIST_HEAD(redir_chain, cfg_redir) redir_chain;  
+};
+#endif
+
+#define SOF_NAT         sizeof(struct cfg_nat)
+#define SOF_REDIR       sizeof(struct cfg_redir)
+#define SOF_SPOOL       sizeof(struct cfg_spool)
+
+/* Nat command: the generic instruction header followed by a pointer
+ * to a struct cfg_nat. */
+typedef struct _ipfw_insn_nat {
+       ipfw_insn       o;
+       struct cfg_nat *nat;    
+} ipfw_insn_nat;
+
+/* Apply ipv6 mask on ipv6 addr */
+#define APPLY_MASK(addr,mask)                          \
+    (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \
+    (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \
+    (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \
+    (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3];
+
+/* Structure for ipv6: the generic instruction header followed by an
+ * IPv6 address and an IPv6 mask. */
+typedef struct _ipfw_insn_ip6 {
+       ipfw_insn o;
+       struct in6_addr addr6;
+       struct in6_addr mask6;
+} ipfw_insn_ip6;
+
+/* Used to support icmp6 types */
+typedef struct _ipfw_insn_icmp6 {
+       ipfw_insn o;
+       uint32_t d[7]; /* XXX This number is related to ICMP6_MAXTYPE,
+                       *     defined in netinet/icmp6.h, as follows:
+                       *     n = ICMP6_MAXTYPE/32 + 1
+                       *     ICMP6_MAXTYPE is currently 203, hence
+                       *     203/32 + 1 = 7 words.
+                       */
+} ipfw_insn_icmp6;
+
+/*
+ * Here we have the structure representing an ipfw rule.
+ *
+ * It starts with a general area (with link fields and counters)
+ * followed by an array of one or more instructions, which the code
+ * accesses as an array of 32-bit values.
+ *
+ * Given a rule pointer  r:
+ *
+ *  r->cmd             is the start of the first instruction.
+ *  ACTION_PTR(r)      is the start of the first action (things to do
+ *                     once a rule matched).
+ *
+ * When assembling instruction, remember the following:
+ *
+ *  + if a rule has a "keep-state" (or "limit") option, then the
+ *     first instruction (at r->cmd) MUST BE an O_PROBE_STATE
+ *  + if a rule has a "log" option, then the first action
+ *     (at ACTION_PTR(r)) MUST be O_LOG
+ *  + if a rule has an "altq" option, it comes after "log"
+ *  + if a rule has an O_TAG option, it comes after "log" and "altq"
+ *
+ * NOTE: we use a simple linked list of rules because we never need
+ *     to delete a rule without scanning the list. We do not use
+ *     queue(3) macros for portability and readability.
+ */
+
+struct ip_fw {
+       struct ip_fw    *x_next;        /* linked list of rules         */
+       struct ip_fw    *next_rule;     /* ptr to next [skipto] rule    */
+       /* 'next_rule' is used to pass up 'set_disable' status          */
+
+       uint16_t        act_ofs;        /* offset of action in 32-bit units */
+       uint16_t        cmd_len;        /* # of 32-bit words in cmd     */
+       uint16_t        rulenum;        /* rule number                  */
+       uint8_t set;            /* rule set (0..31)             */
+#define        RESVD_SET       31      /* set for default and persistent rules */
+       uint8_t         _pad;           /* padding                      */
+       uint32_t        id;             /* rule id */
+
+       /* These fields are present in all rules.                       */
+       uint64_t        pcnt;           /* Packet counter               */
+       uint64_t        bcnt;           /* Byte counter                 */
+       uint32_t        timestamp;      /* tv_sec of last match         */
+
+       /*
+        * Declared with a single element, but cmd_len 32-bit words are
+        * meant to be stored starting here; RULESIZE() computes the
+        * resulting size of the whole rule.
+        */
+       ipfw_insn       cmd[1];         /* storage for commands         */
+};
+
+/*
+ * Pointer to the first action instruction of a rule: rule->cmd advanced
+ * by act_ofs 32-bit words.
+ * The whole expansion is parenthesized so that the result can be used
+ * directly inside a larger expression (e.g. ACTION_PTR(r)->field or
+ * ACTION_PTR(r) + 1) without the cast binding to the wrong operand.
+ */
+#define ACTION_PTR(rule)                               \
+       ((ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ))
+
+/*
+ * Total size in bytes of a rule: the fixed struct plus cmd_len 32-bit
+ * words.  NOTE(review): the trailing "- 4" presumably compensates for
+ * the one-element cmd[] placeholder already counted by sizeof(struct
+ * ip_fw), which assumes sizeof(ipfw_insn) == 4 -- confirm.
+ */
+#define RULESIZE(rule)  (sizeof(struct ip_fw) + \
+       ((struct ip_fw *)(rule))->cmd_len * 4 - 4)
+
+/*
+ * This structure is used as a flow mask and a flow id for various
+ * parts of the code.
+ * The IPv4 fields (dst_ip, src_ip) and the IPv6 fields (dst_ip6,
+ * src_ip6, flow_id6, frag_id6) sit side by side in the same struct;
+ * addr_type records which kind of address the entry carries.
+ */
+struct ipfw_flow_id {
+       u_int32_t       dst_ip;
+       u_int32_t       src_ip;
+       u_int16_t       dst_port;
+       u_int16_t       src_port;
+       u_int8_t        fib;
+       u_int8_t        proto;
+       u_int8_t        flags;  /* protocol-specific flags */
+       uint8_t         addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */
+       struct in6_addr dst_ip6;        /* could also store MAC addr! */
+       struct in6_addr src_ip6;
+       u_int32_t       flow_id6;
+       u_int32_t       frag_id6;
+};
+
+/* True when the flow id pointed to by 'id' has addr_type == 6 (ipv6). */
+#define IS_IP6_FLOW_ID(id)     ((id)->addr_type == 6)
+
+/*
+ * Dynamic ipfw rule.
+ * Each entry points back to the static rule it was created from,
+ * optionally to a parent dynamic rule, and carries its own counters,
+ * the (masked) flow id it matches and an expire time.
+ */
+typedef struct _ipfw_dyn_rule ipfw_dyn_rule;
+
+struct _ipfw_dyn_rule {
+       ipfw_dyn_rule   *next;          /* linked list of rules.        */
+       struct ip_fw *rule;             /* pointer to rule              */
+       /* 'rule' is used to pass up the rule number (from the parent)  */
+
+       ipfw_dyn_rule *parent;          /* pointer to parent rule       */
+       u_int64_t       pcnt;           /* packet match counter         */
+       u_int64_t       bcnt;           /* byte match counter           */
+       struct ipfw_flow_id id;         /* (masked) flow id             */
+       u_int32_t       expire;         /* expire time                  */
+       u_int32_t       bucket;         /* which bucket in hash table   */
+       u_int32_t       state;          /* state of this rule (typically a
+                                        * combination of TCP flags)
+                                        */
+       u_int32_t       ack_fwd;        /* most recent ACKs in forward  */
+       u_int32_t       ack_rev;        /* and reverse directions (used */
+                                       /* to generate keepalives)      */
+       u_int16_t       dyn_type;       /* rule type                    */
+       u_int16_t       count;          /* refcount                     */
+};
+
+/*
+ * Definitions for IP option names.
+ * Each value is a distinct single bit, so several options can be
+ * OR-ed together into one mask.
+ */
+#define        IP_FW_IPOPT_LSRR        0x01
+#define        IP_FW_IPOPT_SSRR        0x02
+#define        IP_FW_IPOPT_RR          0x04
+#define        IP_FW_IPOPT_TS          0x08
+
+/*
+ * Definitions for TCP option names.
+ * As above, one bit per option.
+ */
+#define        IP_FW_TCPOPT_MSS        0x01
+#define        IP_FW_TCPOPT_WINDOW     0x02
+#define        IP_FW_TCPOPT_SACK       0x04
+#define        IP_FW_TCPOPT_TS         0x08
+#define        IP_FW_TCPOPT_CC         0x10
+
+/* Both fake codes are 0x100, i.e. one past the largest 8-bit value. */
+#define        ICMP_REJECT_RST         0x100   /* fake ICMP code (send a TCP RST) */
+#define        ICMP6_UNREACH_RST       0x100   /* fake ICMPv6 code (send a TCP RST) */
+
+/*
+ * These are used for lookup tables.
+ * ipfw_table_entry describes one address/mask -> value mapping;
+ * ipfw_table is a header followed by a variable number of entries.
+ */
+typedef struct _ipfw_table_entry {
+       in_addr_t       addr;           /* network address              */
+       u_int32_t       value;          /* value                        */
+       u_int16_t       tbl;            /* table number                 */
+       u_int8_t        masklen;        /* mask length                  */
+} ipfw_table_entry;
+
+typedef struct _ipfw_table {
+       u_int32_t       size;           /* size of entries in bytes     */
+       u_int32_t       cnt;            /* # of entries                 */
+       u_int16_t       tbl;            /* table number                 */
+       /* zero-length trailing array: the entries are stored right
+        * after the header, so the struct must be over-allocated */
+       ipfw_table_entry ent[0];        /* entries                      */
+} ipfw_table;
+
+#endif /* _IPFW2_H */
diff --git a/dummynet2/include/netinet/ipfw/ip_fw_private.h b/dummynet2/include/netinet/ipfw/ip_fw_private.h
new file mode 100644 (file)
index 0000000..41ae845
--- /dev/null
@@ -0,0 +1,329 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 200601 2009-12-16 10:48:40Z luigi $
+ */
+
+#ifndef _IPFW2_PRIVATE_H
+#define _IPFW2_PRIVATE_H
+
+/*
+ * Internal constants and data structures used by ipfw components
+ * and not meant to be exported outside the kernel.
+ */
+
+#ifdef _KERNEL
+
+/* Cookie values identifying ipfw mbuf tags.
+ * NOTE(review): presumably passed as the cookie of the m_tag API -- confirm. */
+#define MTAG_IPFW      1148380143      /* IPFW-tagged cookie */
+#define MTAG_IPFW_RULE 1262273568      /* rule reference */
+
+/* Return values from ipfw_chk() */
+/* The values are implicit and consecutive starting from IP_FW_PASS = 0,
+ * so reordering the enumerators changes their numeric values. */
+enum {
+       IP_FW_PASS = 0,
+       IP_FW_DENY,
+       IP_FW_DIVERT,
+       IP_FW_TEE,
+       IP_FW_DUMMYNET,
+       IP_FW_NETGRAPH,
+       IP_FW_NGTEE,
+       IP_FW_NAT,
+       IP_FW_REASS,
+};
+
+/*
+ * Structure for collecting parameters to dummynet for ip6_output forwarding
+ * NOTE(review): the "_or" suffix on every field presumably stands for
+ * "original" (the values the caller had before the packet was queued)
+ * -- confirm against the code that fills this struct.
+ */
+struct _ip6dn_args {
+       struct ip6_pktopts *opt_or;
+       struct route_in6 ro_or;
+       int flags_or;
+       struct ip6_moptions *im6o_or;
+       struct ifnet *origifp_or;
+       struct ifnet *ifp_or;
+       struct sockaddr_in6 dst_or;
+       u_long mtu_or;
+       struct route_in6 ro_pmtu_or;
+};
+
+/*
+ * Reference to an ipfw rule that can be carried outside critical sections.
+ * A rule is identified by rulenum:rule_id which is ordered.
+ * In version chain_id the rule can be found in slot 'slot', so
+ * we don't need a lookup if chain_id == chain->id.
+ *
+ * On exit from the firewall this structure refers to the rule after
+ * the matching one (slot points to the new rule; rulenum:rule_id-1
+ * is the matching rule), and carries additional info (e.g. info often
+ * contains the insn argument or tablearg in the low 16 bits, in host
+ * format).
+ * On entry, the structure is valid if slot > 0, and refers to the
+ * starting rule. 'info' contains the reason for reinject, e.g. divert
+ * port, divert direction, and so on.
+ */
+struct ipfw_rule_ref {
+       uint32_t        slot;           /* slot for matching rule       */
+       uint32_t        rulenum;        /* matching rule number         */
+       uint32_t        rule_id;        /* matching rule id             */
+       uint32_t        chain_id;       /* ruleset id                   */
+       uint32_t        info;           /* see below                    */
+};
+
+/*
+ * Bit layout of a 32-bit info word (NOTE(review): presumably the
+ * 'info' field of struct ipfw_rule_ref -- confirm).
+ * The low 16 bits (IPFW_INFO_MASK) hold a value; the upper bits are
+ * flags.  IPFW_IS_MASK covers exactly the IPFW_IS_DIVERT and
+ * IPFW_IS_DUMMYNET bits.
+ */
+enum {
+       IPFW_INFO_MASK  = 0x0000ffff,
+       IPFW_INFO_OUT   = 0x00000000,   /* outgoing, just for convenience */
+       IPFW_INFO_IN    = 0x80000000,   /* incoming, overloads dir */
+       IPFW_ONEPASS    = 0x40000000,   /* One-pass, do not reinject */
+       IPFW_IS_MASK    = 0x30000000,   /* which source ? */
+       IPFW_IS_DIVERT  = 0x20000000,
+       IPFW_IS_DUMMYNET =0x10000000,
+       IPFW_IS_PIPE    = 0x08000000,   /* pipe = 1, queue = 0 */
+};
+
+/*
+ * Arguments for calling ipfw_chk() and dummynet_io(). We put them
+ * all into a structure because this way it is easier and more
+ * efficient to pass variables around and extend the interface.
+ */
+struct ip_fw_args {
+       struct mbuf     *m;             /* the mbuf chain               */
+       struct ifnet    *oif;           /* output interface             */
+       struct sockaddr_in *next_hop;   /* forward address              */
+
+       /*
+        * On return, it points to the matching rule.
+        * On entry, rule.slot > 0 means the info is valid and
+        * contains the starting rule for an ipfw search.
+        * If chain_id == chain->id && slot > 0 then jump to that slot.
+        * Otherwise, we locate the first rule >= rulenum:rule_id
+        */
+       struct ipfw_rule_ref rule;      /* match/restart info           */
+
+       struct ether_header *eh;        /* for bridged packets          */
+
+       struct ipfw_flow_id f_id;       /* grabbed from IP header       */
+       /* uint32_t     cookie;            a cookie depending on rule action
+        * (field currently disabled) */
+       struct inpcb    *inp;
+
+       struct _ip6dn_args      dummypar; /* dummynet->ip6_output */
+       struct sockaddr_in hopstore;    /* store here if cannot use a pointer */
+};
+
+MALLOC_DECLARE(M_IPFW);
+
+/*
+ * Hooks sometimes need to know the direction of the packet
+ * (divert, dummynet, netgraph, ...)
+ * We use a generic definition here, with bits 0-1 indicating the
+ * direction, bit 2 indicating layer 2 or 3, and bits 3-4 indicating
+ * the specific protocol (if necessary).
+ */
+enum {
+       DIR_MASK =      0x3,
+       DIR_OUT =       0,
+       DIR_IN =        1,
+       DIR_FWD =       2,
+       DIR_DROP =      3,
+       PROTO_LAYER2 =  0x4, /* set for layer 2 */
+       /* PROTO_DEFAULT = 0, */
+       PROTO_IPV4 =    0x08,
+       PROTO_IPV6 =    0x10,
+       /* 0x0c has both the PROTO_LAYER2 (0x4) and the 0x08 bit set */
+       PROTO_IFB =     0x0c, /* layer2 + ifbridge */
+    /*  PROTO_OLDBDG =  0x14, unused, old bridge */
+};
+
+/*
+ * wrapper for freeing a packet, in case we need to do more work
+ * On __linux__ builds the packet is handed to netisr_dispatch() with
+ * -1 as first argument; everywhere else it is released with m_freem().
+ * NOTE(review): the meaning of the -1 selector is defined by the Linux
+ * glue code, not visible here -- confirm.
+ */
+#ifdef __linux__
+#define FREE_PKT(m)    netisr_dispatch(-1, m)
+#else
+#define FREE_PKT(m)    m_freem(m)
+#endif
+
+/*
+ * Function definitions.
+ */
+
+/* attach (arg = 1) or detach (arg = 0) hooks */
+int ipfw_attach_hooks(int);
+#ifdef NOTYET
+/* only declared when NOTYET is defined */
+void ipfw_nat_destroy(void);
+#endif
+
+/* In ip_fw_log.c */
+struct ip;     /* forward declaration, enough for the pointer below */
+void ipfw_log_bpf(int);
+void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+       struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+       struct ip *ip);
+/* per-vnet variables, accessed through the V_ prefixed macros */
+VNET_DECLARE(u_int64_t, norule_counter);
+#define        V_norule_counter        VNET(norule_counter)
+VNET_DECLARE(int, verbose_limit);
+#define        V_verbose_limit         VNET(verbose_limit)
+
+/* In ip_fw_dynamic.c */
+
+/* Values are implicit: MATCH_REVERSE = 0 up to MATCH_UNKNOWN = 3. */
+enum { /* result for matching dynamic rules */
+       MATCH_REVERSE = 0,
+       MATCH_FORWARD,
+       MATCH_NONE,
+       MATCH_UNKNOWN,
+};
+
+/*
+ * The lock for dynamic rules is only used once outside the file,
+ * and only to release the result of lookup_dyn_rule().
+ * Eventually we may implement it with a callback on the function.
+ */
+void ipfw_dyn_unlock(void);
+
+struct tcphdr; /* forward declaration, enough for the pointer below */
+struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
+    u_int32_t, u_int32_t, int);
+int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg);
+/* the match direction is returned through *match_direction */
+ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
+       int *match_direction, struct tcphdr *tcp);
+void ipfw_remove_dyn_children(struct ip_fw *rule);
+void ipfw_get_dynamic(char **bp, const char *ep);
+
+void ipfw_dyn_attach(void);    /* uma_zcreate .... */
+void ipfw_dyn_detach(void);    /* uma_zdestroy ... */
+void ipfw_dyn_init(void);      /* per-vnet initialization */
+void ipfw_dyn_uninit(int);     /* per-vnet deinitialization */
+int ipfw_dyn_len(void);
+
+/* common variables */
+/* Each variable is declared per-vnet and is meant to be accessed
+ * through the matching V_ prefixed macro. */
+VNET_DECLARE(int, fw_one_pass);
+#define        V_fw_one_pass           VNET(fw_one_pass)
+
+VNET_DECLARE(int, fw_verbose);
+#define        V_fw_verbose            VNET(fw_verbose)
+
+VNET_DECLARE(struct ip_fw_chain, layer3_chain);
+#define        V_layer3_chain          VNET(layer3_chain)
+
+VNET_DECLARE(u_int32_t, set_disable);
+#define        V_set_disable           VNET(set_disable)
+
+VNET_DECLARE(int, autoinc_step);
+#define V_autoinc_step         VNET(autoinc_step)
+
+/*
+ * State of one ruleset: the static rules (both as a list and as an
+ * array of pointers), the nat entries, the lookup tables and the two
+ * locks protecting them.
+ */
+struct ip_fw_chain {
+       struct ip_fw    *rules;         /* list of rules */
+       struct ip_fw    *reap;          /* list of rules to reap */
+       struct ip_fw    *default_rule;
+       int             n_rules;        /* number of static rules */
+       int             static_len;     /* total len of static rules */
+       struct ip_fw    **map;          /* array of rule ptrs to ease lookup */
+       LIST_HEAD(nat_list, cfg_nat) nat;       /* list of nat entries */
+       struct radix_node_head *tables[IPFW_TABLES_MAX];
+       /* Linux and Windows builds use spinlocks for both locks,
+        * all other builds use rwlocks. */
+#if defined( __linux__ ) || defined( _WIN32 )
+        spinlock_t rwmtx;
+        spinlock_t uh_lock;
+#else
+       struct rwlock   rwmtx;
+       struct rwlock   uh_lock;        /* lock for upper half */
+#endif
+       uint32_t        id;             /* ruleset id */
+};
+
+struct sockopt;        /* used by tcp_var.h */
+
+/*
+ * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
+ * so the variable and the macros must be here.
+ *
+ * Two locks live in struct ip_fw_chain: rwmtx (the IPFW_* macros) and
+ * uh_lock (the IPFW_UH_* macros).
+ * NOTE(review): on __linux__/_WIN32 both fields are spinlock_t, so the
+ * rw_* primitives used below are presumably remapped by the
+ * compatibility headers -- confirm.
+ */
+
+/* Initialize both locks of a chain. */
+#define        IPFW_LOCK_INIT(_chain) do {                     \
+       rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
+       rw_init(&(_chain)->uh_lock, "IPFW UH lock");    \
+       } while (0)
+
+/* Destroy both locks of a chain. */
+#define        IPFW_LOCK_DESTROY(_chain) do {                  \
+       rw_destroy(&(_chain)->rwmtx);                   \
+       rw_destroy(&(_chain)->uh_lock);                 \
+       } while (0)
+
+/* Assert that rwmtx is held for writing. */
+#define        IPFW_WLOCK_ASSERT(_chain)       rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
+
+/* Read/write acquire and release of rwmtx. */
+#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
+#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
+#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+
+/* Read/write acquire and release of uh_lock (the "upper half" lock). */
+#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
+#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
+#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
+#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+
+/* In ip_fw_sockopt.c */
+int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
+int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
+int ipfw_ctl(struct sockopt *sopt);
+int ipfw_chk(struct ip_fw_args *args);
+void ipfw_reap_rules(struct ip_fw *head);
+
+/* In ip_fw_table.c */
+struct radix_node;     /* forward declaration for ipfw_dump_table_entry() */
+/* the value found is returned through *val */
+int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val);
+int ipfw_init_tables(struct ip_fw_chain *ch);
+int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
+void ipfw_flush_tables(struct ip_fw_chain *ch);
+int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value);
+int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
+int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen);
+/* NOTE(review): 'tbl' is uint32_t here while every other table function
+ * above takes a uint16_t table number -- confirm this is intended. */
+int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
+int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);
+
+/* hooks for divert */
+extern void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+
+/* In ip_fw_nat.c */
+
+extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+
+/* signatures of the nat entry points reached through the pointers below */
+typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
+typedef int ipfw_nat_cfg_t(struct sockopt *);
+
+extern ipfw_nat_t *ipfw_nat_ptr;
+/* nat support is considered loaded when the hook pointer is non-NULL */
+#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
+
+extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+/* netgraph prototypes */
+
+typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int);
+extern  ng_ipfw_input_t *ng_ipfw_input_p;
+/* netgraph support is considered loaded when the hook pointer is non-NULL */
+#define NG_IPFW_LOADED  (ng_ipfw_input_p != NULL)
+
+/* size of struct ng_ipfw_tag without its leading struct m_tag part */
+#define TAGSIZ  (sizeof(struct ng_ipfw_tag) - sizeof(struct m_tag))
+
+
+#endif /* _KERNEL */
+#endif /* _IPFW2_PRIVATE_H */
diff --git a/dummynet2/ip_dummynet.c b/dummynet2/ip_dummynet.c
new file mode 100644 (file)
index 0000000..bb34c04
--- /dev/null
@@ -0,0 +1,2370 @@
+/*-
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dummynet.c 200601 2009-12-16 10:48:40Z luigi $");
+
+#define        DUMMYNET_DEBUG
+
+#include "opt_inet6.h"
+
+/*
+ * This module implements IP dummynet, a bandwidth limiter/delay emulator
+ * used in conjunction with the ipfw package.
+ * Description of the data structures used is in ip_dummynet.h
+ * Here you mainly find the following blocks of code:
+ *  + variable declarations;
+ *  + heap management functions;
+ *  + scheduler and dummynet functions;
+ *  + configuration and initialization.
+ *
+ * NOTA BENE: critical sections are protected by the "dummynet lock".
+ *
+ * Most important Changes:
+ *
+ * 011004: KLDable
+ * 010124: Fixed WF2Q behaviour
+ * 010122: Fixed spl protection.
+ * 000601: WF2Q support
+ * 000106: large rewrite, use heaps to handle very many pipes.
+ * 980513:     initial release
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/if.h>    /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>                /* ip_len, ip_off */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ip_var.h>    /* ip_output(), IP_FORWARDING */
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ */
+static dn_key curr_time = 0 ; /* current simulation time */
+
+static int dn_hash_size = 64 ; /* default hash size */
+
+/* statistics on number of queue searches and search steps */
+static long searches, search_steps ;
+static int pipe_expire = 1 ;   /* expire queue if empty */
+static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
+
+static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
+static long pipe_byte_limit = 1024 * 1024;     /* same limit, in bytes */
+
+static int red_lookup_depth = 256;     /* RED - default lookup table depth */
+static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
+static int red_max_pkt_size = 1500;     /* RED - default max packet size */
+
+/* NOTE(review): presumably the timestamps of the previous and of the
+ * current tick, used to compute the tick_* statistics -- confirm. */
+static struct timeval prev_t, t;
+static long tick_last;                 /* Last tick duration (usec). */
+static long tick_delta;                        /* Last vs standard tick diff (usec). */
+static long tick_delta_sum;            /* Accumulated tick difference (usec).*/
+static long tick_adjustment;           /* Tick adjustments done. */
+static long tick_lost;                 /* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static int             io_fast;        /* enable fast dummynet io */
+static unsigned long   io_pkt;         /* packets passed to dummynet */
+static unsigned long   io_pkt_fast;    /* packets that bypassed the scheduler */
+static unsigned long   io_pkt_drop;    /* packets dropped by dummynet */
+
+/*
+ * Three heaps contain queues and pipes that the scheduler handles:
+ *
+ * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
+ *
+ * wfq_ready_heap contains the pipes associated with WF2Q flows
+ *
+ * extract_heap contains pipes associated with delay lines.
+ *
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
+
+/* Forward declarations of the heap primitives and event handlers. */
+static int     heap_init(struct dn_heap *h, int size);
+static int     heap_insert (struct dn_heap *h, dn_key key1, void *p);
+static void    heap_extract(struct dn_heap *h, void *obj);
+static void    transmit_event(struct dn_pipe *pipe, struct mbuf **head,
+                   struct mbuf **tail);
+static void    ready_event(struct dn_flow_queue *q, struct mbuf **head,
+                   struct mbuf **tail);
+static void    ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
+                   struct mbuf **tail);
+
+/*
+ * Hash tables for pipes and flowsets.  HASH() folds the number and
+ * keeps the low 4 bits (mask 0x0f), which matches HASHSIZE == 16; the
+ * mask must be changed together with HASHSIZE.
+ */
+#define        HASHSIZE        16
+#define        HASH(num)       ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
+static struct dn_pipe_head     pipehash[HASHSIZE];     /* all pipes */
+static struct dn_flow_set_head flowsethash[HASHSIZE];  /* all flowsets */
+
+static struct callout dn_timeout;
+
+/* defined outside this file */
+extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+/*
+ * Export tunables and statistics under the net.inet.ip.dummynet sysctl
+ * node (only when SYSCTL_NODE is available).  Entries flagged
+ * CTLFLAG_RD are read-only, those flagged CTLFLAG_RW can be modified.
+ */
+#ifdef SYSCTL_NODE
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
+#if 0  /* curr_time is 64 bit */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
+    CTLFLAG_RD, &curr_time, 0, "Current tick");
+#endif
+/* the two heap entries export the 'size' field of the heap, which
+ * heap_init() sets to the number of allocated slots */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
+    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
+    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
+    CTLFLAG_RD, &searches, 0, "Number of queue searches");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
+    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
+/* note: the sysctl is named max_chain_len but controls dn_max_ratio */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
+    CTLFLAG_RW, &dn_max_ratio, 0,
+    "Max ratio between dynamic queues and buckets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+    CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+    CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+    CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+    CTLFLAG_RD, &tick_diff, 0,
+    "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+    CTLFLAG_RD, &tick_lost, 0,
+    "Number of ticks coalesced by dummynet taskqueue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+    CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+    CTLFLAG_RD, &io_pkt, 0,
+    "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+    CTLFLAG_RD, &io_pkt_fast, 0,
+    "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+    CTLFLAG_RD, &io_pkt_drop, 0,
+    "Number of packets dropped by dummynet.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+    CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+    CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
+#endif
+
+#ifdef DUMMYNET_DEBUG
+int    dummynet_debug = 0;
+#ifdef SYSCTL_NODE
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
+           0, "control debugging printfs");
+#endif
+#define        DPRINTF(X)      if (dummynet_debug) printf X
+#else
+#define        DPRINTF(X)
+#endif
+
+/* Task and task queue used to run dummynet_task(). */
+static struct task     dn_task;
+static struct taskqueue        *dn_tq = NULL;
+static void dummynet_task(void *, int);
+
+/*
+ * The "dummynet lock": a spinlock on Linux/Windows builds, a mutex
+ * elsewhere.
+ * NOTE(review): the macros below use the mtx_* primitives in both
+ * cases, so on Linux/Windows these are presumably remapped onto the
+ * spinlock by the compatibility headers -- confirm.
+ */
+#if defined( __linux__ ) || defined( _WIN32 )
+static DEFINE_SPINLOCK(dummynet_mtx);
+#else
+static struct mtx dummynet_mtx;
+#endif
+#define        DUMMYNET_LOCK_INIT() \
+       mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
+#define        DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
+#define        DUMMYNET_LOCK()         mtx_lock(&dummynet_mtx)
+#define        DUMMYNET_UNLOCK()       mtx_unlock(&dummynet_mtx)
+/* assert that the current context owns the lock */
+#define        DUMMYNET_LOCK_ASSERT()  mtx_assert(&dummynet_mtx, MA_OWNED)
+
+/* Forward declarations; all are file-local except dummynet_drain(),
+ * which has external linkage. */
+static int     config_pipe(struct dn_pipe *p);
+static int     ip_dn_ctl(struct sockopt *sopt);
+
+static void    dummynet(void *);
+static void    dummynet_flush(void);
+static void    dummynet_send(struct mbuf *);
+void           dummynet_drain(void);
+static int     dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+
+/*
+ * Flow queue is idle if:
+ *   1) it's empty for at least 1 tick
+ *      (head == NULL and curr_time > idle_time + 1)
+ *   2) it has invalid timestamp (WF2Q case)
+ *      (S == F + 1)
+ *   3) parent pipe has no 'exhausted' burst.
+ *      (numbytes plus the bandwidth accumulated since idle_time + 1
+ *      is at least the pipe's burst)
+ *
+ * The argument is evaluated several times and must have no side
+ * effects; the macro also reads the file-level curr_time.
+ */
+#define QUEUE_IS_IDLE(q) ((q)->head == NULL && (q)->S == (q)->F + 1 && \
+       curr_time > (q)->idle_time + 1 && \
+       ((q)->numbytes + (curr_time - (q)->idle_time - 1) * \
+       (q)->fs->pipe->bandwidth >= (q)->fs->pipe->burst))
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+/* index of the parent of node x */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+/* index of the left child of node x */
+#define HEAP_LEFT(x) ( 2*(x) + 1 )
+/* non-zero for odd indexes, i.e. those of the form 2i+1 (left children) */
+#define HEAP_IS_LEFT(x) ( (x) & 1 )
+/* index of the right child of node x */
+#define HEAP_RIGHT(x) ( 2*(x) + 2 )
+/* exchange a and b using 'buffer' as scratch storage; expands to a
+ * braced block, not to an expression */
+#define        HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+/* used by heap_init() as a bit mask to round sizes up to a multiple
+ * of HEAP_INCREMENT + 1 = 16 entries */
+#define HEAP_INCREMENT 15
+
+/*
+ * Grow heap 'h' so that it can hold at least 'new_size' entries.
+ *
+ * The requested size is rounded up with the HEAP_INCREMENT mask, a new
+ * array is allocated with M_NOWAIT, the existing entries (if any) are
+ * copied over and the old array is released.
+ *
+ * A request that does not exceed the current size is reported as a
+ * bogus call and treated as success.
+ *
+ * Returns 0 on success, 1 if the allocation failed (in which case the
+ * heap is left untouched).
+ */
+static int
+heap_init(struct dn_heap *h, int new_size)
+{
+    struct dn_heap_entry *grown;
+    int old_size = h->size;
+
+    /* nothing to do: the heap is already large enough */
+    if (new_size <= old_size) {
+       printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
+               h->size, new_size);
+       return 0;
+    }
+
+    /* round the request up using the HEAP_INCREMENT mask */
+    new_size = (new_size + HEAP_INCREMENT) & ~HEAP_INCREMENT;
+    grown = malloc(new_size * sizeof(*grown), M_DUMMYNET, M_NOWAIT);
+    if (grown == NULL) {
+       printf("dummynet: %s, resize %d failed\n", __func__, new_size );
+       return 1;       /* error */
+    }
+
+    /* carry the existing entries over and drop the old storage */
+    if (old_size > 0) {
+       bcopy(h->p, grown, old_size * sizeof(*grown));
+       free(h->p, M_DUMMYNET);
+    }
+    h->p = grown;
+    h->size = new_size;
+    return 0;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+           *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
+/*
+ * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
+ */
+#define RESET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+           *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+static int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+    int son = h->elements ;
+
+    if (p == NULL)     /* data already there, set starting point */
+       son = key1 ;
+    else {             /* insert new element at the end, possibly resize */
+       son = h->elements ;
+       if (son == h->size) /* need resize... */
+           if (heap_init(h, h->elements+1) )
+               return 1 ; /* failure... */
+       h->p[son].object = p ;
+       h->p[son].key = key1 ;
+       h->elements++ ;
+    }
+    while (son > 0) {                          /* bubble up */
+       int father = HEAP_FATHER(son) ;
+       struct dn_heap_entry tmp  ;
+
+       if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+           break ; /* found right position */
+       /* son smaller than father, swap and repeat */
+       HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+       SET_OFFSET(h, son);
+       son = father ;
+    }
+    SET_OFFSET(h, son);
+    return 0 ;
+}
+
+/*
+ * Remove the top element from the heap, or obj if obj != NULL.
+ *
+ * Removing a specific object requires a heap that records the index of
+ * each element inside the element itself (h->offset > 0): the function
+ * panics otherwise, and also when the recorded index is out of range.
+ * Extracting from an empty heap only logs a warning.
+ */
+static void
+heap_extract(struct dn_heap *h, void *obj)
+{
+    int child, father, max = h->elements - 1 ;
+
+    if (max < 0) {
+       printf("dummynet: warning, extract from empty heap 0x%p\n", h);
+       return ;
+    }
+    father = 0 ; /* default: move up smallest child */
+    if (obj != NULL) { /* extract specific element, index is at offset */
+       if (h->offset <= 0)
+           panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
+       father = *((int *)((char *)obj + h->offset)) ;
+       if (father < 0 || father >= h->elements) {
+           printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+               father, h->elements);
+           panic("dummynet: heap_extract");
+       }
+    }
+    /* mark the element being removed as no longer in the heap */
+    RESET_OFFSET(h, father);
+    /*
+     * Walk down from the removed slot, each time pulling the smaller
+     * child up into the hole, until the hole reaches the bottom level.
+     */
+    child = HEAP_LEFT(father) ;                /* left child */
+    while (child <= max) {             /* valid entry */
+       if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+           child = child+1 ;           /* take right child, otherwise left */
+       h->p[father] = h->p[child] ;
+       SET_OFFSET(h, father);
+       father = child ;
+       child = HEAP_LEFT(child) ;   /* left child for next loop */
+    }
+    h->elements-- ;
+    if (father != max) {
+       /*
+        * Fill hole with last entry and bubble up, reusing the insert code
+        */
+       h->p[father] = h->p[max] ;
+       heap_insert(h, father, NULL); /* this one cannot fail */
+    }
+}
+
+#if 0
+/*
+ * Change the key of an object already in the heap to new_key, move the
+ * object to its new position and update the stored references.
+ * Only usable on heaps that record the element index (h->offset > 0).
+ * XXX this one is never used, and is compiled out.
+ */
+static void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+    int temp;
+    int i ;
+    int max = h->elements-1 ;
+    struct dn_heap_entry buf ;
+
+    if (h->offset <= 0)
+       panic("cannot move items on this heap");
+
+    /* current position of the object, as recorded in the object itself */
+    i = *((int *)((char *)object + h->offset));
+    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
+       h->p[i].key = new_key ;
+       for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+                i = temp ) { /* bubble up */
+           HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+           SET_OFFSET(h, i);
+       }
+    } else {           /* must move down */
+       h->p[i].key = new_key ;
+       while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+           if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+               temp++ ; /* select child with min key */
+           if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+               HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+               SET_OFFSET(h, i);
+           } else
+               break ;
+           i = temp ;
+       }
+    }
+    /* record the final position of the moved object */
+    SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() rebuilds the heap property over the entries currently stored
+ * in the array, by bubbling up every position in turn through
+ * heap_insert(). It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+    int pos = 0;
+
+    while (pos < h->elements) {
+       /* p == NULL: the entry is already in place, only reorder it */
+       heap_insert(h, pos, NULL);
+       pos++;
+    }
+}
+
+/*
+ * Release the array owned by the heap, if one was ever allocated, and
+ * zero the whole descriptor.
+ */
+static void
+heap_free(struct dn_heap *h)
+{
+    if (h->size > 0) {
+       /* size > 0 means heap_init() allocated an array for us */
+       free(h->p, M_DUMMYNET);
+    }
+    bzero(h, sizeof(*h));
+}
+
+/*
+ * --- end of heap management functions ---
+ */
+
+/*
+ * Dispose of a list of packets linked through m_nextpkt. This is an
+ * inline function so that, if we ever need to free extra state
+ * associated with a packet, there is a central point to do it.
+ */
+
+static __inline void dn_free_pkts(struct mbuf *mnext)
+{
+       struct mbuf *cur, *following;
+
+       for (cur = mnext; cur != NULL; cur = following) {
+               /* fetch the link before the packet is released */
+               following = cur->m_nextpkt;
+               FREE_PKT(cur);
+       }
+}
+
+/*
+ * Return the mbuf tag holding the dummynet state.  As an optimization
+ * this is assumed to be the first tag on the list.  If this turns out
+ * wrong we'll need to search the list.
+ *
+ * The returned pointer is the area that immediately follows the m_tag
+ * header. The tag is only validated through KASSERT, so the result is
+ * computed from the first tag whatever it is when the assertion does
+ * not stop execution; in particular callers cannot rely on a NULL
+ * return to detect a missing tag.
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+    struct m_tag *mtag = m_tag_first(m);
+    KASSERT(mtag != NULL &&
+           mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
+           mtag->m_tag_id == PACKET_TAG_DUMMYNET,
+           ("packet on dummynet queue w/o dummynet tag!"));
+    /* the dummynet state is stored right after the tag header */
+    return (struct dn_pkt_tag *)(mtag+1);
+}
+
+/*
+ * Scheduler functions:
+ *
+ * transmit_event() is called when the delay-line needs to enter
+ * the scheduler, either because of existing pkts getting ready,
+ * or new packets entering the queue. The event handled is the delivery
+ * time of the packet.
+ *
+ * ready_event() does something similar with fixed-rate queues, and the
+ * event handled is the finish time of the head pkt.
+ *
+ * ready_event_wfq() does something similar with WF2Q queues, and the
+ * event handled is the start time of the head pkt.
+ *
+ * In all cases, we make sure that the data structures are consistent
+ * before passing pkts out, because this might trigger recursive
+ * invocations of the procedures.
+ */
+
+/*
+ * Move every packet of the pipe's delay line whose output_time is not
+ * later than curr_time to the output list described by *head and *tail
+ * (appending to it), and terminate that list. If packets are left in
+ * the delay line, the pipe is put in extract_heap keyed by the
+ * output_time of the new first packet.
+ */
+static void
+transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
+{
+       struct mbuf *m;
+       struct dn_pkt_tag *pkt;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       while ((m = pipe->head) != NULL) {
+               pkt = dn_tag_get(m);
+               /* stop at the first packet that is not due yet */
+               if (!DN_KEY_LEQ(pkt->output_time, curr_time))
+                       break;
+
+               /* unlink from the delay line, append to the output list */
+               pipe->head = m->m_nextpkt;
+               if (*tail != NULL)
+                       (*tail)->m_nextpkt = m;
+               else
+                       *head = m;
+               *tail = m;
+       }
+       /* the last moved packet may still point into the delay line */
+       if (*tail != NULL)
+               (*tail)->m_nextpkt = NULL;
+
+       /* If there are leftover packets, put into the heap for next event. */
+       if ((m = pipe->head) != NULL) {
+               pkt = dn_tag_get(m);
+               /*
+                * XXX Should check errors on heap_insert, by draining the
+                * whole pipe p and hoping in the future we are more successful.
+                */
+               heap_insert(&extract_heap, pkt->output_time, pipe);
+       }
+}
+
+#ifndef __linux__
+#define div64(a, b)    ((int64_t)(a) / (int64_t)(b))
+#endif
+/*
+ * Compute how many ticks we have to wait before being able to send
+ * a packet. This is the "wire time" of the packet (length plus extra
+ * bits), minus the credit available in q->numbytes, scaled to ticks and
+ * rounded up. The result is clamped to 0, since it could be negative
+ * when there is too much leftover credit in q->numbytes.
+ * p->bandwidth is used as the divisor, so it must not be 0.
+ */
+static inline dn_key
+set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p)
+{
+       int64_t wait;
+
+       wait = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz
+               - q->numbytes + p->bandwidth - 1 , p->bandwidth);
+       return (wait < 0) ? 0 : wait;
+}
+
+/*
+ * Convert the additional MAC overheads/delays into an equivalent
+ * number of bits for the given data rate. One of the pipe's samples is
+ * picked at random; samples are in milliseconds, hence the division by
+ * 1000. When the index of the chosen sample is at or above the pipe's
+ * loss_level, the packet is also marked to be dropped.
+ * Returns 0 when the pipe has no samples.
+ */
+static dn_key
+compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p)
+{
+       struct dn_pkt_tag *tag;
+       dn_key bits;
+       int slot;
+
+       if (p->samples == NULL || p->samples_no == 0)
+               return 0;
+
+       slot = random() % p->samples_no;
+       bits = div64((dn_key)p->samples[slot] * p->bandwidth, 1000);
+       if (slot < p->loss_level)
+               return bits;
+
+       /* the sample falls in the loss region: flag the packet */
+       tag = dn_tag_get(pkt);
+       if (tag != NULL)
+               tag->dn_dir = DIR_DROP;
+       return bits;
+}
+
+/*
+ * Release a pipe descriptor together with its samples array, if any.
+ */
+static void
+free_pipe(struct dn_pipe *p)
+{
+       if (p->samples != NULL) {
+               free(p->samples, M_DUMMYNET);
+       }
+       free(p, M_DUMMYNET);
+}
+
+/*
+ * Take pkt, the head of flow queue q, out of the queue, compute its
+ * output time (could be now, when the pipe has no delay) and append it
+ * to the delay line of pipe p. len is the packet length used to update
+ * the byte count of the queue.
+ */
+static void
+move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
+    int len)
+{
+    struct dn_pkt_tag *tag = dn_tag_get(pkt);
+
+    /* unlink from the flow queue and fix its counters */
+    q->head = pkt->m_nextpkt;
+    q->len_bytes -= len;
+    q->len--;
+
+    tag->output_time = curr_time + p->delay;
+
+    /* append to the delay line, which may be empty */
+    if (p->head != NULL)
+       p->tail->m_nextpkt = pkt;
+    else
+       p->head = pkt;
+    p->tail = pkt;
+    pkt->m_nextpkt = NULL;
+}
+
+/*
+ * ready_event() is invoked every time the queue must enter the
+ * scheduler, either because the first packet arrives, or because
+ * a previously scheduled event fired.
+ * On invocation, drain as many pkts as possible (could be 0) and then
+ * if there are leftover packets reinsert the queue in the scheduler
+ * (ready_heap).
+ * Packets that are ready for delivery are appended to the list
+ * described by *head and *tail, through transmit_event().
+ */
+static void
+ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
+{
+       struct mbuf *pkt;
+       struct dn_pipe *p = q->fs->pipe;
+       int p_was_empty;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       if (p == NULL) {
+               printf("dummynet: ready_event- pipe is gone\n");
+               return;
+       }
+       /* remember whether the delay line was empty before we add to it */
+       p_was_empty = (p->head == NULL);
+
+       /*
+        * Schedule fixed-rate queues linked to this pipe:
+        * account for the bw accumulated since last scheduling, then
+        * drain as many pkts as allowed by q->numbytes and move to
+        * the delay line (in p) computing output time.
+        * bandwidth==0 (no limit) means we can drain the whole queue,
+        * setting len_scaled = 0 does the job.
+        */
+       q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
+       while ((pkt = q->head) != NULL) {
+               int len = pkt->m_pkthdr.len;
+               /* cost of the packet, in the same units as q->numbytes */
+               dn_key len_scaled = p->bandwidth ? len*8*hz
+                       + q->extra_bits*hz
+                       : 0;
+
+               /* not enough credit for this packet: stop draining */
+               if (DN_KEY_GT(len_scaled, q->numbytes))
+                       break;
+               q->numbytes -= len_scaled;
+               move_pkt(pkt, q, p, len);
+               /* extra_bits always refers to the current head packet */
+               if (q->head)
+                       q->extra_bits = compute_extra_bits(q->head, p);
+       }
+       /*
+        * If we have more packets queued, schedule next ready event
+        * (can only occur when bandwidth != 0, otherwise we would have
+        * flushed the whole queue in the previous loop).
+        * To this purpose we record the current time and compute how many
+        * ticks to go for the finish time of the packet.
+        */
+       if ((pkt = q->head) != NULL) {  /* this implies bandwidth != 0 */
+               dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */
+
+               q->sched_time = curr_time;
+               heap_insert(&ready_heap, curr_time + t, (void *)q);
+               /*
+                * XXX Should check errors on heap_insert, and drain the whole
+                * queue on error hoping next time we are luckier.
+                */
+       } else          /* RED needs to know when the queue becomes empty. */
+               q->idle_time = curr_time;
+
+       /*
+        * If the delay line was empty call transmit_event() now.
+        * Otherwise, the scheduler will take care of it.
+        */
+       if (p_was_empty)
+               transmit_event(p, head, tail);
+}
+
+/*
+ * Called when we can transmit packets on WF2Q queues. Take pkts out of
+ * the queues at their start time, and enqueue into the delay line.
+ * Packets are drained until p->numbytes < 0. As long as
+ * len_scaled >= p->numbytes, the packet goes into the delay line
+ * with a deadline p->delay. For the last packet, if p->numbytes < 0,
+ * there is an additional delay.
+ */
+static void
+ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
+{
+       int p_was_empty = (p->head == NULL);
+       struct dn_heap *sch = &(p->scheduler_heap);
+       struct dn_heap *neh = &(p->not_eligible_heap);
+       int64_t p_numbytes = p->numbytes;
+
+       /*
+        * p->numbytes is only 32bits in FBSD7, but we might need 64 bits.
+        * Use a local variable for the computations, and write back the
+        * results when done, saturating if needed.
+        * The local variable has no impact on performance and helps
+        * reducing diffs between the various branches.
+        */
+
+       DUMMYNET_LOCK_ASSERT();
+
+       if (p->if_name[0] == 0)         /* tx clock is simulated */
+               p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
+       else {  /*
+                * tx clock is for real,
+                * the ifq must be empty or this is a NOP.
+                */
+#ifdef __linux__
+               return;
+#else
+               if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
+                       return;
+               else {
+                       DPRINTF(("dummynet: pipe %d ready from %s --\n",
+                           p->pipe_nr, p->if_name));
+               }
+#endif
+       }
+
+       /*
+        * While we have backlogged traffic AND credit, we need to do
+        * something on the queue.
+        */
+       while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
+               if (sch->elements > 0) {
+                       /* Have some eligible pkts to send out. */
+                       struct dn_flow_queue *q = sch->p[0].object;
+                       struct mbuf *pkt = q->head;
+                       struct dn_flow_set *fs = q->fs;
+                       uint64_t len = pkt->m_pkthdr.len;
+                       int len_scaled = p->bandwidth ? len * 8 * hz : 0;
+
+                       heap_extract(sch, NULL); /* Remove queue from heap. */
+                       p_numbytes -= len_scaled;
+                       move_pkt(pkt, q, p, len);
+
+                       p->V += div64((len << MY_M), p->sum);   /* Update V. */
+                       q->S = q->F;                    /* Update start time. */
+                       if (q->len == 0) {
+                               /* Flow not backlogged any more. */
+                               fs->backlogged--;
+                               heap_insert(&(p->idle_heap), q->F, q);
+                       } else {
+                               /* Still backlogged. */
+
+                               /*
+                                * Update F and position in backlogged queue,
+                                * then put flow in not_eligible_heap
+                                * (we will fix this later).
+                                */
+                               len = (q->head)->m_pkthdr.len;
+                               q->F += div64((len << MY_M), fs->weight);
+                               if (DN_KEY_LEQ(q->S, p->V))
+                                       heap_insert(neh, q->S, q);
+                               else
+                                       heap_insert(sch, q->F, q);
+                       }
+               }
+               /*
+                * Now compute V = max(V, min(S_i)). Remember that all elements
+                * in sch have by definition S_i <= V so if sch is not empty,
+                * V is surely the max and we must not update it. Conversely,
+                * if sch is empty we only need to look at neh.
+                */
+               if (sch->elements == 0 && neh->elements > 0)
+                       p->V = MAX64(p->V, neh->p[0].key);
+               /* Move from neh to sch any packets that have become eligible */
+               while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
+                       struct dn_flow_queue *q = neh->p[0].object;
+                       heap_extract(neh, NULL);
+                       heap_insert(sch, q->F, q);
+               }
+
+               if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
+                       p_numbytes = -1;        /* Mark not ready for I/O. */
+                       break;
+               }
+       }
+       if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0) {
+               p->idle_time = curr_time;
+               /*
+                * No traffic and no events scheduled.
+                * We can get rid of idle-heap.
+                */
+               if (p->idle_heap.elements > 0) {
+                       int i;
+
+                       for (i = 0; i < p->idle_heap.elements; i++) {
+                               struct dn_flow_queue *q;
+                               
+                               q = p->idle_heap.p[i].object;
+                               q->F = 0;
+                               q->S = q->F + 1;
+                       }
+                       p->sum = 0;
+                       p->V = 0;
+                       p->idle_heap.elements = 0;
+               }
+       }
+       /*
+        * If we are getting clocks from dummynet (not a real interface) and
+        * If we are under credit, schedule the next ready event.
+        * Also fix the delivery time of the last packet.
+        */
+       if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
+               dn_key t = 0;           /* Number of ticks i have to wait. */
+
+               if (p->bandwidth > 0)
+                       t = div64(p->bandwidth - 1 - p_numbytes, p->bandwidth);
+               dn_tag_get(p->tail)->output_time += t;
+               p->sched_time = curr_time;
+               heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
+               /*
+                * XXX Should check errors on heap_insert, and drain the whole
+                * queue on error hoping next time we are luckier.
+                */
+       }
+
+       /* Write back p_numbytes (adjust 64->32bit if necessary). */
+       p->numbytes = p_numbytes;
+
+       /*
+        * If the delay line was empty call transmit_event() now.
+        * Otherwise, the scheduler will take care of it.
+        */
+       if (p_was_empty)
+               transmit_event(p, head, tail);
+}
+
+/*
+ * Callout handler. It is armed by dummynet_task() with a delay of one
+ * tick, so it is called one tick after the previous run, and it is used
+ * to schedule the next run: all it does is enqueue dn_task on the dn_tq
+ * taskqueue (presumably dn_task runs dummynet_task(); the binding is
+ * not visible here). The argument is unused.
+ */
+static void
+dummynet(void * __unused unused)
+{
+
+       taskqueue_enqueue(dn_tq, &dn_task);
+}
+
+/*
+ * The main dummynet processing function.
+ *
+ * With the dummynet lock held it updates the tick statistics and
+ * curr_time, serves every event of the three event heaps that is due,
+ * and tries to expire one idle flow queue per pipe. The packets that
+ * became ready are collected in a local list and sent out after the
+ * lock has been released; finally the callout is re-armed for the next
+ * tick.
+ *
+ * context is not used; pending is used to count lost (coalesced) ticks.
+ * t and prev_t are not declared in this function.
+ */
+static void
+dummynet_task(void *context, int pending)
+{
+       struct mbuf *head = NULL, *tail = NULL;
+       struct dn_pipe *pipe;
+       struct dn_heap *heaps[3];
+       struct dn_heap *h;
+       void *p;        /* generic parameter to handler */
+       int i;
+
+       DUMMYNET_LOCK();
+
+       heaps[0] = &ready_heap;                 /* fixed-rate queues */
+       heaps[1] = &wfq_ready_heap;             /* wfq queues */
+       heaps[2] = &extract_heap;               /* delay line */
+
+       /* Update number of lost(coalesced) ticks. */
+       tick_lost += pending - 1;
+       getmicrouptime(&t);
+       /* Last tick duration (usec). */
+       tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
+           (t.tv_usec - prev_t.tv_usec);
+       /* Last tick vs standard tick difference (usec). */
+       tick_delta = (tick_last * hz - 1000000) / hz;
+       /* Accumulated tick difference (usec). */
+       tick_delta_sum += tick_delta;
+       prev_t = t;
+       /*
+        * Adjust curr_time if accumulated tick difference greater than
+        * 'standard' tick. Since curr_time should be monotonically increasing,
+        * we do positive adjustment as required and throttle curr_time in
+        * case of negative adjustment.
+        */
+       curr_time++;
+       if (tick_delta_sum - tick >= 0) {
+               int diff = tick_delta_sum / tick;
+               curr_time += diff;
+               tick_diff += diff;
+               tick_delta_sum %= tick;
+               tick_adjustment++;
+       } else if (tick_delta_sum + tick <= 0) {
+               /* undo the increment above: curr_time stays put this tick */
+               curr_time--;
+               tick_diff--;
+               tick_delta_sum += tick;
+               tick_adjustment++;
+       }
+
+       /* Serve, heap by heap, every event whose key is not after curr_time. */
+       for (i = 0; i < 3; i++) {
+               h = heaps[i];
+               while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
+                       /*
+                        * NOTE(review): the loop condition already requires
+                        * key <= curr_time according to DN_KEY_LEQ, so this
+                        * plain comparison looks like it can only be true if
+                        * the two comparisons disagree (presumably on counter
+                        * wrap-around) -- confirm against DN_KEY_LEQ.
+                        */
+                       if (h->p[0].key > curr_time)
+                               printf("dummynet: warning, "
+                                   "heap %d is %d ticks late\n",
+                                   i, (int)(curr_time - h->p[0].key));
+                       /* store a copy before heap_extract */
+                       p = h->p[0].object;
+                       /* need to extract before processing */
+                       heap_extract(h, NULL);
+                       if (i == 0)
+                               ready_event(p, &head, &tail);
+                       else if (i == 1) {
+                               /* this local shadows the function-level pipe */
+                               struct dn_pipe *pipe = p;
+                               if (pipe->if_name[0] != '\0')
+                                       printf("dummynet: bad ready_event_wfq "
+                                           "for pipe %s\n", pipe->if_name);
+                               else
+                                       ready_event_wfq(p, &head, &tail);
+                       } else
+                               transmit_event(p, &head, &tail);
+               }
+       }
+
+       /*
+        * Sweep pipes trying to expire idle flow_queues. At most one queue
+        * per pipe is taken out of the idle heap on each run.
+        */
+       for (i = 0; i < HASHSIZE; i++) {
+               SLIST_FOREACH(pipe, &pipehash[i], next) {
+                       if (pipe->idle_heap.elements > 0 &&
+                           DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
+                               struct dn_flow_queue *q =
+                                   pipe->idle_heap.p[0].object;
+
+                               heap_extract(&(pipe->idle_heap), NULL);
+                               /* Mark timestamp as invalid. */
+                               q->S = q->F + 1;
+                               pipe->sum -= q->fs->weight;
+                       }
+               }
+       }
+
+       DUMMYNET_UNLOCK();
+
+       /* Deliver the collected packets only after dropping the lock. */
+       if (head != NULL)
+               dummynet_send(head);
+
+       /* Re-arm the callout so that we run again at the next tick. */
+       callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+
+/*
+ * Deliver a list of packets, linked through m_nextpkt, that left the
+ * delay line. Each packet is detached from the list and handed over to
+ * the stack according to the direction stored in its dummynet tag; a
+ * packet with no tag, or with an unknown direction, is freed.
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+       struct mbuf *n;
+
+       for (; m != NULL; m = n) {
+               struct ifnet *ifp = NULL;
+               int dst;
+               struct m_tag *tag;
+
+               /* detach the packet: m may be consumed or replaced below */
+               n = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               tag = m_tag_first(m);
+               if (tag == NULL) {
+                       dst = DIR_DROP;
+               } else {
+                       struct dn_pkt_tag *pkt = dn_tag_get(m);
+                       /* extract the dummynet info, rename the tag */
+                       dst = pkt->dn_dir;
+                       ifp = pkt->ifp;
+                       /* rename the tag so it carries reinject info */
+                       tag->m_tag_cookie = MTAG_IPFW_RULE;
+                       tag->m_tag_id = 0;
+               }
+
+               switch (dst) {
+               case DIR_OUT:
+                       SET_HOST_IPLEN(mtod(m, struct ip *));
+                       ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+                       break ;
+               case DIR_IN :
+                       /* put header in network format for ip_input() */
+                       //SET_NET_IPLEN(mtod(m, struct ip *));
+                       netisr_dispatch(NETISR_IP, m);
+                       break;
+#ifdef INET6
+               case DIR_IN | PROTO_IPV6:
+                       netisr_dispatch(NETISR_IPV6, m);
+                       break;
+
+               case DIR_OUT | PROTO_IPV6:
+                       /*
+                        * NOTE(review): the packet is IPv6 here, yet its
+                        * header is passed to SET_HOST_IPLEN as a struct ip.
+                        * If that macro rewrites IPv4 header fields this
+                        * would alter the IPv6 header -- confirm against
+                        * the macro definition.
+                        */
+                       SET_HOST_IPLEN(mtod(m, struct ip *));
+                       ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+                       break;
+#endif
+               case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+                       if (bridge_dn_p != NULL)
+                               ((*bridge_dn_p)(m, ifp));
+                       else
+                               printf("dummynet: if_bridge not loaded\n");
+
+                       break;
+               case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+                       /*
+                        * The Ethernet code assumes the Ethernet header is
+                        * contiguous in the first mbuf header.
+                        * Ensure this is true.
+                        */
+                       if (m->m_len < ETHER_HDR_LEN &&
+                           (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+                               printf("dummynet/ether: pullup failed, "
+                                   "dropping packet\n");
+                               break;
+                       }
+                       ether_demux(m->m_pkthdr.rcvif, m);
+                       break;
+               case DIR_OUT | PROTO_LAYER2: /* DN_TO_ETH_OUT: */
+                       ether_output_frame(ifp, m);
+                       break;
+
+               case DIR_DROP:
+                       /* drop the packet after some time */
+                       FREE_PKT(m);
+                       break;
+
+               default:
+                       printf("dummynet: bad switch %d!\n", dst);
+                       FREE_PKT(m);
+                       break;
+               }
+       }
+}
+
+/*
+ * Unconditionally expire the idle queues of a flow set in case of
+ * shortage. Every hash bucket is visited, including the overflow bucket
+ * stored at index rq_size. The sweep is done at most once per value of
+ * time_uptime for each flow set.
+ * Returns the number of queues freed.
+ */
+static int
+expire_queues(struct dn_flow_set *fs)
+{
+    struct dn_flow_queue *q, **link;
+    int i, initial_elements = fs->rq_elements;
+
+    /* Already swept at this time_uptime value: do not try again. */
+    if (fs->last_expired == time_uptime)
+       return 0;
+    fs->last_expired = time_uptime;
+
+    for (i = 0; i <= fs->rq_size; i++) {       /* last one is overflow */
+       /* link always points to the pointer that refers to q */
+       link = &fs->rq[i];
+       while ((q = *link) != NULL) {
+           if (QUEUE_IS_IDLE(q)) {
+               /* entry is idle: unlink it and release it */
+               *link = q->next;
+               fs->rq_elements--;
+               free(q, M_DUMMYNET);
+           } else {
+               link = &q->next;
+           }
+       }
+    }
+    return initial_elements - fs->rq_elements;
+}
+
+/*
+ * If there is room, create a new queue and put it at the head of slot i;
+ * otherwise, use the overflow queue stored in slot rq_size, creating it
+ * if it does not exist yet. "No room" means that the flow set holds more
+ * than rq_size * dn_max_ratio queues and expire_queues() could not free
+ * any of them.
+ * Returns the queue, or NULL if a new queue cannot be allocated.
+ */
+static struct dn_flow_queue *
+create_queue(struct dn_flow_set *fs, int i)
+{
+       struct dn_flow_queue *new_q;
+       int no_room;
+
+       /* expire_queues() is only tried once the limit has been exceeded */
+       no_room = fs->rq_elements > fs->rq_size * dn_max_ratio &&
+           expire_queues(fs) == 0;
+       if (no_room) {
+               /* No way to get room, use or create overflow queue. */
+               i = fs->rq_size;
+               if (fs->rq[i] != NULL)
+                       return fs->rq[i];
+       }
+
+       new_q = malloc(sizeof(*new_q), M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (new_q == NULL) {
+               printf("dummynet: sorry, cannot allocate queue for new flow\n");
+               return (NULL);
+       }
+
+       new_q->fs = fs;
+       new_q->hash_slot = i;
+       new_q->S = new_q->F + 1;        /* hack - mark timestamp as invalid. */
+       new_q->numbytes = fs->pipe->burst + (io_fast ? fs->pipe->bandwidth : 0);
+
+       /* link the new queue at the head of its slot */
+       new_q->next = fs->rq[i];
+       fs->rq[i] = new_q;
+       fs->rq_elements++;
+       return (new_q);
+}
+
+/*
+ * Given a flow_set and the flow id of a packet, find the matching queue
+ * after appropriate masking. The queue is moved to the front of its
+ * bucket so that further searches take less time. If no queue matches,
+ * a new one is obtained from create_queue().
+ *
+ * When the flow set has a flow mask, *id is modified in place: the mask
+ * is applied to it and its flags are cleared. Without a flow mask the
+ * flow set uses the single queue of bucket 0.
+ * While scanning a bucket, queues that are idle are expired if
+ * pipe_expire is set.
+ *
+ * Returns the queue, or NULL if a new queue was needed and could not be
+ * allocated.
+ */
+static struct dn_flow_queue *
+find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
+{
+    int i = 0 ; /* we need i and q for new allocations */
+    struct dn_flow_queue *q, *prev;
+    int is_v6 = IS_IP6_FLOW_ID(id);
+
+    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
+       q = fs->rq[0] ;
+    else {
+       /*
+        * The hash is kept unsigned, so that the bucket index computed
+        * from it can never be negative.
+        */
+       unsigned int hash ;
+
+       /* first, do the masking, then hash */
+       id->dst_port &= fs->flow_mask.dst_port ;
+       id->src_port &= fs->flow_mask.src_port ;
+       id->proto &= fs->flow_mask.proto ;
+       id->flags = 0 ; /* we don't care about this one */
+       if (is_v6) {
+           APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
+           APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
+           id->flow_id6 &= fs->flow_mask.flow_id6;
+
+           hash = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
+
+               ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
+               ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
+
+               ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
+
+               /*
+                * Fold in the upper half of each source word, like the
+                * IPv4 branch does. A left shift by 16 followed by the
+                * 0xffff mask would always yield 0.
+                */
+               ((id->src_ip6.__u6_addr.__u6_addr32[0] >> 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[1] >> 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[2] >> 16) & 0xffff)^
+               ((id->src_ip6.__u6_addr.__u6_addr32[3] >> 16) & 0xffff)^
+
+               (id->dst_port << 1) ^ (id->src_port) ^
+               (id->proto ) ^
+               (id->flow_id6);
+       } else {
+           id->dst_ip &= fs->flow_mask.dst_ip ;
+           id->src_ip &= fs->flow_mask.src_ip ;
+
+           hash = ( (id->dst_ip) & 0xffff ) ^
+               ( (id->dst_ip >> 15) & 0xffff ) ^
+               ( (id->src_ip << 1) & 0xffff ) ^
+               ( (id->src_ip >> 16 ) & 0xffff ) ^
+               (id->dst_port << 1) ^ (id->src_port) ^
+               (id->proto );
+       }
+       i = hash % (unsigned int)fs->rq_size ;
+       /* finally, scan the current list for a match */
+       searches++ ;
+       for (prev=NULL, q = fs->rq[i] ; q ; ) {
+           search_steps++;
+           if (is_v6 &&
+                   IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&
+                   IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&
+                   id->dst_port == q->id.dst_port &&
+                   id->src_port == q->id.src_port &&
+                   id->proto == q->id.proto &&
+                   id->flags == q->id.flags &&
+                   id->flow_id6 == q->id.flow_id6)
+               break ; /* found */
+
+           if (!is_v6 && id->dst_ip == q->id.dst_ip &&
+                   id->src_ip == q->id.src_ip &&
+                   id->dst_port == q->id.dst_port &&
+                   id->src_port == q->id.src_port &&
+                   id->proto == q->id.proto &&
+                   id->flags == q->id.flags)
+               break ; /* found */
+
+           /* No match. Check if we can expire the entry */
+           if (pipe_expire && QUEUE_IS_IDLE(q)) {
+               /* entry is idle and not in any heap, expire it */
+               struct dn_flow_queue *old_q = q ;
+
+               if (prev != NULL)
+                   prev->next = q = q->next ;
+               else
+                   fs->rq[i] = q = q->next ;
+               fs->rq_elements-- ;
+               free(old_q, M_DUMMYNET);
+               continue ;      /* prev is unchanged, q already advanced */
+           }
+           prev = q ;
+           q = q->next ;
+       }
+       if (q && prev != NULL) { /* found and not in front */
+           prev->next = q->next ;
+           q->next = fs->rq[i] ;
+           fs->rq[i] = q ;
+       }
+    }
+    if (q == NULL) { /* no match, need to allocate a new entry */
+       q = create_queue(fs, i);
+       if (q != NULL)
+           q->id = *id ;
+    }
+    return q ;
+}
+
+/*
+ * Apply the RED (Random Early Detection) test to a packet of 'len' bytes
+ * arriving on queue 'q' of flow set 'fs'.
+ *
+ * Updates q->avg, q->count and q->random as a side effect.
+ * Returns 1 if the packet must be dropped, 0 if it can be enqueued.
+ */
+static int
+red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
+{
+       /*
+        * RED algorithm
+        *
+        * RED calculates the average queue size (avg) using a low-pass filter
+        * with an exponential weighted (w_q) moving average:
+        *      avg  <-  (1-w_q) * avg + w_q * q_size
+        * where q_size is the queue length (measured in bytes or packets).
+        *
+        * If q_size == 0, we compute the idle time for the link, and set
+        *      avg = (1 - w_q)^(idle/s)
+        * where s is the time needed for transmitting a medium-sized packet.
+        *
+        * Now, if avg < min_th the packet is enqueued.
+        * If avg > max_th the packet is dropped. Otherwise, the packet is
+        * dropped with probability P function of avg.
+        */
+
+       /* Drop probability; stays 0 when avg is exactly min_th. */
+       int64_t p_b = 0;
+
+       /* Queue in bytes or packets? */
+       u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
+           q->len_bytes : q->len;
+
+       DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
+
+       /* Average queue size estimation. */
+       if (q_size != 0) {
+               /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+               int diff = SCALE(q_size) - q->avg;
+               int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+               q->avg += (int)v;
+       } else {
+               /*
+                * Queue is empty, find for how long the queue has been
+                * empty and use a lookup table for computing
+                * (1 - w_q)^(idle_time/s) where s is the time to send a
+                * (small) packet.
+                * XXX check wraps...
+                */
+               if (q->avg) {
+                       /* Index into the decay table: idle time in lookup steps. */
+                       u_int t = div64(curr_time - q->idle_time,
+                           fs->lookup_step);
+
+                       /* Idle for longer than the table covers: avg decays to 0. */
+                       q->avg = (t < fs->lookup_depth) ?
+                           SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+               }
+       }
+       DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
+
+       /* Should i drop? */
+       if (q->avg < fs->min_th) {
+               /*
+                * -1 makes the next ++q->count below evaluate to 0, which
+                * draws a fresh random threshold instead of testing for a drop.
+                */
+               q->count = -1;
+               return (0);     /* accept packet */
+       }
+       if (q->avg >= fs->max_th) {     /* average queue >=  max threshold */
+               if (fs->flags_fs & DN_IS_GENTLE_RED) {
+                       /*
+                        * According to Gentle-RED, if avg is greater than
+                        * max_th the packet is dropped with a probability
+                        *       p_b = c_3 * avg - c_4
+                        * where c_3 = (1 - max_p) / max_th
+                        *       c_4 = 1 - 2 * max_p
+                        */
+                       p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+                           fs->c_4;
+               } else {
+                       /* Plain RED: always drop above max_th. */
+                       q->count = -1;
+                       DPRINTF(("dummynet: - drop"));
+                       return (1);
+               }
+       } else if (q->avg > fs->min_th) {
+               /*
+                * We compute p_b using the linear dropping function
+                *       p_b = c_1 * avg - c_2
+                * where c_1 = max_p / (max_th - min_th)
+                *       c_2 = max_p * min_th / (max_th - min_th)
+                */
+               p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+       }
+
+       /* In byte mode, weight the probability by len relative to max_pkt_size. */
+       if (fs->flags_fs & DN_QSIZE_IS_BYTES)
+               p_b = div64(p_b * len, fs->max_pkt_size);
+       if (++q->count == 0)
+               q->random = random() & 0xffff;
+       else {
+               /*
+                * q->count counts packets arrived since last drop, so a greater
+                * value of q->count means a greater packet drop probability.
+                */
+               if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+                       q->count = 0;
+                       DPRINTF(("dummynet: - red drop"));
+                       /* After a drop we calculate a new random value. */
+                       q->random = random() & 0xffff;
+                       return (1);     /* drop */
+               }
+       }
+       /* End of RED algorithm. */
+
+       return (0);     /* accept */
+}
+
+/*
+ * Look up the flow set numbered fs_nr in flowsethash.
+ * Returns the matching descriptor, or NULL if none is registered.
+ */
+static __inline struct dn_flow_set *
+locate_flowset(int fs_nr)
+{
+       struct dn_flow_set *cur;
+
+       SLIST_FOREACH(cur, &flowsethash[HASH(fs_nr)], next) {
+               if (cur->fs_nr == fs_nr)
+                       break;
+       }
+
+       /* cur is NULL here when the bucket was walked without a hit. */
+       return (cur);
+}
+
+/*
+ * Look up the pipe numbered pipe_nr in pipehash.
+ * Returns the matching descriptor, or NULL if none is registered.
+ */
+static __inline struct dn_pipe *
+locate_pipe(int pipe_nr)
+{
+       struct dn_pipe *cur;
+
+       SLIST_FOREACH(cur, &pipehash[HASH(pipe_nr)], next) {
+               if (cur->pipe_nr == pipe_nr)
+                       break;
+       }
+
+       /* cur is NULL here when the bucket was walked without a hit. */
+       return (cur);
+}
+
+/*
+ * dummynet hook for packets. Below 'pipe' is a pipe or a queue
+ * depending on whether WF2Q or fixed bw is used.
+ *
+ * m0          pointer to the mbuf with the packet. *m0 is set to NULL
+ *             when the packet is queued or dropped; it is left in place
+ *             only on the "fast io" path at the end of the function.
+ * dir         where shall we send the packet after dummynet; saved in
+ *             the packet tag and tested for PROTO_LAYER2.
+ * fwa         ipfw arguments: fwa->rule.info carries the pipe/queue
+ *             number and the IPFW_IS_PIPE flag, fwa->f_id is the flow id
+ *             used to select the queue, fwa->oif is saved as the
+ *             interface for the packet.
+ *
+ * Returns 0 when the packet is accepted. When it is dropped, returns 0
+ * if the flow set has DN_NOERROR set and ENOBUFS otherwise.
+ */
+static int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+       struct mbuf *m = *m0, *head = NULL, *tail = NULL;
+       struct dn_pkt_tag *pkt;
+       struct m_tag *mtag;
+       struct dn_flow_set *fs = NULL;
+       struct dn_pipe *pipe;
+       uint64_t len = m->m_pkthdr.len;
+       struct dn_flow_queue *q = NULL;
+       int is_pipe = fwa->rule.info & IPFW_IS_PIPE;
+
+       KASSERT(m->m_nextpkt == NULL,
+           ("dummynet_io: mbuf queue passed to dummynet"));
+
+       DUMMYNET_LOCK();
+       io_pkt++;
+       /*
+        * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
+        */
+       if (is_pipe) {
+               pipe = locate_pipe(fwa->rule.info & IPFW_INFO_MASK);
+               if (pipe != NULL)
+                       fs = &(pipe->fs);
+       } else
+               fs = locate_flowset(fwa->rule.info & IPFW_INFO_MASK);
+
+       if (fs == NULL)
+               goto dropit;    /* This queue/pipe does not exist! */
+       pipe = fs->pipe;
+       if (pipe == NULL) {     /* Must be a queue, try find a matching pipe. */
+               pipe = locate_pipe(fs->parent_nr);
+               if (pipe != NULL)
+                       fs->pipe = pipe;        /* Cache it for next time. */
+               else {
+                       printf("dummynet: no pipe %d for queue %d, drop pkt\n",
+                           fs->parent_nr, fs->fs_nr);
+                       goto dropit;
+               }
+       }
+       q = find_queue(fs, &(fwa->f_id));
+       if (q == NULL)
+               goto dropit;            /* Cannot allocate queue. */
+
+       /* Update statistics, then check reasons to drop pkt. */
+       q->tot_bytes += len;
+       q->tot_pkts++;
+       if (fs->plr && random() < fs->plr)
+               goto dropit;            /* Random pkt drop. */
+       if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
+               if (q->len_bytes > fs->qsize)
+                       goto dropit;    /* Queue size overflow. */
+       } else {
+               if (q->len >= fs->qsize)
+                       goto dropit;    /* Queue count overflow. */
+       }
+       if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
+               goto dropit;
+
+       /* XXX expensive to zero, see if we can remove it. */
+       mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+           sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
+       if (mtag == NULL)
+               goto dropit;            /* Cannot allocate packet header. */
+       m_tag_prepend(m, mtag);         /* Attach to mbuf chain. */
+
+       /* The dn_pkt_tag payload is taken from right after the m_tag header. */
+       pkt = (struct dn_pkt_tag *)(mtag + 1);
+       /*
+        * Ok, i can handle the pkt now...
+        * Build and enqueue packet + parameters.
+        */
+       pkt->rule = fwa->rule;
+       pkt->rule.info &= IPFW_ONEPASS; /* only keep this info */
+       pkt->dn_dir = dir;
+       pkt->ifp = fwa->oif;
+
+       /* Append the packet to the tail of the flow queue. */
+       if (q->head == NULL)
+               q->head = m;
+       else
+               q->tail->m_nextpkt = m;
+       q->tail = m;
+       q->len++;
+       q->len_bytes += len;
+
+       if (q->head != m)               /* Flow was not idle, we are done. */
+               goto done;
+
+       if (is_pipe) {                  /* Fixed rate queues. */
+               if (q->idle_time < curr_time) {
+                       /* Calculate available burst size. */
+                       q->numbytes +=
+                           (curr_time - q->idle_time - 1) * pipe->bandwidth;
+                       if (q->numbytes > pipe->burst)
+                               q->numbytes = pipe->burst;
+                       if (io_fast)
+                               q->numbytes += pipe->bandwidth;
+               }
+       } else {                        /* WF2Q. */
+               if (pipe->idle_time < curr_time &&
+                   pipe->scheduler_heap.elements == 0 &&
+                   pipe->not_eligible_heap.elements == 0) {
+                       /* Calculate available burst size. */
+                       pipe->numbytes +=
+                           (curr_time - pipe->idle_time - 1) * pipe->bandwidth;
+                       if (pipe->numbytes > 0 && pipe->numbytes > pipe->burst)
+                               pipe->numbytes = pipe->burst;
+                       if (io_fast)
+                               pipe->numbytes += pipe->bandwidth;
+               }
+               pipe->idle_time = curr_time;
+       }
+       /* Necessary for both: fixed rate & WF2Q queues. */
+       q->idle_time = curr_time;
+
+       /*
+        * If we reach this point the flow was previously idle, so we need
+        * to schedule it. This involves different actions for fixed-rate or
+        * WF2Q queues.
+        */
+       if (is_pipe) {
+               /* Fixed-rate queue: just insert into the ready_heap. */
+               dn_key t = 0;
+
+               /* With bandwidth 0 t stays 0 and the packet is processed now. */
+               if (pipe->bandwidth) {
+                       q->extra_bits = compute_extra_bits(m, pipe);
+                       t = set_ticks(m, q, pipe);
+               }
+               q->sched_time = curr_time;
+               if (t == 0)             /* Must process it now. */
+                       ready_event(q, &head, &tail);
+               else
+                       heap_insert(&ready_heap, curr_time + t , q);
+       } else {
+               /*
+                * WF2Q. First, compute start time S: if the flow was
+                * idle (S = F + 1) set S to the virtual time V for the
+                * controlling pipe, and update the sum of weights for the pipe;
+                * otherwise, remove flow from idle_heap and set S to max(F,V).
+                * Second, compute finish time F = S + len / weight.
+                * Third, if pipe was idle, update V = max(S, V).
+                * Fourth, count one more backlogged flow.
+                */
+               if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */
+                       q->S = pipe->V;
+                       pipe->sum += fs->weight; /* Add weight of new queue. */
+               } else {
+                       heap_extract(&(pipe->idle_heap), q);
+                       q->S = MAX64(q->F, pipe->V);
+               }
+               q->F = q->S + div64(len << MY_M, fs->weight);
+
+               if (pipe->not_eligible_heap.elements == 0 &&
+                   pipe->scheduler_heap.elements == 0)
+                       pipe->V = MAX64(q->S, pipe->V);
+               fs->backlogged++;
+               /*
+                * Look at eligibility. A flow is not eligible if S>V (when
+                * this happens, it means that there is some other flow already
+                * scheduled for the same pipe, so the scheduler_heap cannot be
+                * empty). If the flow is not eligible we just store it in the
+                * not_eligible_heap. Otherwise, we store in the scheduler_heap
+                * and possibly invoke ready_event_wfq() right now if there is
+                * leftover credit.
+                * Note that for all flows in scheduler_heap (SCH), S_i <= V,
+                * and for all flows in not_eligible_heap (NEH), S_i > V.
+                * So when we need to compute max(V, min(S_i)) forall i in
+                * SCH+NEH, we only need to look into NEH.
+                */
+               if (DN_KEY_GT(q->S, pipe->V)) {         /* Not eligible. */
+                       if (pipe->scheduler_heap.elements == 0)
+                               printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
+                       heap_insert(&(pipe->not_eligible_heap), q->S, q);
+               } else {
+                       heap_insert(&(pipe->scheduler_heap), q->F, q);
+                       if (pipe->numbytes >= 0) {       /* Pipe is idle. */
+                               if (pipe->scheduler_heap.elements != 1)
+                                       printf("dummynet: OUCH! pipe should have been idle!\n");
+                               DPRINTF(("dummynet: waking up pipe %d at %d\n",
+                                   pipe->pipe_nr, (int)(q->F >> MY_M)));
+                               pipe->sched_time = curr_time;
+                               ready_event_wfq(pipe, &head, &tail);
+                       }
+               }
+       }
+done:
+       /*
+        * If the only packet handed back for transmission is the one we were
+        * just given (and this is not a layer2 packet), leave it in *m0
+        * instead of sending it through dummynet_send().
+        */
+       if (head == m && (dir & PROTO_LAYER2) == 0 ) {
+               /* Fast io. */
+               io_pkt_fast++;
+               if (m->m_nextpkt != NULL)
+                       printf("dummynet: fast io: pkt chain detected!\n");
+               head = m->m_nextpkt = NULL;
+       } else
+               *m0 = NULL;             /* Normal io. */
+
+       DUMMYNET_UNLOCK();
+       if (head != NULL)
+               dummynet_send(head);
+       return (0);
+
+dropit:
+       io_pkt_drop++;
+       if (q)
+               q->drops++;
+       DUMMYNET_UNLOCK();
+       FREE_PKT(m);
+       *m0 = NULL;
+       /* fs is NULL when the pipe/queue lookup itself failed. */
+       return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
+}
+
+/*
+ * Throw away every packet and every flow_queue hanging off a flow_set.
+ * With all != 0 the RED lookup table and the hash array are released as
+ * well, and so is the descriptor itself unless it is the flow_set
+ * embedded in its own pipe.
+ * For the one in dn_pipe MUST also cleanup ready_heap...
+ */
+static void
+purge_flow_set(struct dn_flow_set *fs, int all)
+{
+       struct dn_flow_queue *cur, *following;
+       int slot;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       /* The array holds rq_size + 1 slots, hence the inclusive bound. */
+       for (slot = 0; slot <= fs->rq_size; slot++) {
+               cur = fs->rq[slot];
+               while (cur != NULL) {
+                       dn_free_pkts(cur->head);
+                       /* Grab the link before the node goes away. */
+                       following = cur->next;
+                       free(cur, M_DUMMYNET);
+                       cur = following;
+               }
+               fs->rq[slot] = NULL;
+       }
+       fs->rq_elements = 0;
+
+       if (!all)
+               return;
+
+       /* RED - free lookup table. */
+       if (fs->w_q_lookup != NULL)
+               free(fs->w_q_lookup, M_DUMMYNET);
+       if (fs->rq != NULL)
+               free(fs->rq, M_DUMMYNET);
+       /* If this fs is not part of a pipe, free it. */
+       if (fs->pipe == NULL || fs != &(fs->pipe->fs))
+               free(fs, M_DUMMYNET);
+}
+
+/*
+ * Release everything queued on a pipe (as opposed to a flow_set) that is
+ * about to be deleted: its embedded flow_set, the packets still on the
+ * pipe, and its three heaps. The pipe descriptor itself is not freed here.
+ */
+static void
+purge_pipe(struct dn_pipe *pipe)
+{
+       struct dn_flow_set *embedded = &(pipe->fs);
+
+       purge_flow_set(embedded, 1);
+       dn_free_pkts(pipe->head);
+
+       heap_free(&(pipe->scheduler_heap));
+       heap_free(&(pipe->not_eligible_heap));
+       heap_free(&(pipe->idle_heap));
+}
+
+/*
+ * Delete all pipes and heaps returning memory. Must also
+ * remove references from all ipfw rules to all pipes.
+ * Takes and releases the dummynet lock itself.
+ */
+static void
+dummynet_flush(void)
+{
+       struct dn_flow_set *set, *set_next;
+       struct dn_pipe *pp, *pp_next;
+       int bucket;
+
+       DUMMYNET_LOCK();
+
+       /* Free heaps so we don't have unwanted events. */
+       heap_free(&ready_heap);
+       heap_free(&wfq_ready_heap);
+       heap_free(&extract_heap);
+
+       /*
+        * Now purge all queued pkts and delete all pipes: first every
+        * flow set in the hash, then every pipe.
+        *
+        * XXXGL: can we merge the for(;;) cycles into one or not?
+        */
+       for (bucket = 0; bucket < HASHSIZE; bucket++) {
+               SLIST_FOREACH_SAFE(set, &flowsethash[bucket], next, set_next) {
+                       SLIST_REMOVE(&flowsethash[bucket], set, dn_flow_set,
+                           next);
+                       purge_flow_set(set, 1);
+               }
+       }
+
+       for (bucket = 0; bucket < HASHSIZE; bucket++) {
+               SLIST_FOREACH_SAFE(pp, &pipehash[bucket], next, pp_next) {
+                       SLIST_REMOVE(&pipehash[bucket], pp, dn_pipe, next);
+                       purge_pipe(pp);
+                       free_pipe(pp);
+               }
+       }
+
+       DUMMYNET_UNLOCK();
+}
+
+/*
+ * Setup RED parameters.
+ *
+ * p   flow set carrying the requested values (w_q, min_th, max_th, max_p,
+ *     lookup_step, lookup_weight).
+ * x   flow set being configured; its flags_fs must already be set.
+ *
+ * Returns 0 on success, EINVAL for unusable thresholds or a zero
+ * red_lookup_depth, ENOSPC if the lookup table cannot be allocated.
+ *
+ * x belongs to the caller (it can be the flow set embedded in a pipe, and
+ * set_fs_parms() keeps using it whatever we return), so it is never freed
+ * here. On failure RED is switched off on x instead, so that red_drops()
+ * is not run against a missing lookup table.
+ */
+static int
+config_red(struct dn_flow_set *p, struct dn_flow_set *x)
+{
+       int i, error;
+
+       /* The divisions below would trap on these values. */
+       if (p->max_th == p->min_th ||
+           ((x->flags_fs & DN_IS_GENTLE_RED) && p->max_th == 0)) {
+               printf("dummynet: invalid RED thresholds\n");
+               error = EINVAL;
+               goto fail;
+       }
+
+       x->w_q = p->w_q;
+       x->min_th = SCALE(p->min_th);
+       x->max_th = SCALE(p->max_th);
+       x->max_p = p->max_p;
+
+       x->c_1 = p->max_p / (p->max_th - p->min_th);
+       x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
+
+       if (x->flags_fs & DN_IS_GENTLE_RED) {
+               x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
+               x->c_4 = SCALE(1) - 2 * p->max_p;
+       }
+
+       /* If the lookup table already exist, free and create it again. */
+       if (x->w_q_lookup) {
+               free(x->w_q_lookup, M_DUMMYNET);
+               x->w_q_lookup = NULL;
+       }
+       if (red_lookup_depth == 0) {
+               printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
+                   " must be > 0\n");
+               error = EINVAL;
+               goto fail;
+       }
+       x->lookup_depth = red_lookup_depth;
+       x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
+           M_DUMMYNET, M_NOWAIT);
+       if (x->w_q_lookup == NULL) {
+               printf("dummynet: sorry, cannot allocate red lookup table\n");
+               error = ENOSPC;
+               goto fail;
+       }
+
+       /* Fill the lookup table with (1 - w_q)^x */
+       x->lookup_step = p->lookup_step;
+       x->lookup_weight = p->lookup_weight;
+       x->w_q_lookup[0] = SCALE(1) - x->w_q;
+
+       for (i = 1; i < x->lookup_depth; i++)
+               x->w_q_lookup[i] =
+                   SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
+
+       /* Fall back to defaults when the tunables hold nonsense. */
+       if (red_avg_pkt_size < 1)
+               red_avg_pkt_size = 512;
+       x->avg_pkt_size = red_avg_pkt_size;
+       if (red_max_pkt_size < 1)
+               red_max_pkt_size = 1500;
+       x->max_pkt_size = red_max_pkt_size;
+       return (0);
+
+fail:
+       /* Leave x valid but with RED disabled and no stale table behind. */
+       if (x->w_q_lookup != NULL) {
+               free(x->w_q_lookup, M_DUMMYNET);
+               x->w_q_lookup = NULL;
+       }
+       x->lookup_depth = 0;
+       x->flags_fs &= ~(DN_IS_RED | DN_IS_GENTLE_RED);
+       return (error);
+}
+
+/*
+ * Size and allocate the flow_queue hash array of flow set x.
+ * With a flow mask the slot count comes from pfs->rq_size (dn_hash_size
+ * when that is 0), clamped to the range 4..DN_MAX_HASH_SIZE; without a
+ * mask a single slot is used. One extra slot is always allocated.
+ * Returns 0 on success, ENOMEM if the array cannot be allocated.
+ */
+static int
+alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
+{
+       int slots = 1;          /* one is enough for null mask */
+
+       if (x->flags_fs & DN_HAVE_FLOW_MASK) {
+               /* allocate some slots */
+               slots = pfs->rq_size;
+               if (slots == 0)
+                       slots = dn_hash_size;
+               if (slots < 4)
+                       slots = 4;
+               else if (slots > DN_MAX_HASH_SIZE)
+                       slots = DN_MAX_HASH_SIZE;
+       }
+       x->rq_size = slots;
+
+       x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
+           M_DUMMYNET, M_NOWAIT | M_ZERO);
+       if (x->rq == NULL) {
+               printf("dummynet: sorry, cannot allocate queue\n");
+               return (ENOMEM);
+       }
+
+       x->rq_elements = 0;
+       return 0;
+}
+
+/*
+ * Copy the user-settable parameters of flow set src into x, clamping the
+ * queue size, and (re)configure RED when the flags ask for it.
+ */
+static void
+set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
+{
+       x->flags_fs = src->flags_fs;
+       x->qsize = src->qsize;
+       x->plr = src->plr;
+       x->flow_mask = src->flow_mask;
+
+       if ((x->flags_fs & DN_QSIZE_IS_BYTES) == 0) {
+               /* Size in slots: unset or over the limit falls back to 50. */
+               if (x->qsize == 0 || x->qsize > pipe_slot_limit)
+                       x->qsize = 50;
+       } else if (x->qsize > pipe_byte_limit) {
+               /* Size in bytes: over the limit falls back to 1MB. */
+               x->qsize = 1024 * 1024;
+       }
+
+       /* Configuring RED. */
+       if (x->flags_fs & DN_IS_RED)
+               config_red(src, x);     /* XXX should check errors */
+}
+
+/*
+ * Setup pipe or queue parameters.
+ *
+ * p describes the request. Exactly one of p->pipe_nr and p->fs.fs_nr
+ * must be nonzero: the former creates or reconfigures a pipe, the latter
+ * a queue (flow set). p->delay and p->burst are rescaled in place.
+ *
+ * Returns 0 on success, EINVAL for an inconsistent request (no or both
+ * numbers given, new queue without parent pipe, attempt to change the
+ * parent of an existing queue), ENOMEM when memory is not available, or
+ * the error reported by alloc_hash().
+ */
+static int
+config_pipe(struct dn_pipe *p)
+{
+       struct dn_flow_set *pfs = &(p->fs);
+       struct dn_flow_queue *q;
+       int i, error, unlinked;
+
+       /*
+        * The config program passes parameters as follows:
+        * bw = bits/second (0 means no limits),
+        * delay = ms, must be translated into ticks.
+        * qsize = slots/bytes
+        */
+       p->delay = (p->delay * hz) / 1000;
+       /* Scale burst size: bytes -> bits * hz */
+       p->burst *= 8 * hz;
+       /* We need either a pipe number or a flow_set number. */
+       if (p->pipe_nr == 0 && pfs->fs_nr == 0)
+               return (EINVAL);
+       if (p->pipe_nr != 0 && pfs->fs_nr != 0)
+               return (EINVAL);
+       if (p->pipe_nr != 0) {                  /* this is a pipe */
+               struct dn_pipe *pipe;
+
+               DUMMYNET_LOCK();
+               pipe = locate_pipe(p->pipe_nr); /* locate pipe */
+
+               if (pipe == NULL) {             /* new pipe */
+                       pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
+                           M_NOWAIT | M_ZERO);
+                       if (pipe == NULL) {
+                               DUMMYNET_UNLOCK();
+                               printf("dummynet: no memory for new pipe\n");
+                               return (ENOMEM);
+                       }
+                       pipe->pipe_nr = p->pipe_nr;
+                       pipe->fs.pipe = pipe;
+                       /*
+                        * idle_heap is the only one from which
+                        * we extract from the middle.
+                        */
+                       pipe->idle_heap.size = pipe->idle_heap.elements = 0;
+                       pipe->idle_heap.offset =
+                           offsetof(struct dn_flow_queue, heap_pos);
+               } else {
+                       /* Flush accumulated credit for all queues. */
+                       for (i = 0; i <= pipe->fs.rq_size; i++) {
+                               for (q = pipe->fs.rq[i]; q; q = q->next) {
+                                       q->numbytes = p->burst +
+                                           (io_fast ? p->bandwidth : 0);
+                               }
+                       }
+               }
+
+               pipe->bandwidth = p->bandwidth;
+               pipe->burst = p->burst;
+               pipe->numbytes = pipe->burst + (io_fast ? pipe->bandwidth : 0);
+               bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
+               pipe->ifp = NULL;               /* reset interface ptr */
+               pipe->delay = p->delay;
+               set_fs_parms(&(pipe->fs), pfs);
+
+               /* Handle changes in the delay profile. */
+               if (p->samples_no > 0) {
+                       if (pipe->samples_no != p->samples_no) {
+                               if (pipe->samples != NULL)
+                                       free(pipe->samples, M_DUMMYNET);
+                               pipe->samples =
+                                   malloc(p->samples_no*sizeof(dn_key),
+                                       M_DUMMYNET, M_NOWAIT | M_ZERO);
+                               if (pipe->samples == NULL) {
+                                       /*
+                                        * The old table is gone: forget its
+                                        * size too, or a later request with
+                                        * that same size would skip the
+                                        * allocation and write through NULL.
+                                        */
+                                       pipe->samples_no = 0;
+                                       /*
+                                        * No hash yet means the pipe was
+                                        * created above and is not linked
+                                        * anywhere: nobody else can free it.
+                                        */
+                                       unlinked = (pipe->fs.rq == NULL);
+                                       if (unlinked &&
+                                           pipe->fs.w_q_lookup != NULL) {
+                                               free(pipe->fs.w_q_lookup,
+                                                   M_DUMMYNET);
+                                               pipe->fs.w_q_lookup = NULL;
+                                       }
+                                       DUMMYNET_UNLOCK();
+                                       printf("dummynet: no memory "
+                                               "for new samples\n");
+                                       if (unlinked)
+                                               free_pipe(pipe);
+                                       return (ENOMEM);
+                               }
+                               pipe->samples_no = p->samples_no;
+                       }
+
+                       strncpy(pipe->name,p->name,sizeof(pipe->name));
+                       /* strncpy() does not terminate on truncation. */
+                       pipe->name[sizeof(pipe->name) - 1] = '\0';
+                       pipe->loss_level = p->loss_level;
+                       for (i = 0; i<pipe->samples_no; ++i)
+                               pipe->samples[i] = p->samples[i];
+               } else if (pipe->samples != NULL) {
+                       /* The new configuration has no delay profile. */
+                       free(pipe->samples, M_DUMMYNET);
+                       pipe->samples = NULL;
+                       pipe->samples_no = 0;
+               }
+
+               if (pipe->fs.rq == NULL) {      /* a new pipe */
+                       error = alloc_hash(&(pipe->fs), pfs);
+                       if (error) {
+                               /* Drop the RED table set_fs_parms() made. */
+                               if (pipe->fs.w_q_lookup != NULL) {
+                                       free(pipe->fs.w_q_lookup, M_DUMMYNET);
+                                       pipe->fs.w_q_lookup = NULL;
+                               }
+                               DUMMYNET_UNLOCK();
+                               free_pipe(pipe);
+                               return (error);
+                       }
+                       SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
+                           pipe, next);
+               }
+               DUMMYNET_UNLOCK();
+       } else {                                /* config queue */
+               struct dn_flow_set *fs;
+
+               DUMMYNET_LOCK();
+               fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
+
+               if (fs == NULL) {               /* new */
+                       if (pfs->parent_nr == 0) { /* need link to a pipe */
+                               DUMMYNET_UNLOCK();
+                               return (EINVAL);
+                       }
+                       fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
+                           M_NOWAIT | M_ZERO);
+                       if (fs == NULL) {
+                               DUMMYNET_UNLOCK();
+                               printf(
+                                   "dummynet: no memory for new flow_set\n");
+                               return (ENOMEM);
+                       }
+                       fs->fs_nr = pfs->fs_nr;
+                       fs->parent_nr = pfs->parent_nr;
+                       /* Keep the weight within 1..100. */
+                       fs->weight = pfs->weight;
+                       if (fs->weight == 0)
+                               fs->weight = 1;
+                       else if (fs->weight > 100)
+                               fs->weight = 100;
+               } else {
+                       /*
+                        * Change parent pipe not allowed;
+                        * must delete and recreate.
+                        */
+                       if (pfs->parent_nr != 0 &&
+                           fs->parent_nr != pfs->parent_nr) {
+                               DUMMYNET_UNLOCK();
+                               return (EINVAL);
+                       }
+               }
+
+               set_fs_parms(fs, pfs);
+
+               if (fs->rq == NULL) {           /* a new flow_set */
+                       error = alloc_hash(fs, pfs);
+                       if (error) {
+                               /* Drop the RED table set_fs_parms() made. */
+                               if (fs->w_q_lookup != NULL)
+                                       free(fs->w_q_lookup, M_DUMMYNET);
+                               DUMMYNET_UNLOCK();
+                               free(fs, M_DUMMYNET);
+                               return (error);
+                       }
+                       SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
+                           fs, next);
+               }
+               DUMMYNET_UNLOCK();
+       }
+       return (0);
+}
+
+/*
+ * Helper: drop from heap h every queue that belongs to flow set fs,
+ * which is about to be deleted. The heap is re-ordered once at the end,
+ * and only if something was actually removed.
+ */
+static void
+fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
+{
+       int idx = 0, removed = 0;
+
+       while (idx < h->elements) {
+               struct dn_flow_queue *fq =
+                   (struct dn_flow_queue *)h->p[idx].object;
+
+               if (fq->fs != fs) {
+                       idx++;
+                       continue;
+               }
+               /*
+                * Overwrite the entry with the last one and look at the
+                * same index again, since it now holds a different queue.
+                */
+               h->elements--;
+               h->p[idx] = h->p[h->elements];
+               removed++;
+       }
+
+       if (removed != 0)
+               heapify(h);
+}
+
+/*
+ * Helper: take pipe p out of heap h. A pipe can be there at most once,
+ * so the scan stops at the first hit.
+ */
+static void
+pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
+{
+       int idx;
+
+       for (idx = 0; idx < h->elements; idx++) {
+               if (h->p[idx].object != p)
+                       continue;
+
+               /* found it: replace with the last entry and re-order */
+               h->elements--;
+               h->p[idx] = h->p[h->elements];
+               heapify(h);
+               return;
+       }
+}
+
+/*
+ * Drain all queues. Called in case of severe mbuf shortage.
+ * Pipes and flow sets stay configured; only queued packets, flow queues
+ * and pending heap entries are discarded. The caller holds the lock.
+ */
+void
+dummynet_drain(void)
+{
+       struct dn_pipe *pp;
+       struct dn_flow_set *set;
+       int bucket;
+
+       DUMMYNET_LOCK_ASSERT();
+
+       heap_free(&ready_heap);
+       heap_free(&wfq_ready_heap);
+       heap_free(&extract_heap);
+
+       /* Empty every flow set in the hash, keeping the descriptors. */
+       for (bucket = 0; bucket < HASHSIZE; bucket++) {
+               SLIST_FOREACH(set, &flowsethash[bucket], next) {
+                       purge_flow_set(set, 0);
+               }
+       }
+
+       /* Same for each pipe, plus the packets held on the pipe itself. */
+       for (bucket = 0; bucket < HASHSIZE; bucket++) {
+               SLIST_FOREACH(pp, &pipehash[bucket], next) {
+                       purge_flow_set(&(pp->fs), 0);
+                       dn_free_pkts(pp->head);
+                       pp->head = pp->tail = NULL;
+               }
+       }
+}
+
+/*
+ * Fully delete a pipe or a queue, cleaning up associated info.
+ * Exactly one of p->pipe_nr and p->fs.fs_nr must be nonzero and selects
+ * what is deleted. Returns 0 on success, EINVAL for a bad request and
+ * ENOENT when the pipe or queue does not exist.
+ */
+static int
+delete_pipe(struct dn_pipe *p)
+{
+       /* Reject "neither" as well as "both". */
+       if ((p->pipe_nr == 0) == (p->fs.fs_nr == 0))
+               return EINVAL;
+
+       if (p->pipe_nr == 0) {
+               /* this is a WF2Q queue (dn_flow_set) */
+               struct dn_flow_set *set;
+
+               DUMMYNET_LOCK();
+               set = locate_flowset(p->fs.fs_nr);
+               if (set == NULL) {
+                       DUMMYNET_UNLOCK();
+                       return (ENOENT);        /* not found */
+               }
+
+               /* Unlink from list of flowsets. */
+               SLIST_REMOVE(&flowsethash[HASH(set->fs_nr)], set,
+                   dn_flow_set, next);
+
+               if (set->pipe != NULL) {
+                       struct dn_pipe *parent = set->pipe;
+
+                       /*
+                        * Update total weight on parent pipe and cleanup
+                        * parent heaps.
+                        */
+                       parent->sum -= set->weight * set->backlogged;
+                       fs_remove_from_heap(&(parent->not_eligible_heap), set);
+                       fs_remove_from_heap(&(parent->scheduler_heap), set);
+                       /* XXX should i remove from idle_heap as well ? */
+                       fs_remove_from_heap(&(parent->idle_heap), set);
+               }
+               purge_flow_set(set, 1);
+               DUMMYNET_UNLOCK();
+               return 0;
+       } else {
+               /* this is an old-style pipe */
+               struct dn_pipe *victim;
+               struct dn_flow_set *set;
+               int bucket;
+
+               DUMMYNET_LOCK();
+               victim = locate_pipe(p->pipe_nr);
+               if (victim == NULL) {
+                       DUMMYNET_UNLOCK();
+                       return (ENOENT);        /* not found */
+               }
+
+               /* Unlink from list of pipes. */
+               SLIST_REMOVE(&pipehash[HASH(victim->pipe_nr)], victim,
+                   dn_pipe, next);
+
+               /* Remove all references to this pipe from flow_sets. */
+               for (bucket = 0; bucket < HASHSIZE; bucket++) {
+                       SLIST_FOREACH(set, &flowsethash[bucket], next) {
+                               if (set->pipe != victim)
+                                       continue;
+                               printf("dummynet: ++ ref to pipe %d from fs %d\n",
+                                   p->pipe_nr, set->fs_nr);
+                               set->pipe = NULL;
+                               purge_flow_set(set, 0);
+                       }
+               }
+               fs_remove_from_heap(&ready_heap, &(victim->fs));
+               /* remove all data associated to this pipe */
+               purge_pipe(victim);
+               /* remove reference to here from extract_heap and wfq_ready_heap */
+               pipe_remove_from_heap(&extract_heap, victim);
+               pipe_remove_from_heap(&wfq_ready_heap, victim);
+               DUMMYNET_UNLOCK();
+
+               /* The descriptor itself is released outside the lock. */
+               free_pipe(victim);
+               return 0;
+       }
+}
+
+/*
+ * Helper for DUMMYNET_GET: copy every queue descriptor hanging off the
+ * hash buckets of 'set' into consecutive slots starting at 'bp'.
+ *
+ * Pointer fields in the copies are cleared. Inconsistencies (wrong hash
+ * slot, wrong back pointer, unexpected element count) are only logged.
+ * Must be called with the dummynet lock held.
+ *
+ * Returns the address just past the last descriptor written.
+ */
+static char *
+dn_copy_set(struct dn_flow_set *set, char *bp)
+{
+    struct dn_flow_queue *src;
+    struct dn_flow_queue *dst = (struct dn_flow_queue *)bp;
+    int slot;
+    int copied = 0;
+
+    DUMMYNET_LOCK_ASSERT();
+
+    /* The bound is inclusive: buckets 0..rq_size are all visited. */
+    for (slot = 0; slot <= set->rq_size; slot++) {
+       src = set->rq[slot];
+       while (src != NULL) {
+           /* Sanity checks; a mismatch is reported but still copied. */
+           if (src->hash_slot != slot)
+               printf("dummynet: ++ at %d: wrong slot (have %d, "
+                   "should be %d)\n", copied, src->hash_slot, slot);
+           if (src->fs != set)
+               printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
+                       slot, src->fs, set);
+           copied++;
+           bcopy(src, dst, sizeof(*src));
+
+           /* Do not hand kernel pointers to the caller. */
+           dst->next = NULL;
+           dst->tail = NULL;
+           dst->head = NULL;
+           dst->fs = NULL;
+
+           src = src->next;
+           dst++;
+       }
+    }
+    if (copied != set->rq_elements)
+       printf("dummynet: ++ wrong count, have %d should be %d\n",
+           copied, set->rq_elements);
+    return (char *)dst;
+}
+
+/*
+ * Compute how many bytes are needed to export the whole configuration:
+ * one descriptor per pipe and per flow set, each followed by one
+ * struct dn_flow_queue per queue it currently holds (rq_elements).
+ * Must be called with the dummynet lock held.
+ */
+static size_t
+dn_calc_size(void)
+{
+    const size_t queue_sz = sizeof(struct dn_flow_queue);
+    struct dn_pipe *p;
+    struct dn_flow_set *set;
+    size_t total = 0;
+    int bucket;
+
+    DUMMYNET_LOCK_ASSERT();
+
+    /* Walk both hash tables bucket by bucket. */
+    for (bucket = 0; bucket < HASHSIZE; bucket++) {
+       SLIST_FOREACH(p, &pipehash[bucket], next) {
+           total += sizeof(*p) + p->fs.rq_elements * queue_sz;
+       }
+       SLIST_FOREACH(set, &flowsethash[bucket], next) {
+           total += sizeof(*set) + set->rq_elements * queue_sz;
+       }
+    }
+    return total;
+}
+
+/*
+ * Handler for IP_DUMMYNET_GET: snapshot every pipe and flow set, each
+ * followed by its queue descriptors, into a temporary buffer and copy
+ * the result out through 'sopt'.
+ *
+ * Returns 0 on success, ENOBUFS if a sufficiently large buffer could
+ * not be obtained in 10 attempts, or the error from sooptcopyout().
+ */
+static int
+dummynet_get(struct sockopt *sopt)
+{
+    char *buf, *bp ; /* bp is the "copy-pointer" */
+    size_t size ;
+    struct dn_flow_set *fs;
+    struct dn_pipe *pipe;
+    int error=0, i ;
+
+    /* XXX lock held too long */
+    DUMMYNET_LOCK();
+    /*
+     * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
+     *      cannot use this flag while holding a mutex.
+     *
+     * The size is computed under the lock, the lock is dropped for the
+     * allocation, and the size is re-checked once the lock is held
+     * again; the loop retries if the configuration grew in between.
+     */
+    for (i = 0; i < 10; i++) {
+       size = dn_calc_size();
+       DUMMYNET_UNLOCK();
+       buf = malloc(size, M_TEMP, M_WAITOK);
+       DUMMYNET_LOCK();
+       if (size >= dn_calc_size())
+               break;
+       free(buf, M_TEMP);
+       buf = NULL;
+    }
+    if (buf == NULL) {
+       DUMMYNET_UNLOCK();
+       return ENOBUFS ;
+    }
+    bp = buf;
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(pipe, &pipehash[i], next) {
+               struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;
+
+               /*
+                * Copy pipe descriptor into *bp, convert delay back to ms,
+                * then copy the flow_set descriptor(s) one at a time.
+                * After each flow_set, copy the queue descriptor it owns.
+                */
+               bcopy(pipe, bp, sizeof(*pipe));
+               pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
+               pipe_bp->burst = div64(pipe_bp->burst, 8 * hz);
+               /*
+                * XXX the following is a hack based on ->next being the
+                * first field in dn_pipe and dn_flow_set. The correct
+                * solution would be to move the dn_flow_set to the beginning
+                * of struct dn_pipe.
+                */
+               pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
+               /* Clean pointers. */
+               pipe_bp->head = pipe_bp->tail = NULL;
+               pipe_bp->fs.next.sle_next = NULL;
+               pipe_bp->fs.pipe = NULL;
+               pipe_bp->fs.rq = NULL;
+               pipe_bp->samples = NULL;
+
+               bp += sizeof(*pipe) ;
+               bp = dn_copy_set(&(pipe->fs), bp);
+       }
+    }
+
+    for (i = 0; i < HASHSIZE; i++) {
+       SLIST_FOREACH(fs, &flowsethash[i], next) {
+               struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;
+
+               bcopy(fs, bp, sizeof(*fs));
+               /* XXX same hack as above */
+               fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+               fs_bp->pipe = NULL;
+               fs_bp->rq = NULL;
+               bp += sizeof(*fs);
+               bp = dn_copy_set(fs, bp);
+       }
+    }
+
+    DUMMYNET_UNLOCK();
+
+    /*
+     * Copy out only the bytes that were actually filled in. The buffer
+     * is allocated without being zeroed and 'size' may be larger than
+     * what was written if the configuration shrank while the lock was
+     * dropped; exporting 'size' bytes would hand uninitialised kernel
+     * memory (and trailing garbage records) to the caller.
+     */
+    error = sooptcopyout(sopt, buf, (size_t)(bp - buf));
+    free(buf, M_TEMP);
+    return error ;
+}
+
+/*
+ * Handler for the various dummynet socket options (get, flush, config, del)
+ *
+ * The caller must hold PRIV_NETINET_DUMMYNET; SOPT_SET requests are
+ * additionally refused at securelevel 3 or above. For CONFIGURE and DEL
+ * the argument is copied into a temporary buffer that is released
+ * before returning.
+ *
+ * Returns 0 on success or an errno value (EINVAL for an unknown option).
+ */
+static int
+ip_dn_ctl(struct sockopt *sopt)
+{
+    int error;
+    struct dn_pipe *p = NULL;
+
+    error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
+    if (error)
+       return (error);
+
+    /* Disallow sets in really-really secure mode. */
+    if (sopt->sopt_dir == SOPT_SET) {
+#if __FreeBSD_version >= 500034
+       error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
+       if (error)
+           return (error);
+#else
+       if (securelevel >= 3)
+           return (EPERM);
+#endif
+    }
+
+    switch (sopt->sopt_name) {
+    default :
+       /* Terminate the message so it does not merge with the next log line. */
+       printf("dummynet: -- unknown option %d\n", sopt->sopt_name);
+       error = EINVAL ;
+       break;
+
+    case IP_DUMMYNET_GET :
+       error = dummynet_get(sopt);
+       break ;
+
+    case IP_DUMMYNET_FLUSH :
+       dummynet_flush() ;
+       break ;
+
+    case IP_DUMMYNET_CONFIGURE :
+       /*
+        * Allocate room for the largest request (struct dn_pipe_max) but
+        * accept as little as a plain struct dn_pipe from the caller.
+        */
+       p = malloc(sizeof(struct dn_pipe_max), M_TEMP, M_WAITOK);
+       error = sooptcopyin(sopt, p, sizeof(struct dn_pipe_max), sizeof *p);
+       if (error)
+           break ;
+       /* Point samples at the array embedded in the copied-in buffer. */
+       if (p->samples_no > 0)
+           p->samples = &(((struct dn_pipe_max *)p)->samples[0]);
+
+       error = config_pipe(p);
+       break ;
+
+    case IP_DUMMYNET_DEL :     /* remove a pipe or queue */
+       p = malloc(sizeof(struct dn_pipe), M_TEMP, M_WAITOK);
+       error = sooptcopyin(sopt, p, sizeof(struct dn_pipe), sizeof *p);
+       if (error)
+           break ;
+
+       error = delete_pipe(p);
+       break ;
+    }
+
+    /* Release the temporary request buffer on every exit path. */
+    if (p != NULL)
+       free(p, M_TEMP);
+
+    return error ;
+}
+
+/*
+ * Initialise dummynet: set up the lock, empty the pipe/flow-set hash
+ * tables and the three global heaps, publish the control and I/O hooks,
+ * start the taskqueue thread and arm the periodic callout.
+ */
+static void
+ip_dn_init(void)
+{
+       int i;
+
+       if (bootverbose)
+               printf("DUMMYNET with IPv6 initialized (040826)\n");
+
+       DUMMYNET_LOCK_INIT();
+
+       for (i = 0; i < HASHSIZE; i++) {
+               SLIST_INIT(&pipehash[i]);
+               SLIST_INIT(&flowsethash[i]);
+       }
+       /* All heaps start out empty, with no backing storage recorded. */
+       ready_heap.size = ready_heap.elements = 0;
+       ready_heap.offset = 0;
+
+       wfq_ready_heap.size = wfq_ready_heap.elements = 0;
+       wfq_ready_heap.offset = 0;
+
+       extract_heap.size = extract_heap.elements = 0;
+       extract_heap.offset = 0;
+
+       /* Publish the entry points used for sockopt control and packet I/O. */
+       ip_dn_ctl_ptr = ip_dn_ctl;
+       ip_dn_io_ptr = dummynet_io;
+
+       /*
+        * NOTE(review): the result of taskqueue_create_fast() (called with
+        * M_NOWAIT) is not checked before it is used -- confirm it cannot
+        * fail here.
+        */
+       TASK_INIT(&dn_task, 0, dummynet_task, NULL);
+       dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
+           taskqueue_thread_enqueue, &dn_tq);
+       taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
+
+       /* Arm the callout so that dummynet() is invoked after one tick. */
+       callout_init(&dn_timeout, CALLOUT_MPSAFE);
+       callout_reset(&dn_timeout, 1, dummynet, NULL);
+
+       /* Initialize curr_time adjustment mechanics. */
+       getmicrouptime(&prev_t);
+}
+
+#ifdef KLD_MODULE
+/*
+ * Tear down dummynet on module unload (only built for KLD_MODULE):
+ * withdraw the hooks, stop the callout, drain and free the taskqueue,
+ * flush the configuration and finally destroy the lock.
+ */
+static void
+ip_dn_destroy(void)
+{
+       /* Withdraw the hooks first. */
+       ip_dn_ctl_ptr = NULL;
+       ip_dn_io_ptr = NULL;
+
+       DUMMYNET_LOCK();
+       callout_stop(&dn_timeout);
+       DUMMYNET_UNLOCK();
+       /* Wait for a pending dn_task before freeing its queue. */
+       taskqueue_drain(dn_tq, &dn_task);
+       taskqueue_free(dn_tq);
+
+       dummynet_flush();
+
+       DUMMYNET_LOCK_DESTROY();
+}
+#endif /* KLD_MODULE */
+
+static int
+dummynet_modevent(module_t mod, int type, void *data)
+{
+
+       switch (type) {
+       case MOD_LOAD:
+               if (ip_dn_io_ptr) {
+                   printf("DUMMYNET already loaded\n");
+                   return EEXIST ;
+               }
+               ip_dn_init();
+               break;
+
+       case MOD_UNLOAD:
+#if !defined(KLD_MODULE)
+               printf("dummynet statically compiled, cannot unload\n");
+               return EINVAL ;
+#else
+               ip_dn_destroy();
+#endif
+               break ;
+       default:
+               return EOPNOTSUPP;
+               break ;
+       }
+       return 0 ;
+}
+
+/* Module descriptor: name, event handler, and no extra argument. */
+static moduledata_t dummynet_mod = {
+       "dummynet",
+       dummynet_modevent,
+       NULL
+};
+/* Register the module with the kernel's module/SYSINIT machinery. */
+DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+/* dummynet requires the ipfw module; min, preferred and max version are all 2. */
+MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
+MODULE_VERSION(dummynet, 1);
+/* end of file */
diff --git a/dummynet2/ip_fw2.c b/dummynet2/ip_fw2.c
new file mode 100644 (file)
index 0000000..3cc08e7
--- /dev/null
@@ -0,0 +1,2466 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 200601 2009-12-16 10:48:40Z luigi $");
+
+/*
+ * The FreeBSD IP packet firewall, main file
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/jail.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pf_mtag.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ip_carp.h>
+#include <netinet/pim.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <netinet/sctp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/scope6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <machine/in_cksum.h>  /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * static variables followed by global ones.
+ * All ipfw global variables are here.
+ */
+
+/* ipfw_vnet_ready controls when we are open for business */
+static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
+#define        V_ipfw_vnet_ready       VNET(ipfw_vnet_ready)
+
+/* Exported below as the deny_unknown_exthdrs sysctl (INET6 builds). */
+static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
+#define        V_fw_deny_unknown_exthdrs       VNET(fw_deny_unknown_exthdrs)
+
+/*
+ * Policy of the default rule: 1 when the kernel is built with
+ * IPFIREWALL_DEFAULT_TO_ACCEPT, otherwise zero-initialised. Also
+ * settable through the net.inet.ip.fw.default_to_accept tunable.
+ */
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+static int default_to_accept = 1;
+#else
+static int default_to_accept;
+#endif
+
+/* Rule number auto-increment step (see the autoinc_step sysctl). */
+VNET_DEFINE(int, autoinc_step);
+
+/*
+ * Each rule belongs to one of 32 different sets (0..31).
+ * The variable set_disable contains one bit per set.
+ * If the bit is set, all rules in the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted individually.
+ */
+VNET_DEFINE(u_int32_t, set_disable);
+#define        V_set_disable                   VNET(set_disable)
+
+/* Non-zero to log matches to ipfw rules (see the verbose sysctl). */
+VNET_DEFINE(int, fw_verbose);
+/* counter for ipfw_log(NULL...) */
+VNET_DEFINE(u_int64_t, norule_counter);
+/* Upper limit of logged matches (see the verbose_limit sysctl). */
+VNET_DEFINE(int, verbose_limit);
+
+/* layer3_chain contains the list of rules for layer 3 */
+VNET_DEFINE(struct ip_fw_chain, layer3_chain);
+
+/*
+ * Hooks for NAT support.
+ * NOTE(review): presumably filled in by the ipfw NAT module when it is
+ * loaded and NULL otherwise -- confirm against ip_fw_nat.c.
+ */
+ipfw_nat_t *ipfw_nat_ptr = NULL;
+struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#ifdef SYSCTL_NODE
+/* net.inet.ip.fw: sysctl tree for the IPv4 firewall knobs. */
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
+    "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+    CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+    "Rule number auto-increment step");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
+    "Log matches to ipfw rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
+    CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
+    "Set upper limit of matches of ipfw rules logged");
+/*
+ * dummy_def and dummy_tables_max only exist so that the compile-time
+ * constants can be exported through read-only sysctls.
+ */
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
+    &dummy_def, 0,
+    "The default/max possible rule number.");
+uint32_t dummy_tables_max = IPFW_TABLES_MAX;
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
+    &dummy_tables_max, 0,
+    "The maximum number of tables.");
+/* Read-only at run time; can be set as a tunable of the same name. */
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
+    &default_to_accept, 0,
+    "Make the default rule accept all packets.");
+TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+    CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+    "Number of static rules");
+
+#ifdef INET6
+/* net.inet6.ip6.fw: sysctl tree for the IPv6-specific firewall knobs. */
+SYSCTL_DECL(_net_inet6_ip6);
+SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
+    CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
+    "Deny packets with unknown IPv6 Extension Headers");
+#endif /* INET6 */
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Some macros used in the various matching options.
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ *
+ * L3HDR advances 'ip' by ip_hl 32-bit words (the arithmetic is done on
+ * a u_int32_t pointer), i.e. it skips the IPv4 header including options.
+ */
+#define        L3HDR(T, ip)    ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define        TCP(p)          ((struct tcphdr *)(p))
+#define        SCTP(p)         ((struct sctphdr *)(p))
+#define        UDP(p)          ((struct udphdr *)(p))
+#define        ICMP(p)         ((struct icmphdr *)(p))
+#define        ICMP6(p)        ((struct icmp6_hdr *)(p))
+
+/*
+ * Match the ICMP type of 'icmp' against the bitmask in cmd->d[0], where
+ * bit N set means "type N matches".
+ *
+ * The mask is a single 32-bit word, so only types 0..31 can be encoded.
+ * Larger types never match; shifting by 32 or more would be undefined
+ * behaviour and could alias a low bit on some CPUs.
+ *
+ * Returns 1 on match, 0 otherwise.
+ */
+static __inline int
+icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
+{
+       int type = icmp->icmp_type;
+
+       if (type < 0 || type > ICMP_MAXTYPE || type >= 32)
+               return 0;
+       return ((cmd->d[0] & (1U << type)) != 0);
+}
+
+/* Bitmask of the ICMP types that are queries (bit N <=> type N). */
+#define TT     ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
+    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
+
+/*
+ * Return 1 if the ICMP message is one of the query types listed in TT,
+ * 0 otherwise.
+ *
+ * TT is a 32-bit mask, so types of 32 and above are rejected before
+ * shifting; a shift count of 32 or more would be undefined behaviour.
+ */
+static int
+is_icmp_query(struct icmphdr *icmp)
+{
+       int type = icmp->icmp_type;
+
+       if (type < 0 || type > ICMP_MAXTYPE || type >= 32)
+               return 0;
+       return (((unsigned int)TT & (1U << type)) != 0);
+}
+#undef TT
+
+/*
+ * The following checks use two arrays of 8 or 16 bits to store the
+ * bits that we want set or clear, respectively. They are in the
+ * low and high half of cmd->arg1 or cmd->d[0].
+ *
+ * We scan options and store the bits we find set. We succeed if
+ *
+ *     (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
+ *
+ * The code is sometimes optimized not to store additional variables.
+ */
+
+/*
+ * Check the flag bits found in a packet against an instruction.
+ *
+ * The low byte of cmd->arg1 lists bits that must be set in 'bits', the
+ * high byte lists bits that must be clear.
+ * Returns 1 when both conditions hold, 0 otherwise.
+ */
+static int
+flags_match(ipfw_insn *cmd, u_int8_t bits)
+{
+       const u_int8_t absent = ~bits;  /* bits NOT present in the packet */
+       const u_char must_set = cmd->arg1 & 0xff;
+       const u_char must_clear = (cmd->arg1 >> 8) & 0xff;
+
+       /* some bits we want set were clear */
+       if ((must_set & absent) != 0)
+               return 0;
+       /* fails if some bits we want clear were set */
+       return ((must_clear & absent) == must_clear);
+}
+
+/*
+ * Scan the IPv4 options of 'ip', collect an IP_FW_IPOPT_* bit for each
+ * option of interest (LSRR, SSRR, RR, TS) and test the result against
+ * 'cmd' with flags_match().
+ *
+ * Returns 0 immediately if an option has an invalid or truncated
+ * length; scanning stops at the first end-of-list option.
+ */
+static int
+ipopts_match(struct ip *ip, ipfw_insn *cmd)
+{
+       u_char *optp = (u_char *)(ip + 1);
+       int remaining = (ip->ip_hl << 2) - sizeof (struct ip);
+       int found = 0;
+       int len;
+
+       while (remaining > 0) {
+               int opt = optp[IPOPT_OPTVAL];
+
+               if (opt == IPOPT_EOL)
+                       break;
+               if (opt == IPOPT_NOP) {
+                       len = 1;        /* single-byte option */
+               } else {
+                       len = optp[IPOPT_OLEN];
+                       if (len <= 0 || len > remaining)
+                               return 0; /* invalid or truncated */
+               }
+
+               /* Record the options the firewall can match on. */
+               if (opt == IPOPT_LSRR)
+                       found |= IP_FW_IPOPT_LSRR;
+               else if (opt == IPOPT_SSRR)
+                       found |= IP_FW_IPOPT_SSRR;
+               else if (opt == IPOPT_RR)
+                       found |= IP_FW_IPOPT_RR;
+               else if (opt == IPOPT_TS)
+                       found |= IP_FW_IPOPT_TS;
+
+               remaining -= len;
+               optp += len;
+       }
+       return (flags_match(cmd, found));
+}
+
+/*
+ * Scan the TCP options of 'tcp', collect an IP_FW_TCPOPT_* bit for each
+ * option of interest (MSS, window scale, SACK, timestamp) and test the
+ * result against 'cmd' with flags_match().
+ *
+ * Scanning stops at end-of-list or at the first malformed option: one
+ * with no room for its length byte, a length below 2 (kind + length),
+ * or a length that runs past the end of the header. A malformed option
+ * is not counted, and no byte beyond the TCP header is read.
+ */
+static int
+tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
+{
+       int optlen, bits = 0;
+       u_char *cp = (u_char *)(tcp + 1);
+       int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
+
+       for (; x > 0; x -= optlen, cp += optlen) {
+               int opt = cp[0];
+               if (opt == TCPOPT_EOL)
+                       break;
+               if (opt == TCPOPT_NOP)
+                       optlen = 1;
+               else {
+                       /* The length byte itself must lie inside the header. */
+                       if (x < 2)
+                               break;
+                       optlen = cp[1];
+                       /* Reject impossible or truncated option lengths. */
+                       if (optlen < 2 || optlen > x)
+                               break;
+               }
+
+               switch (opt) {
+
+               default:
+                       break;
+
+               case TCPOPT_MAXSEG:
+                       bits |= IP_FW_TCPOPT_MSS;
+                       break;
+
+               case TCPOPT_WINDOW:
+                       bits |= IP_FW_TCPOPT_WINDOW;
+                       break;
+
+               case TCPOPT_SACK_PERMITTED:
+               case TCPOPT_SACK:
+                       bits |= IP_FW_TCPOPT_SACK;
+                       break;
+
+               case TCPOPT_TIMESTAMP:
+                       bits |= IP_FW_TCPOPT_TS;
+                       break;
+
+               }
+       }
+       return (flags_match(cmd, bits));
+}
+
+static int
+iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
+{
+       if (ifp == NULL)        /* no iface with this packet, match fails */
+               return 0;
+       /* Check by name or by IP address */
+       if (cmd->name[0] != '\0') { /* match by name */
+               /* Check name */
+               if (cmd->p.glob) {
+                       if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
+                               return(1);
+               } else {
+                       if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
+                               return(1);
+               }
+       } else {
+#if !defined( __linux__ ) && !defined( _WIN32 )
+               struct ifaddr *ia;
+
+               if_addr_rlock(ifp);
+               TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
+                       if (ia->ifa_addr->sa_family != AF_INET)
+                               continue;
+                       if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
+                           (ia->ifa_addr))->sin_addr.s_addr) {
+                               if_addr_runlock(ifp);
+                               return(1);      /* match */
+                       }
+               }
+               if_addr_runlock(ifp);
+#endif
+       }
+       return(0);      /* no match, fail ... */
+}
+
+/*
+ * The verify_path function checks if a route to the src exists and
+ * if it is reachable via ifp (when provided).
+ * 
+ * The 'verrevpath' option checks that the interface that an IP packet
+ * arrives on is the same interface that traffic destined for the
+ * packet's source address would be routed out of.
+ * The 'versrcreach' option just checks that the source address is
+ * reachable via any route (except default) in the routing table.
+ * These two are a measure to block forged packets. This is also
+ * commonly known as "anti-spoofing" or Unicast Reverse Path
+ * Forwarding (Unicast RPF) in Cisco-ese. The name of the knobs
+ * is purposely reminiscent of the Cisco IOS command,
+ *
+ *   ip verify unicast reverse-path
+ *   ip verify unicast source reachable-via any
+ *
+ * which implements the same functionality. But note that the syntax
+ * is misleading, and the check may be performed on all IP packets
+ * whether unicast, multicast, or broadcast.
+ */
+/*
+ * Look up a route to 'src' in routing table 'fib' and decide whether
+ * the source address is acceptable.
+ *
+ * With ifp != NULL the route must go through ifp. With ifp == NULL any
+ * route is accepted except the default route and reject/blackhole
+ * routes. Always returns 0 on Linux/Windows builds.
+ *
+ * Returns 1 if a valid route was found, 0 otherwise.
+ */
+static int
+verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
+{
+#if defined( __linux__ ) || defined( _WIN32 )
+       return 0;
+#else
+       struct route ro;
+       struct sockaddr_in *sin;
+       int valid;
+
+       bzero(&ro, sizeof(ro));
+
+       sin = (struct sockaddr_in *)&(ro.ro_dst);
+       sin->sin_family = AF_INET;
+       sin->sin_len = sizeof(*sin);
+       sin->sin_addr = src;
+       in_rtalloc_ign(&ro, 0, fib);
+
+       if (ro.ro_rt == NULL)
+               return 0;
+
+       if (ifp != NULL) {
+               /*
+                * Compare against rt->rt_ifa->ifa_ifp rather than
+                * rt->rt_ifp so that packets injected back by if_simloop()
+                * pass: with useloopback == 1 a routing entry via lo0 may
+                * exist for our own address, so we need to handle routing
+                * asymmetry.
+                */
+               valid = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
+       } else if (satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
+               /* no ifp provided: the default route does not count */
+               valid = 0;
+       } else if (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+               /* nor does a blackhole/reject route */
+               valid = 0;
+       } else {
+               /* found valid route */
+               valid = 1;
+       }
+
+       RTFREE(ro.ro_rt);
+       return valid;
+#endif
+}
+
+#ifdef INET6
+/*
+ * ipv6 specific rules here...
+ */
+/*
+ * Test an ICMPv6 type against the bitmap in cmd->d[]: word type/32,
+ * bit type%32. Types above ICMP6_MAXTYPE never match.
+ */
+static __inline int
+icmp6type_match (int type, ipfw_insn_u32 *cmd)
+{
+       if (type > ICMP6_MAXTYPE)
+               return 0;
+       return ((cmd->d[type/32] & (1<<(type%32))) != 0);
+}
+
+/*
+ * Return 1 if curr_flow equals one of the flow ids stored in cmd->d[],
+ * 0 otherwise. Entries 0..arg1 (inclusive) are examined.
+ * NOTE(review): the inclusive upper bound looks at arg1 + 1 entries --
+ * confirm against how userland fills in arg1 and d[].
+ */
+static int
+flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
+{
+       int idx = 0;
+
+       while (idx <= cmd->o.arg1) {
+               if (curr_flow == cmd->d[idx])
+                       return 1;
+               ++idx;
+       }
+       return 0;
+}
+
+/*
+ * Support for IP6_*_ME opcodes: return 1 if ip6_addr equals (after
+ * clearing the embedded scope) an IPv6 address configured on any
+ * interface in V_ifnet, 0 otherwise.
+ */
+static int
+search_ip6_addr_net (struct in6_addr * ip6_addr)
+{
+       struct ifnet *ifp;
+       struct ifaddr *ifa;
+       struct in6_addr candidate;
+
+       TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+               if_addr_rlock(ifp);
+               TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+                       if (ifa->ifa_addr->sa_family != AF_INET6)
+                               continue;
+                       /* work on a copy so the scope_id stays in the sock_addr */
+                       candidate =
+                           ((struct in6_ifaddr *)ifa)->ia_addr.sin6_addr;
+                       in6_clearscope(&candidate);
+                       if (IN6_ARE_ADDR_EQUAL(ip6_addr, &candidate)) {
+                               if_addr_runlock(ifp);
+                               return 1;
+                       }
+               }
+               if_addr_runlock(ifp);
+       }
+       return 0;
+}
+
+/*
+ * IPv6 counterpart of verify_path(): look up a route to 'src' and
+ * decide whether the source address is acceptable.
+ *
+ * With ifp != NULL the route must go through ifp. With ifp == NULL any
+ * route is accepted except the default route and reject/blackhole
+ * routes. Returns 1 if a valid route was found, 0 otherwise.
+ */
+static int
+verify_path6(struct in6_addr *src, struct ifnet *ifp)
+{
+       struct route_in6 ro;
+       struct sockaddr_in6 *sin6;
+       int valid;
+
+       bzero(&ro, sizeof(ro));
+
+       sin6 = (struct sockaddr_in6 * )&(ro.ro_dst);
+       sin6->sin6_family = AF_INET6;
+       sin6->sin6_len = sizeof(*sin6);
+       sin6->sin6_addr = *src;
+       /* XXX MRT 0 for ipv6 at this time */
+       rtalloc_ign((struct route *)&ro, 0);
+
+       if (ro.ro_rt == NULL)
+               return 0;
+
+       if (ifp != NULL) {
+               /*
+                * Compare against rt->rt_ifa->ifa_ifp rather than
+                * rt->rt_ifp to support sending packets to an address of
+                * our own (the former is the first argument of
+                * if_simloop(), i.e. ifp; the latter is lo0).
+                */
+               valid = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
+       } else if (IN6_IS_ADDR_UNSPECIFIED(
+           &satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
+               /* no ifp provided: the default route does not count */
+               valid = 0;
+       } else if (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+               /* nor does a blackhole/reject route */
+               valid = 0;
+       } else {
+               /* found valid route */
+               valid = 1;
+       }
+
+       RTFREE(ro.ro_rt);
+       return valid;
+}
+
+/*
+ * Return 1 if icmp6_type is one of the ICMPv6 query types (echo
+ * request, membership query, who-are-you request, FQDN query, node
+ * information query), 0 otherwise.
+ */
+static int
+is_icmp6_query(int icmp6_type)
+{
+       int query;
+
+       if (icmp6_type > ICMP6_MAXTYPE)
+               return (0);
+
+       /* An || chain is used because some of these may share a value. */
+       query = (icmp6_type == ICMP6_ECHO_REQUEST ||
+           icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
+           icmp6_type == ICMP6_WRUREQUEST ||
+           icmp6_type == ICMP6_FQDN_QUERY ||
+           icmp6_type == ICMP6_NI_QUERY);
+
+       return (query ? 1 : 0);
+}
+
+static void
+send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
+{
+       struct mbuf *m;
+
+       m = args->m;
+       if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
+               struct tcphdr *tcp;
+               tcp = (struct tcphdr *)((char *)ip6 + hlen);
+
+               if ((tcp->th_flags & TH_RST) == 0) {
+                       struct mbuf *m0;
+                       m0 = ipfw_send_pkt(args->m, &(args->f_id),
+                           ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+                           tcp->th_flags | TH_RST);
+                       if (m0 != NULL)
+                               ip6_output(m0, NULL, NULL, 0, NULL, NULL,
+                                   NULL);
+               }
+               FREE_PKT(m);
+       } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
+#if 0
+               /*
+                * Unlike above, the mbufs need to line up with the ip6 hdr,
+                * as the contents are read. We need to m_adj() the
+                * needed amount.
+                * The mbuf will however be thrown away so we can adjust it.
+                * Remember we did an m_pullup on it already so we
+                * can make some assumptions about contiguousness.
+                */
+               if (args->L3offset)
+                       m_adj(m, args->L3offset);
+#endif
+               icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
+       } else
+               FREE_PKT(m);
+
+       args->m = NULL;
+}
+
+#endif /* INET6 */
+
+
+/*
+ * sends a reject message, consuming the mbuf passed as an argument.
+ *
+ * For any code other than ICMP_REJECT_RST an ICMP unreachable with that
+ * code is generated. For ICMP_REJECT_RST a TCP RST is sent in reply to
+ * TCP segments that do not themselves carry RST; the original packet is
+ * then freed. args->m is set to NULL on return.
+ * The iplen argument is not used in this function body.
+ */
+static void
+send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
+{
+
+#if 0
+       /* XXX When ip is not guaranteed to be at mtod() we will
+        * need to account for this */
+        * The mbuf will however be thrown away so we can adjust it.
+        * Remember we did an m_pullup on it already so we
+        * can make some assumptions about contiguousness.
+        */
+       if (args->L3offset)
+               m_adj(m, args->L3offset);
+#endif
+       if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
+               /* We need the IP header in host order for icmp_error(). */
+               SET_HOST_IPLEN(ip);
+               icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
+       } else if (args->f_id.proto == IPPROTO_TCP) {
+               /* Locate the TCP header right after the IPv4 header. */
+               struct tcphdr *const tcp =
+                   L3HDR(struct tcphdr, mtod(args->m, struct ip *));
+               /* Never answer a RST with another RST. */
+               if ( (tcp->th_flags & TH_RST) == 0) {
+                       struct mbuf *m;
+                       m = ipfw_send_pkt(args->m, &(args->f_id),
+                               ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+                               tcp->th_flags | TH_RST);
+                       if (m != NULL)
+                               ip_output(m, NULL, NULL, 0, NULL, NULL);
+               }
+               FREE_PKT(args->m);
+       } else
+               FREE_PKT(args->m);      /* RST requested for non-TCP: just drop */
+       args->m = NULL;
+}
+
+/*
+ * Support for uid/gid/jail lookup. These tests are expensive
+ * (because we may need to look into the list of active sockets)
+ * so we cache the results. ugid_lookupp is 0 if we have not
+ * yet done a lookup, 1 if we succeeded, and -1 if we tried
+ * and failed. The function always returns the match value.
+ * We could actually spare the variable and use *uc, setting
+ * it to (void *)check_uidgid if we have no info, NULL if
+ * we tried and failed, or any other value if successful.
+ *
+ * On Linux the work is delegated to cred_check(). Elsewhere the
+ * credentials are taken from the supplied PCB 'inp' if present, or
+ * from a PCB hash lookup on the address/port tuple (TCP and UDP only),
+ * and then compared according to the opcode (O_UID, O_GID, O_JAIL).
+ * A reference on the credentials is taken with crhold() and stored in
+ * *uc; NOTE(review): it is presumably released by the caller -- confirm.
+ */
+static int
+check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct ucred **uc, int *ugid_lookupp,
+    struct inpcb *inp)
+{
+#ifdef __linux__
+       /* On Linux 'inp' actually carries an mbuf wrapping the skb. */
+       return cred_check(insn, proto, oif,
+       dst_ip, dst_port, src_ip, src_port,
+       (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else  /* FreeBSD */
+       struct inpcbinfo *pi;
+       int wildcard;
+       struct inpcb *pcb;
+       int match;
+
+       /*
+        * Check to see if the UDP or TCP stack supplied us with
+        * the PCB. If so, rather than holding a lock and looking
+        * up the PCB, we can use the one that was supplied.
+        */
+       if (inp && *ugid_lookupp == 0) {
+               INP_LOCK_ASSERT(inp);
+               if (inp->inp_socket != NULL) {
+                       *uc = crhold(inp->inp_cred);
+                       *ugid_lookupp = 1;
+               } else
+                       *ugid_lookupp = -1;
+       }
+       /*
+        * If we have already been here and the packet has no
+        * PCB entry associated with it, then we can safely
+        * assume that this is a no match.
+        */
+       if (*ugid_lookupp == -1)
+               return (0);
+       /* Only TCP and UDP have PCB tables to search. */
+       if (proto == IPPROTO_TCP) {
+               wildcard = 0;
+               pi = &V_tcbinfo;
+       } else if (proto == IPPROTO_UDP) {
+               wildcard = INPLOOKUP_WILDCARD;
+               pi = &V_udbinfo;
+       } else
+               return 0;
+       match = 0;
+       if (*ugid_lookupp == 0) {
+               INP_INFO_RLOCK(pi);
+               /*
+                * The address/port pairs are passed in opposite order
+                * depending on whether an output interface is known.
+                */
+               pcb =  (oif) ?
+                       in_pcblookup_hash(pi,
+                               dst_ip, htons(dst_port),
+                               src_ip, htons(src_port),
+                               wildcard, oif) :
+                       in_pcblookup_hash(pi,
+                               src_ip, htons(src_port),
+                               dst_ip, htons(dst_port),
+                               wildcard, NULL);
+               if (pcb != NULL) {
+                       *uc = crhold(pcb->inp_cred);
+                       *ugid_lookupp = 1;
+               }
+               INP_INFO_RUNLOCK(pi);
+               if (*ugid_lookupp == 0) {
+                       /*
+                        * We tried and failed, set the variable to -1
+                        * so we will not try again on this packet.
+                        */
+                       *ugid_lookupp = -1;
+                       return (0);
+               }
+       }
+       /* Credentials are available in *uc: apply the requested test. */
+       if (insn->o.opcode == O_UID)
+               match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
+       else if (insn->o.opcode == O_GID)
+               match = groupmember((gid_t)insn->d[0], *uc);
+       else if (insn->o.opcode == O_JAIL)
+               match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
+       return match;
+#endif
+}
+
+/*
+ * Helper function to set args with info on the rule after the matching
+ * one. slot is precise, whereas we guess rule_id as they are
+ * assigned sequentially.
+ */
+static inline void
+set_match(struct ip_fw_args *args, int slot,
+       struct ip_fw_chain *chain)
+{
+       /* The rule stored at the given position of the chain's map. */
+       struct ip_fw *matched = chain->map[slot];
+
+       args->rule.rulenum = matched->rulenum;
+       args->rule.rule_id = matched->id + 1;
+       /* Slots are stored off by one because 0 is used as a marker. */
+       args->rule.slot = slot + 1;
+       args->rule.chain_id = chain->id;
+}
+
+/*
+ * The main check routine for the firewall.
+ *
+ * All arguments are in args so we can modify them and return them
+ * back to the caller.
+ *
+ * Parameters:
+ *
+ *     args->m (in/out) The packet; we set to NULL when/if we nuke it.
+ *             Starts with the IP header.
+ *     args->eh (in)   Mac header if present, NULL for layer3 packet.
+ *     args->L3offset  Number of bytes bypassed if we came from L2.
+ *                     e.g. often sizeof(eh)  ** NOTYET **
+ *     args->oif       Outgoing interface, NULL if packet is incoming.
+ *             The incoming interface is in the mbuf. (in)
+ *     args->divert_rule (in/out)
+ *             Skip up to the first rule past this rule number;
+ *             upon return, non-zero port number for divert or tee.
+ *
+ *     args->rule      Pointer to the last matching rule (in/out)
+ *     args->next_hop  Socket we are forwarding to (out).
+ *     args->f_id      Addresses grabbed from the packet (out)
+ *     args->rule.info a cookie depending on rule action
+ *
+ * Return value:
+ *
+ *     IP_FW_PASS      the packet must be accepted
+ *     IP_FW_DENY      the packet must be dropped
+ *     IP_FW_DIVERT    divert packet, port in m_tag
+ *     IP_FW_TEE       tee packet, port in m_tag
+ *     IP_FW_DUMMYNET  to dummynet, pipe in args->cookie
+ *     IP_FW_NETGRAPH  into netgraph, cookie args->cookie
+ *             args->rule contains the matching rule,
+ *             args->rule.info has additional information.
+ *
+ */
+int
+ipfw_chk(struct ip_fw_args *args)
+{
+
+       /*
+        * Local variables holding state while processing a packet:
+        *
+        * IMPORTANT NOTE: to speed up the processing of rules, there
+        * are some assumption on the values of the variables, which
+        * are documented here. Should you change them, please check
+        * the implementation of the various instructions to make sure
+        * that they still work.
+        *
+        * args->eh     The MAC header. It is non-null for a layer2
+        *      packet, it is NULL for a layer-3 packet.
+        * **notyet**
+        * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
+        *
+        * m | args->m  Pointer to the mbuf, as received from the caller.
+        *      It may change if ipfw_chk() does an m_pullup, or if it
+        *      consumes the packet because it calls send_reject().
+        *      XXX This has to change, so that ipfw_chk() never modifies
+        *      or consumes the buffer.
+        * ip   is the beginning of the ip(4 or 6) header.
+        *      Calculated by adding the L3offset to the start of data.
+        *      (Until we start using L3offset, the packet is
+        *      supposed to start with the ip header).
+        */
+       struct mbuf *m = args->m;
+       struct ip *ip = mtod(m, struct ip *);
+
+       /*
+        * For rules which contain uid/gid or jail constraints, cache
+        * a copy of the users credentials after the pcb lookup has been
+        * executed. This will speed up the processing of rules with
+        * these types of constraints, as well as decrease contention
+        * on pcb related locks.
+        */
+#ifdef __linux__
+       struct bsd_ucred ucred_cache;
+#else
+       struct ucred *ucred_cache = NULL;
+#endif
+       int ucred_lookup = 0;
+
+       /*
+        * oif | args->oif      If NULL, ipfw_chk has been called on the
+        *      inbound path (ether_input, ip_input).
+        *      If non-NULL, ipfw_chk has been called on the outbound path
+        *      (ether_output, ip_output).
+        */
+       struct ifnet *oif = args->oif;
+
+       int f_pos = 0;          /* index of current rule in the array */
+       int retval = 0;
+
+       /*
+        * hlen The length of the IP header.
+        */
+       u_int hlen = 0;         /* hlen >0 means we have an IP pkt */
+
+       /*
+        * offset       The offset of a fragment. offset != 0 means that
+        *      we have a fragment at this offset of an IPv4 packet.
+        *      offset == 0 means that (if this is an IPv4 packet)
+        *      this is the first or only fragment.
+        *      For IPv6 offset == 0 means there is no Fragment Header. 
+        *      If offset != 0 for IPv6 always use correct mask to
+        *      get the correct offset because we add IP6F_MORE_FRAG
+        *      to be able to detect the first fragment which would
+        *      otherwise have offset = 0.
+        */
+       u_short offset = 0;
+
+       /*
+        * Local copies of addresses. They are only valid if we have
+        * an IP packet.
+        *
+        * proto        The protocol. Set to 0 for non-ip packets,
+        *      or to the protocol read from the packet otherwise.
+        *      proto != 0 means that we have an IPv4 packet.
+        *
+        * src_port, dst_port   port numbers, in HOST format. Only
+        *      valid for TCP and UDP packets.
+        *
+        * src_ip, dst_ip       ip addresses, in NETWORK format.
+        *      Only valid for IPv4 packets.
+        */
+       uint8_t proto;
+       uint16_t src_port = 0, dst_port = 0;    /* NOTE: host format    */
+       struct in_addr src_ip, dst_ip;          /* NOTE: network format */
+       uint16_t iplen=0;
+       int pktlen;
+       uint16_t        etype = 0;      /* Host order stored ether type */
+
+       /*
+        * dyn_dir = MATCH_UNKNOWN when rules unchecked,
+        *      MATCH_NONE when checked and not matched (q = NULL),
+        *      MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
+        */
+       int dyn_dir = MATCH_UNKNOWN;
+       ipfw_dyn_rule *q = NULL;
+       struct ip_fw_chain *chain = &V_layer3_chain;
+
+       /*
+        * We store in ulp a pointer to the upper layer protocol header.
+        * In the ipv4 case this is easy to determine from the header,
+        * but for ipv6 we might have some additional headers in the middle.
+        * ulp is NULL if not found.
+        */
+       void *ulp = NULL;               /* upper layer protocol pointer. */
+       /* XXX ipv6 variables */
+       int is_ipv6 = 0;
+       u_int16_t ext_hd = 0;   /* bits vector for extension header filtering */
+       /* end of ipv6 variables */
+       int is_ipv4 = 0;
+
+       int done = 0;           /* flag to exit the outer loop */
+
+       if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
+               return (IP_FW_PASS);    /* accept */
+
+       dst_ip.s_addr = 0;              /* make sure it is initialized */
+       src_ip.s_addr = 0;              /* make sure it is initialized */
+       pktlen = m->m_pkthdr.len;
+       args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */
+       proto = args->f_id.proto = 0;   /* mark f_id invalid */
+               /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
+
+/*
+ * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
+ * then it sets p to point at the offset "len" in the mbuf. WARNING: the
+ * pointer might become stale after other pullups (but we never use it
+ * this way).
+ */
+#define PULLUP_TO(_len, p, T)                                  \
+do {                                                           \
+       int x = (_len) + sizeof(T);                             \
+       if ((m)->m_len < x) {                                   \
+               args->m = m = m_pullup(m, x);                   \
+               if (m == NULL)                                  \
+                       goto pullup_failed;                     \
+       }                                                       \
+       p = (mtod(m, char *) + (_len));                         \
+} while (0)
+
+       /*
+        * if we have an ether header,
+        */
+       if (args->eh)
+               etype = ntohs(args->eh->ether_type);
+
+       /* Identify IP packets and fill up variables. */
+       if (pktlen >= sizeof(struct ip6_hdr) &&
+           (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
+               struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
+               is_ipv6 = 1;
+               args->f_id.addr_type = 6;
+               hlen = sizeof(struct ip6_hdr);
+               proto = ip6->ip6_nxt;
+
+               /* Search extension headers to find upper layer protocols */
+               while (ulp == NULL) {
+                       switch (proto) {
+                       case IPPROTO_ICMPV6:
+                               PULLUP_TO(hlen, ulp, struct icmp6_hdr);
+                               args->f_id.flags = ICMP6(ulp)->icmp6_type;
+                               break;
+
+                       case IPPROTO_TCP:
+                               PULLUP_TO(hlen, ulp, struct tcphdr);
+                               dst_port = TCP(ulp)->th_dport;
+                               src_port = TCP(ulp)->th_sport;
+                               args->f_id.flags = TCP(ulp)->th_flags;
+                               break;
+
+                       case IPPROTO_SCTP:
+                               PULLUP_TO(hlen, ulp, struct sctphdr);
+                               src_port = SCTP(ulp)->src_port;
+                               dst_port = SCTP(ulp)->dest_port;
+                               break;
+
+                       case IPPROTO_UDP:
+                               PULLUP_TO(hlen, ulp, struct udphdr);
+                               dst_port = UDP(ulp)->uh_dport;
+                               src_port = UDP(ulp)->uh_sport;
+                               break;
+
+                       case IPPROTO_HOPOPTS:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                               ext_hd |= EXT_HOPOPTS;
+                               hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                               proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_ROUTING:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+                               switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
+                               case 0:
+                                       ext_hd |= EXT_RTHDR0;
+                                       break;
+                               case 2:
+                                       ext_hd |= EXT_RTHDR2;
+                                       break;
+                               default:
+                                       printf("IPFW2: IPV6 - Unknown Routing "
+                                           "Header type(%d)\n",
+                                           ((struct ip6_rthdr *)ulp)->ip6r_type);
+                                       if (V_fw_deny_unknown_exthdrs)
+                                           return (IP_FW_DENY);
+                                       break;
+                               }
+                               ext_hd |= EXT_ROUTING;
+                               hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+                               proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_FRAGMENT:  /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_frag);
+                               ext_hd |= EXT_FRAGMENT;
+                               hlen += sizeof (struct ip6_frag);
+                               proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+                               offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+                                       IP6F_OFF_MASK;
+                               /* Add IP6F_MORE_FRAG for offset of first
+                                * fragment to be != 0. */
+                               offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
+                                       IP6F_MORE_FRAG;
+                               if (offset == 0) {
+                                       printf("IPFW2: IPV6 - Invalid Fragment "
+                                           "Header\n");
+                                       if (V_fw_deny_unknown_exthdrs)
+                                           return (IP_FW_DENY);
+                                       break;
+                               }
+                               args->f_id.frag_id6 =
+                                   ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_DSTOPTS:   /* RFC 2460 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                               ext_hd |= EXT_DSTOPTS;
+                               hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                               proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_AH:        /* RFC 2402 */
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               ext_hd |= EXT_AH;
+                               hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+                               proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+                               ulp = NULL;
+                               break;
+
+                       case IPPROTO_ESP:       /* RFC 2406 */
+                               PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */
+                               /* Anything past Seq# is variable length and
+                                * data past this ext. header is encrypted. */
+                               ext_hd |= EXT_ESP;
+                               break;
+
+                       case IPPROTO_NONE:      /* RFC 2460 */
+                               /*
+                                * Packet ends here, and IPv6 header has
+                                * already been pulled up. If ip6e_len!=0
+                                * then octets must be ignored.
+                                */
+                               ulp = ip; /* non-NULL to get out of loop. */
+                               break;
+
+                       case IPPROTO_OSPFIGP:
+                               /* XXX OSPF header check? */
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               break;
+
+                       case IPPROTO_PIM:
+                               /* XXX PIM header check? */
+                               PULLUP_TO(hlen, ulp, struct pim);
+                               break;
+
+                       case IPPROTO_CARP:
+                               PULLUP_TO(hlen, ulp, struct carp_header);
+                               if (((struct carp_header *)ulp)->carp_version !=
+                                   CARP_VERSION) 
+                                       return (IP_FW_DENY);
+                               if (((struct carp_header *)ulp)->carp_type !=
+                                   CARP_ADVERTISEMENT) 
+                                       return (IP_FW_DENY);
+                               break;
+
+                       case IPPROTO_IPV6:      /* RFC 2893 */
+                               PULLUP_TO(hlen, ulp, struct ip6_hdr);
+                               break;
+
+                       case IPPROTO_IPV4:      /* RFC 2893 */
+                               PULLUP_TO(hlen, ulp, struct ip);
+                               break;
+
+                       default:
+                               printf("IPFW2: IPV6 - Unknown Extension "
+                                   "Header(%d), ext_hd=%x\n", proto, ext_hd);
+                               if (V_fw_deny_unknown_exthdrs)
+                                   return (IP_FW_DENY);
+                               PULLUP_TO(hlen, ulp, struct ip6_ext);
+                               break;
+                       } /*switch */
+               }
+               ip = mtod(m, struct ip *);
+               ip6 = (struct ip6_hdr *)ip;
+               args->f_id.src_ip6 = ip6->ip6_src;
+               args->f_id.dst_ip6 = ip6->ip6_dst;
+               args->f_id.src_ip = 0;
+               args->f_id.dst_ip = 0;
+               args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
+       } else if (pktlen >= sizeof(struct ip) &&
+           (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
+               is_ipv4 = 1;
+               hlen = ip->ip_hl << 2;
+               args->f_id.addr_type = 4;
+
+               /*
+                * Collect parameters into local variables for faster matching.
+                */
+               proto = ip->ip_p;
+               src_ip = ip->ip_src;
+               dst_ip = ip->ip_dst;
+               offset = ntohs(ip->ip_off) & IP_OFFMASK;
+               iplen = ntohs(ip->ip_len);
+               pktlen = iplen < pktlen ? iplen : pktlen;
+
+               if (offset == 0) {
+                       switch (proto) {
+                       case IPPROTO_TCP:
+                               PULLUP_TO(hlen, ulp, struct tcphdr);
+                               dst_port = TCP(ulp)->th_dport;
+                               src_port = TCP(ulp)->th_sport;
+                               args->f_id.flags = TCP(ulp)->th_flags;
+                               break;
+
+                       case IPPROTO_UDP:
+                               PULLUP_TO(hlen, ulp, struct udphdr);
+                               dst_port = UDP(ulp)->uh_dport;
+                               src_port = UDP(ulp)->uh_sport;
+                               break;
+
+                       case IPPROTO_ICMP:
+                               PULLUP_TO(hlen, ulp, struct icmphdr);
+                               args->f_id.flags = ICMP(ulp)->icmp_type;
+                               break;
+
+                       default:
+                               break;
+                       }
+               }
+
+               ip = mtod(m, struct ip *);
+               args->f_id.src_ip = ntohl(src_ip.s_addr);
+               args->f_id.dst_ip = ntohl(dst_ip.s_addr);
+       }
+#undef PULLUP_TO
+       if (proto) { /* we may have port numbers, store them */
+               args->f_id.proto = proto;
+               args->f_id.src_port = src_port = ntohs(src_port);
+               args->f_id.dst_port = dst_port = ntohs(dst_port);
+       }
+
+       IPFW_RLOCK(chain);
+       if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
+               IPFW_RUNLOCK(chain);
+               return (IP_FW_PASS);    /* accept */
+       }
+       if (args->rule.slot) {
+               /*
+                * Packet has already been tagged as a result of a previous
+                * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
+                * REASS, NETGRAPH, DIVERT/TEE...)
+                * Validate the slot and continue from the next one
+                * if still present, otherwise do a lookup.
+                */
+               f_pos = (args->rule.chain_id == chain->id) ?
+                   args->rule.slot :
+                   ipfw_find_rule(chain, args->rule.rulenum,
+                       args->rule.rule_id);
+       } else {
+               f_pos = 0;
+       }
+
+       /*
+        * Now scan the rules, and parse microinstructions for each rule.
+        * We have two nested loops and an inner switch. Sometimes we
+        * need to break out of one or both loops, or re-enter one of
+        * the loops with updated variables. Loop variables are:
+        *
+        *      f_pos (outer loop) points to the current rule.
+        *              On output it points to the matching rule.
+        *      done (outer loop) is used as a flag to break the loop.
+        *      l (inner loop)  residual length of current rule.
+        *              cmd points to the current microinstruction.
+        *
+        * We break the inner loop by setting l=0 and possibly
+        * cmdlen=0 if we don't want to advance cmd.
+        * We break the outer loop by setting done=1
+        * We can restart the inner loop by setting l>0 and f_pos, f, cmd
+        * as needed.
+        */
+       for (; f_pos < chain->n_rules; f_pos++) {
+               ipfw_insn *cmd;
+               uint32_t tablearg = 0;
+               int l, cmdlen, skip_or; /* skip rest of OR block */
+               struct ip_fw *f;
+
+               f = chain->map[f_pos];
+               if (V_set_disable & (1 << f->set) )
+                       continue;
+
+               skip_or = 0;
+               for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
+                   l -= cmdlen, cmd += cmdlen) {
+                       int match;
+
+                       /*
+                        * check_body is a jump target used when we find a
+                        * CHECK_STATE, and need to jump to the body of
+                        * the target rule.
+                        */
+
+/* check_body: */
+                       cmdlen = F_LEN(cmd);
+                       /*
+                        * An OR block (insn_1 || .. || insn_n) has the
+                        * F_OR bit set in all but the last instruction.
+                        * The first match will set "skip_or", and cause
+                        * the following instructions to be skipped until
+                        * past the one with the F_OR bit clear.
+                        */
+                       if (skip_or) {          /* skip this instruction */
+                               if ((cmd->len & F_OR) == 0)
+                                       skip_or = 0;    /* next one is good */
+                               continue;
+                       }
+                       match = 0; /* set to 1 if we succeed */
+
+                       switch (cmd->opcode) {
+                       /*
+                        * The first set of opcodes compares the packet's
+                        * fields with some pattern, setting 'match' if a
+                        * match is found. At the end of the loop there is
+                        * logic to deal with F_NOT and F_OR flags associated
+                        * with the opcode.
+                        */
+                       case O_NOP:
+                               match = 1;
+                               break;
+
+                       case O_FORWARD_MAC:
+                               printf("ipfw: opcode %d unimplemented\n",
+                                   cmd->opcode);
+                               break;
+
+                       case O_GID:
+                       case O_UID:
+                       case O_JAIL:
+                               /*
+                                * We only check offset == 0 && proto != 0,
+                                * as this ensures that we have a
+                                * packet with the ports info.
+                                */
+                               if (offset!=0)
+                                       break;
+                               if (is_ipv6) /* XXX to be fixed later */
+                                       break;
+                               if (proto == IPPROTO_TCP ||
+                                   proto == IPPROTO_UDP)
+                                       match = check_uidgid(
+                                                   (ipfw_insn_u32 *)cmd,
+                                                   proto, oif,
+                                                   dst_ip, dst_port,
+                                                   src_ip, src_port, (void *)&ucred_cache,
+                                                   &ucred_lookup, (struct inpcb *)args->m);
+                               break;
+
+                       case O_RECV:
+                               match = iface_match(m->m_pkthdr.rcvif,
+                                   (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_XMIT:
+                               match = iface_match(oif, (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_VIA:
+                               match = iface_match(oif ? oif :
+                                   m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
+                               break;
+
+                       case O_MACADDR2:
+                               if (args->eh != NULL) { /* have MAC header */
+                                       u_int32_t *want = (u_int32_t *)
+                                               ((ipfw_insn_mac *)cmd)->addr;
+                                       u_int32_t *mask = (u_int32_t *)
+                                               ((ipfw_insn_mac *)cmd)->mask;
+                                       u_int32_t *hdr = (u_int32_t *)args->eh;
+
+                                       match =
+                                           ( want[0] == (hdr[0] & mask[0]) &&
+                                             want[1] == (hdr[1] & mask[1]) &&
+                                             want[2] == (hdr[2] & mask[2]) );
+                               }
+                               break;
+
+                       case O_MAC_TYPE:
+                               if (args->eh != NULL) {
+                                       u_int16_t *p =
+                                           ((ipfw_insn_u16 *)cmd)->ports;
+                                       int i;
+
+                                       for (i = cmdlen - 1; !match && i>0;
+                                           i--, p += 2)
+                                               match = (etype >= p[0] &&
+                                                   etype <= p[1]);
+                               }
+                               break;
+
+                       case O_FRAG:
+                               match = (offset != 0);
+                               break;
+
+                       case O_IN:      /* "out" is "not in" */
+                               match = (oif == NULL);
+                               break;
+
+                       case O_LAYER2:
+                               match = (args->eh != NULL);
+                               break;
+
+                       case O_DIVERTED:
+                           {
+                               /* For diverted packets, args->rule.info
+                                * contains the divert port (in host format)
+                                * reason and direction.
+                                */
+                               uint32_t i = args->rule.info;
+                               match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
+                                   cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
+                           }
+                               break;
+
+                       case O_PROTO:
+                               /*
+                                * We do not allow an arg of 0 so the
+                                * check of "proto" only suffices.
+                                */
+                               match = (proto == cmd->arg1);
+                               break;
+
+                       case O_IP_SRC:
+                               match = is_ipv4 &&
+                                   (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+                                   src_ip.s_addr);
+                               break;
+
+                       case O_IP_SRC_LOOKUP:
+                       case O_IP_DST_LOOKUP:
+                               if (is_ipv4) {
+                                   uint32_t key =
+                                       (cmd->opcode == O_IP_DST_LOOKUP) ?
+                                           dst_ip.s_addr : src_ip.s_addr;
+                                   uint32_t v = 0;
+
+                                   if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
+                                       /* generic lookup. The key must be
+                                        * in 32bit big-endian format.
+                                        */
+                                       v = ((ipfw_insn_u32 *)cmd)->d[1];
+                                       if (v == 0)
+                                           key = dst_ip.s_addr;
+                                       else if (v == 1)
+                                           key = src_ip.s_addr;
+                                       else if (offset != 0)
+                                           break;
+                                       else if (proto != IPPROTO_TCP &&
+                                               proto != IPPROTO_UDP)
+                                           break;
+                                       else if (v == 2)
+                                           key = htonl(dst_port);
+                                       else if (v == 3)
+                                           key = htonl(src_port);
+                                       else if (v == 4 || v == 5) {
+                                           check_uidgid(
+                                               (ipfw_insn_u32 *)cmd,
+                                               proto, oif,
+                                               dst_ip, dst_port,
+                                               src_ip, src_port, (void *)&ucred_cache,
+                                               &ucred_lookup, (struct inpcb *)args->m);
+#ifdef __linux__
+                                           if (v ==4 /* O_UID */)
+                                               key = ucred_cache.uid;
+                                           else if (v == 5 /* O_JAIL */)
+                                               key = ucred_cache.xid;
+#else
+                                           if (v == 4 /* O_UID */)
+                                               key = ucred_cache->cr_uid;
+                                           else if (v == 5 /* O_JAIL */)
+                                               key = ucred_cache->cr_prison->pr_id;
+#endif
+                                           key = htonl(key);
+                                       } else
+                                           break;
+                                   }
+                                   match = ipfw_lookup_table(chain,
+                                       cmd->arg1, key, &v);
+                                   if (!match)
+                                       break;
+                                   if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+                                       match =
+                                           ((ipfw_insn_u32 *)cmd)->d[0] == v;
+                                   else
+                                       tablearg = v;
+                               }
+                               break;
+
+                       case O_IP_SRC_MASK:
+                       case O_IP_DST_MASK:
+                               if (is_ipv4) {
+                                   uint32_t a =
+                                       (cmd->opcode == O_IP_DST_MASK) ?
+                                           dst_ip.s_addr : src_ip.s_addr;
+                                   uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
+                                   int i = cmdlen-1;
+
+                                   for (; !match && i>0; i-= 2, p+= 2)
+                                       match = (p[0] == (a & p[1]));
+                               }
+                               break;
+
+                       case O_IP_SRC_ME:
+                               if (is_ipv4) {
+                                       struct ifnet *tif;
+
+                                       INADDR_TO_IFP(src_ip, tif);
+                                       match = (tif != NULL);
+                               }
+                               break;
+
+                       case O_IP_DST_SET:
+                       case O_IP_SRC_SET:
+                               if (is_ipv4) {
+                                       u_int32_t *d = (u_int32_t *)(cmd+1);
+                                       u_int32_t addr =
+                                           cmd->opcode == O_IP_DST_SET ?
+                                               args->f_id.dst_ip :
+                                               args->f_id.src_ip;
+
+                                           if (addr < d[0])
+                                                   break;
+                                           addr -= d[0]; /* subtract base */
+                                           match = (addr < cmd->arg1) &&
+                                               ( d[ 1 + (addr>>5)] &
+                                                 (1<<(addr & 0x1f)) );
+                               }
+                               break;
+
+                       case O_IP_DST:
+                               match = is_ipv4 &&
+                                   (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+                                   dst_ip.s_addr);
+                               break;
+
+                       case O_IP_DST_ME:
+                               if (is_ipv4) {
+                                       struct ifnet *tif;
+
+                                       INADDR_TO_IFP(dst_ip, tif);
+                                       match = (tif != NULL);
+                               }
+                               break;
+
+                       case O_IP_SRCPORT:
+                       case O_IP_DSTPORT:
+                               /*
+                                * offset == 0 with proto TCP or UDP is
+                                * enough to guarantee that we have a
+                                * packet with port info.
+                                */
+                               if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
+                                   && offset == 0) {
+                                       u_int16_t x =
+                                           (cmd->opcode == O_IP_SRCPORT) ?
+                                               src_port : dst_port ;
+                                       u_int16_t *p =
+                                           ((ipfw_insn_u16 *)cmd)->ports;
+                                       int i;
+
+                                       for (i = cmdlen - 1; !match && i>0;
+                                           i--, p += 2)
+                                               match = (x>=p[0] && x<=p[1]);
+                               }
+                               break;
+
+                       case O_ICMPTYPE:
+                               match = (offset == 0 && proto==IPPROTO_ICMP &&
+                                   icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
+                               break;
+
+#ifdef INET6
+                       case O_ICMP6TYPE:
+                               match = is_ipv6 && offset == 0 &&
+                                   proto==IPPROTO_ICMPV6 &&
+                                   icmp6type_match(
+                                       ICMP6(ulp)->icmp6_type,
+                                       (ipfw_insn_u32 *)cmd);
+                               break;
+#endif /* INET6 */
+
+                       case O_IPOPT:
+                               match = (is_ipv4 &&
+                                   ipopts_match(ip, cmd) );
+                               break;
+
+                       case O_IPVER:
+                               match = (is_ipv4 &&
+                                   cmd->arg1 == ip->ip_v);
+                               break;
+
+                       case O_IPID:
+                       case O_IPLEN:
+                       case O_IPTTL:
+                               if (is_ipv4) {  /* only for IP packets */
+                                   uint16_t x;
+                                   uint16_t *p;
+                                   int i;
+
+                                   if (cmd->opcode == O_IPLEN)
+                                       x = iplen;
+                                   else if (cmd->opcode == O_IPTTL)
+                                       x = ip->ip_ttl;
+                                   else /* must be IPID */
+                                       x = ntohs(ip->ip_id);
+                                   if (cmdlen == 1) {
+                                       match = (cmd->arg1 == x);
+                                       break;
+                                   }
+                                   /* otherwise we have ranges */
+                                   p = ((ipfw_insn_u16 *)cmd)->ports;
+                                   i = cmdlen - 1;
+                                   for (; !match && i>0; i--, p += 2)
+                                       match = (x >= p[0] && x <= p[1]);
+                               }
+                               break;
+
+                       case O_IPPRECEDENCE:
+                               match = (is_ipv4 &&
+                                   (cmd->arg1 == (ip->ip_tos & 0xe0)) );
+                               break;
+
+                       case O_IPTOS:
+                               match = (is_ipv4 &&
+                                   flags_match(cmd, ip->ip_tos));
+                               break;
+
+                       case O_TCPDATALEN:
+                               if (proto == IPPROTO_TCP && offset == 0) {
+                                   struct tcphdr *tcp;
+                                   uint16_t x;
+                                   uint16_t *p;
+                                   int i;
+
+                                   tcp = TCP(ulp);
+                                   x = iplen -
+                                       ((ip->ip_hl + tcp->th_off) << 2);
+                                   if (cmdlen == 1) {
+                                       match = (cmd->arg1 == x);
+                                       break;
+                                   }
+                                   /* otherwise we have ranges */
+                                   p = ((ipfw_insn_u16 *)cmd)->ports;
+                                   i = cmdlen - 1;
+                                   for (; !match && i>0; i--, p += 2)
+                                       match = (x >= p[0] && x <= p[1]);
+                               }
+                               break;
+
+                       case O_TCPFLAGS:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   flags_match(cmd, TCP(ulp)->th_flags));
+                               break;
+
+                       case O_TCPOPTS:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   tcpopts_match(TCP(ulp), cmd));
+                               break;
+
+                       case O_TCPSEQ:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   ((ipfw_insn_u32 *)cmd)->d[0] ==
+                                       TCP(ulp)->th_seq);
+                               break;
+
+                       case O_TCPACK:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   ((ipfw_insn_u32 *)cmd)->d[0] ==
+                                       TCP(ulp)->th_ack);
+                               break;
+
+                       case O_TCPWIN:
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   cmd->arg1 == TCP(ulp)->th_win);
+                               break;
+
+                       case O_ESTAB:
+                               /* reject packets which have SYN only */
+                               /* XXX should i also check for TH_ACK ? */
+                               match = (proto == IPPROTO_TCP && offset == 0 &&
+                                   (TCP(ulp)->th_flags &
+                                    (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
+                               break;
+
+                       case O_ALTQ: {
+                               struct pf_mtag *at;
+                               ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+                               match = 1;
+                               at = pf_find_mtag(m);
+                               if (at != NULL && at->qid != 0)
+                                       break;
+                               at = pf_get_mtag(m);
+                               if (at == NULL) {
+                                       /*
+                                        * Let the packet fall back to the
+                                        * default ALTQ.
+                                        */
+                                       break;
+                               }
+                               at->qid = altq->qid;
+                               if (is_ipv4)
+                                       at->af = AF_INET;
+                               else
+                                       at->af = AF_LINK;
+                               at->hdr = ip;
+                               break;
+                       }
+
+                       case O_LOG:
+                                       ipfw_log(f, hlen, args, m,
+                                           oif, offset, tablearg, ip);
+                               match = 1;
+                               break;
+
+                       case O_PROB:
+                               match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
+                               break;
+
+                       case O_VERREVPATH:
+                               /* Outgoing packets automatically pass/match */
+                               match = ((oif != NULL) ||
+                                   (m->m_pkthdr.rcvif == NULL) ||
+                                   (
+#ifdef INET6
+                                   is_ipv6 ?
+                                       verify_path6(&(args->f_id.src_ip6),
+                                           m->m_pkthdr.rcvif) :
+#endif
+                                   verify_path(src_ip, m->m_pkthdr.rcvif,
+                                       args->f_id.fib)));
+                               break;
+
+                       case O_VERSRCREACH:
+                               /* Outgoing packets automatically pass/match */
+                               match = (hlen > 0 && ((oif != NULL) ||
+#ifdef INET6
+                                   is_ipv6 ?
+                                       verify_path6(&(args->f_id.src_ip6),
+                                           NULL) :
+#endif
+                                   verify_path(src_ip, NULL, args->f_id.fib)));
+                               break;
+
+                       case O_ANTISPOOF:
+                               /* Outgoing packets automatically pass/match */
+                               if (oif == NULL && hlen > 0 &&
+                                   (  (is_ipv4 && in_localaddr(src_ip))
+#ifdef INET6
+                                   || (is_ipv6 &&
+                                       in6_localaddr(&(args->f_id.src_ip6)))
+#endif
+                                   ))
+                                       match =
+#ifdef INET6
+                                           is_ipv6 ? verify_path6(
+                                               &(args->f_id.src_ip6),
+                                               m->m_pkthdr.rcvif) :
+#endif
+                                           verify_path(src_ip,
+                                               m->m_pkthdr.rcvif,
+                                               args->f_id.fib);
+                               else
+                                       match = 1;
+                               break;
+
+                       case O_IPSEC:
+#ifdef IPSEC
+                               match = (m_tag_find(m,
+                                   PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
+#endif
+                               /* otherwise no match */
+                               break;
+
+#ifdef INET6
+                       case O_IP6_SRC:
+                               match = is_ipv6 &&
+                                   IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
+                                   &((ipfw_insn_ip6 *)cmd)->addr6);
+                               break;
+
+                       case O_IP6_DST:
+                               match = is_ipv6 &&
+                               IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
+                                   &((ipfw_insn_ip6 *)cmd)->addr6);
+                               break;
+                       case O_IP6_SRC_MASK:
+                       case O_IP6_DST_MASK:
+                               if (is_ipv6) {
+                                       int i = cmdlen - 1;
+                                       struct in6_addr p;
+                                       struct in6_addr *d =
+                                           &((ipfw_insn_ip6 *)cmd)->addr6;
+
+                                       for (; !match && i > 0; d += 2,
+                                           i -= F_INSN_SIZE(struct in6_addr)
+                                           * 2) {
+                                               p = (cmd->opcode ==
+                                                   O_IP6_SRC_MASK) ?
+                                                   args->f_id.src_ip6:
+                                                   args->f_id.dst_ip6;
+                                               APPLY_MASK(&p, &d[1]);
+                                               match =
+                                                   IN6_ARE_ADDR_EQUAL(&d[0],
+                                                   &p);
+                                       }
+                               }
+                               break;
+
+                       case O_IP6_SRC_ME:
+                               match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+                               break;
+
+                       case O_IP6_DST_ME:
+                               match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+                               break;
+
+                       case O_FLOW6ID:
+                               match = is_ipv6 &&
+                                   flow6id_match(args->f_id.flow_id6,
+                                   (ipfw_insn_u32 *) cmd);
+                               break;
+
+                       case O_EXT_HDR:
+                               match = is_ipv6 &&
+                                   (ext_hd & ((ipfw_insn *) cmd)->arg1);
+                               break;
+
+                       case O_IP6:
+                               match = is_ipv6;
+                               break;
+#endif
+
+                       case O_IP4:
+                               match = is_ipv4;
+                               break;
+
+                       case O_TAG: {
+                               struct m_tag *mtag;
+                               uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+
+                               /* Packet is already tagged with this tag? */
+                               mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
+
+                               /* We have `untag' action when F_NOT flag is
+                                * present. And we must remove this mtag from
+                                * mbuf and reset `match' to zero (`match' will
+                                * be inverted later).
+                                * Otherwise we should allocate new mtag and
+                                * push it into mbuf.
+                                */
+                               if (cmd->len & F_NOT) { /* `untag' action */
+                                       if (mtag != NULL)
+                                               m_tag_delete(m, mtag);
+                                       match = 0;
+                               } else if (mtag == NULL) {
+                                       if ((mtag = m_tag_alloc(MTAG_IPFW,
+                                           tag, 0, M_NOWAIT)) != NULL)
+                                               m_tag_prepend(m, mtag);
+                                       match = 1;
+                               }
+                               break;
+                       }
+
+                       case O_FIB: /* try match the specified fib */
+                               if (args->f_id.fib == cmd->arg1)
+                                       match = 1;
+                               break;
+
+                       case O_TAGGED: {
+                               struct m_tag *mtag;
+                               uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+
+                               if (cmdlen == 1) {
+                                       match = m_tag_locate(m, MTAG_IPFW,
+                                           tag, NULL) != NULL;
+                                       break;
+                               }
+
+                               /* we have ranges */
+                               for (mtag = m_tag_first(m);
+                                   mtag != NULL && !match;
+                                   mtag = m_tag_next(m, mtag)) {
+                                       uint16_t *p;
+                                       int i;
+
+                                       if (mtag->m_tag_cookie != MTAG_IPFW)
+                                               continue;
+
+                                       p = ((ipfw_insn_u16 *)cmd)->ports;
+                                       i = cmdlen - 1;
+                                       for(; !match && i > 0; i--, p += 2)
+                                               match =
+                                                   mtag->m_tag_id >= p[0] &&
+                                                   mtag->m_tag_id <= p[1];
+                               }
+                               break;
+                       }
+                               
+                       /*
+                        * The second set of opcodes represents 'actions',
+                        * i.e. the terminal part of a rule once the packet
+                        * matches all previous patterns.
+                        * Typically there is only one action for each rule,
+                        * and the opcode is stored at the end of the rule
+                        * (but there are exceptions -- see below).
+                        *
+                        * In general, here we set retval and terminate the
+                        * outer loop (would be a 'break 3' in some language,
+                        * but we need to set l=0, done=1)
+                        *
+                        * Exceptions:
+                        * O_COUNT and O_SKIPTO actions:
+                        *   instead of terminating, we jump to the next rule
+                        *   (setting l=0), or to the SKIPTO target (setting
+                        *   f/f_len, cmd and l as needed), respectively.
+                        *
+                        * O_TAG, O_LOG and O_ALTQ action parameters:
+                        *   perform some action and set match = 1;
+                        *
+                        * O_LIMIT and O_KEEP_STATE: these opcodes are
+                        *   not real 'actions', and are stored right
+                        *   before the 'action' part of the rule.
+                        *   These opcodes try to install an entry in the
+                        *   state tables; if successful, we continue with
+                        *   the next opcode (match=1; break;), otherwise
+                        *   the packet must be dropped (set retval,
+                        *   break loops with l=0, done=1)
+                        *
+                        * O_PROBE_STATE and O_CHECK_STATE: these opcodes
+                        *   cause a lookup of the state table, and a jump
+                        *   to the 'action' part of the parent rule
+                        *   if an entry is found, or
+                        *   (CHECK_STATE only) a jump to the next rule if
+                        *   the entry is not found.
+                        *   The result of the lookup is cached so that
+                        *   further instances of these opcodes become NOPs.
+                        *   The jump to the next rule is done by setting
+                        *   l=0, cmdlen=0.
+                        */
+                       case O_LIMIT:
+                       case O_KEEP_STATE:
+                               if (ipfw_install_state(f,
+                                   (ipfw_insn_limit *)cmd, args, tablearg)) {
+                                       /* error or limit violation */
+                                       retval = IP_FW_DENY;
+                                       l = 0;  /* exit inner loop */
+                                       done = 1; /* exit outer loop */
+                               }
+                               match = 1;
+                               break;
+
+                       case O_PROBE_STATE:
+                       case O_CHECK_STATE:
+                               /*
+                                * dynamic rules are checked at the first
+                                * keep-state or check-state occurrence,
+                                * with the result being stored in dyn_dir.
+                                * The compiler introduces a PROBE_STATE
+                                * instruction for us when we have a
+                                * KEEP_STATE (because PROBE_STATE needs
+                                * to be run first).
+                                */
+                               if (dyn_dir == MATCH_UNKNOWN &&
+                                   (q = ipfw_lookup_dyn_rule(&args->f_id,
+                                    &dyn_dir, proto == IPPROTO_TCP ?
+                                       TCP(ulp) : NULL))
+                                       != NULL) {
+                                       /*
+                                        * Found dynamic entry, update stats
+                                        * and jump to the 'action' part of
+                                        * the parent rule by setting
+                                        * f, cmd, l and clearing cmdlen.
+                                        */
+                                       q->pcnt++;
+                                       q->bcnt += pktlen;
+                                       /* XXX we would like to have f_pos
+                                        * readily accessible in the dynamic
+                                        * rule, instead of having to
+                                        * lookup q->rule.
+                                        */
+                                       f = q->rule;
+                                       f_pos = ipfw_find_rule(chain,
+                                               f->rulenum, f->id);
+                                       cmd = ACTION_PTR(f);
+                                       l = f->cmd_len - f->act_ofs;
+                                       ipfw_dyn_unlock();
+                                       cmdlen = 0;
+                                       match = 1;
+                                       break;
+                               }
+                               /*
+                                * Dynamic entry not found. If CHECK_STATE,
+                                * skip to next rule, if PROBE_STATE just
+                                * ignore and continue with next opcode.
+                                */
+                               if (cmd->opcode == O_CHECK_STATE)
+                                       l = 0;  /* exit inner loop */
+                               match = 1;
+                               break;
+
+                       case O_ACCEPT:
+                               retval = 0;     /* accept */
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_PIPE:
+                       case O_QUEUE:
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               if (cmd->opcode == O_PIPE)
+                                       args->rule.info |= IPFW_IS_PIPE;
+                               if (V_fw_one_pass)
+                                       args->rule.info |= IPFW_ONEPASS;
+                               retval = IP_FW_DUMMYNET;
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               break;
+
+                       case O_DIVERT:
+                       case O_TEE:
+                               if (args->eh) /* not on layer 2 */
+                                   break;
+                               /* otherwise this is terminal */
+                               l = 0;          /* exit inner loop */
+                               done = 1;       /* exit outer loop */
+                               retval = (cmd->opcode == O_DIVERT) ?
+                                       IP_FW_DIVERT : IP_FW_TEE;
+                               set_match(args, f_pos, chain);
+                               args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                   tablearg : cmd->arg1;
+                               break;
+
+                       case O_COUNT:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                               l = 0;          /* exit inner loop */
+                               break;
+
+                       case O_SKIPTO:
+                               f->pcnt++;      /* update stats */
+                               f->bcnt += pktlen;
+                               f->timestamp = time_uptime;
+                           /* If possible use cached f_pos (in f->next_rule),
+                            * whose version is written in f->x_next
+                            * (horrible hacks to avoid changing the ABI).
+                            */
+                           if (cmd->arg1 != IP_FW_TABLEARG &&
+                                   (uintptr_t)f->x_next == chain->id) {
+                               f_pos = (uintptr_t)f->next_rule;
+                               } else {
+                               int i = (cmd->arg1 == IP_FW_TABLEARG) ?
+                                       tablearg : cmd->arg1;
+                               /* make sure we do not jump backward */
+                               if (i <= f->rulenum)
+                                   i = f->rulenum + 1;
+                               f_pos = ipfw_find_rule(chain, i, 0);
+                               /* update the cache */
+                               if (cmd->arg1 != IP_FW_TABLEARG) {
+                                   f->next_rule =
+                                       (void *)(uintptr_t)f_pos;
+                                   f->x_next =
+                                       (void *)(uintptr_t)chain->id;
+                               }
+                               }
+                               /*
+                            * Skip disabled rules, and re-enter
+           &nbs