diff -urBP kernel-orig/include/linux/if_ether.h kernel/include/linux/if_ether.h --- kernel-orig/include/linux/if_ether.h Tue Aug 22 14:34:46 2000 +++ kernel/include/linux/if_ether.h Tue Aug 22 14:32:45 2000 @@ -56,6 +56,8 @@ #define ETH_P_AARP 0x80F3 /* Appletalk AARP */ #define ETH_P_IPX 0x8137 /* IPX over DIX */ #define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ +#define ETH_P_MPLS_UC 0x8847 /* MPLS Unicast */ +#define ETH_P_MPLS_MC 0x8848 /* MPLS Multicast */ #define ETH_P_PPP_DISC 0x8863 /* PPPoE discovery messages */ #define ETH_P_PPP_SES 0x8864 /* PPPoE session messages */ diff -urBP kernel-orig/include/linux/mpls.h kernel/include/linux/mpls.h --- kernel-orig/include/linux/mpls.h Thu Jan 1 01:00:00 1970 +++ kernel/include/linux/mpls.h Wed Oct 18 21:29:08 2000 @@ -0,0 +1,229 @@ +/****************************************************************************** + * mpls.h + * + * An implementation of Multi-Protocol Label Switching (MPLS) for Linux. + * + * Version 2.2 + * + * Copyright (c) 2000, K A Fraser + * + * - Label stacking support added by Phil Quiney + */ + +#ifndef _LINUX_MPLS_H_ +#define _LINUX_MPLS_H_ + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include +#define IFNAMSIZ 16 +#define ETH_ALEN 6 +typedef unsigned int u32; +#endif + +/* PJQ: Configuration macros - added LABEL_STACKING */ +#define LABEL_STACKING + +/* + * Define stack depth of however many labels. Note that increasing this + * value will increase the memory overhead of each new skbuff that is created. + */ +#define LABEL_STACK_DEPTH 8 + +/* + * PJQ: IP_FRAGMENT: This is kind of experimental. The current implementation + * basically makes sure that any unlabelled packet has room for the maximum + * possible label stack. If not it gets fragmented. There is a side effect + * in that it can't cope with fragmenting packets containing labels. In + * practice this would only be a problem if the MTU is different across the + * network. With this option enabled 'ping -s 1470
' works, where + * previously it failed. This is 'good enough' for *my* needs at present ;-) + */ +#define IP_FRAGMENT + + +/* + * Kernel <-> user communication is performed through the netlink + * socket interface. See notes and code in mpls.c for further information. + */ +#define NETLINK_MPLS 12 /* Netlink socket type for MPLS communications. */ + +#define MPLSMSG_ADD_SWITCH_MAPPING 0x01 /* Add a new SWITCH mapping */ +#define MPLSMSG_DEL_SWITCH_MAPPING 0x02 /* Delete a SWITCH mapping */ +#define MPLSMSG_ADD_PORT_MAPPING 0x11 /* Add a new PORT mapping */ +#define MPLSMSG_DEL_PORT_MAPPING 0x12 /* Delete a PORT mapping */ +#define MPLSMSG_ADD_INGRESS_MAPPING 0x21 /* Add a new INGRESS MAPPING */ +#define MPLSMSG_DEL_INGRESS_MAPPING 0x22 /* Delete an INGRESS MAPPING */ +#define MPLSMSG_ADD_EGRESS_MAPPING 0x31 /* Add a new EGRESS MAPPING */ +#define MPLSMSG_DEL_EGRESS_MAPPING 0x32 /* Delete an EGRESS MAPPING */ +#define MPLSMSG_FLUSH_ALL 0x41 /* Flush all MPLS state */ +#define MPLSMSG_DEBUG_ON 0x51 /* Turn on DEBUG mode */ +#define MPLSMSG_DEBUG_OFF 0x52 /* Turn off DEBUG mode */ +#define MPLSMSG_ADD_LABEL_POP 0x61 /* Specify number of pops */ +#define MPLSMSG_DEL_LABEL_POP 0x62 /* & remove it */ + +/* + * PORT_MAPPING_T: In the Ethernet world, we define a port to be a (local + * interface, remote interface) tuple, because this is what is required to + * uniquely identify a point-to-point connection in a connectionless network. + * We identify Ethernet interfaces as follows: + * REMOTE: MAC address (unique id, and needed when forming MAC-layer header) + * LOCAL: Device name/structure (fast lookup, easy access to MAC address) + * + * In the ATM world, a port is uniquely identified by the local interface + * index alone. + * + * A local port provides a conveneient "binding post" for ingress mappings. + * Local ports can be used in any context except as the target of a switch + * mapping -- in thi ssituation an egress mapping should be used. 
+ * + * A port also has an integer identifier associated with it. After + * registering a new port, applications refer to ports by their identifiers + * (when creating switch entries, for example). + */ + +enum { UNDEFINED_PORT = -1, ATM_PORT, ETH_PORT, LOCAL_PORT }; + +typedef struct eth_port_st /* if ( type == ETH_PORT ) */ +{ + char l_ifname[IFNAMSIZ]; + char r_addr[ETH_ALEN]; +} eth_port_t; + +typedef struct atm_port_st /* if ( type == ATM_PORT ) */ +{ + int l_ifindex; +} atm_port_t; + +typedef struct port_mapping_st +{ + int id; /* integer identifier */ + int type; /* interface type (xxx_PORT) */ + union + { + eth_port_t eth; /* ETH_PORT */ + atm_port_t atm; /* ATM_PORT */ + } u; +} port_mapping_t; + + +/* + * CID_T: An input/output connection is identified by a (port, label) pair, + * where a port is as defined above and a label is a standard 20-bit MPLS + * label. + */ +typedef struct cid_st +{ + int port; + int label; +#ifdef LABEL_STACKING + /* + * KAF: I don't think this is the place for label stack fields. However, + * moving them would be more hassle than it's worth at the moment. Leave + * them here for now... + */ + int label_stack[LABEL_STACK_DEPTH - 1]; + int num_labels; + int num_pops; +#endif +} cid_t; + +/* For an ATM interface, a label consists of a (VPI:16, VCI:16) pair. */ +#define SET_VPI(_lab,_vpi) ((_lab) = ((_lab) & 0xffff) | ((_vpi) << 16)) +#define GET_VPI(_lab) ((_lab) >> 16) +#define SET_VCI(_lab,_vci) ((_lab) = ((_lab) & ~0xffff) | (_vci)) +#define GET_VCI(_lab) ((_lab) & 0xffff) + +/* List of ingress/egress protocols recognised by this MPLS implementation. */ +enum { MPLSPROTO_IPV4 }; + + +/* + * IPV4_FEC_T: The FEC specification for an IPv4 ingress mapping. This is very + * simple because we simply use the `tclassid' of each packet's route to + * determine its FEC. + */ +typedef struct ipv4_fec_st +{ + u32 tclassid; /* "realm" ids from IPv4 route lookup */ +} ipv4_fec_t; + + +/* + * FEC_T: Generic FEC specification. 
+ */ +typedef struct fec_st +{ + int proto; /* MPLSPROTO_xxx */ + union + { + ipv4_fec_t ipv4; + } u; +} fec_t; + + +/* + * INGRESS_MAPPING_T: Used by an LSR controller to specify that the FEC + * should be mapped to the input cid . The actual output label and + * interface will be looked up in the switching table. + */ +typedef struct ingress_mapping_st +{ + fec_t fec; + cid_t in_cid; +} ingress_mapping_t; + + +/* + * IPV4_EGRESS_SPEC_T: The egress specification for IPv4 packets. All we need + * to know is the local interface that we'll pretend to the IP code that + * the packet entered on. Thsi is unfortunately necessary as the IP fast + * routing code hashes on the local input interface. + */ +typedef struct ipv4_egress_spec_st +{ + char ifname[IFNAMSIZ]; +} ipv4_egress_spec_t; + + +/* + * EGRESS_SPEC_T: Generic egress specification. + */ +typedef struct egress_spec_st +{ + int proto; /* protocol (MPLSPROTO_xxx) to hand off to. */ + union + { + ipv4_egress_spec_t ipv4; + } u; +} egress_spec_t; + + +/* + * EGRESS_MAPPING_T: Used by an LSR controller to specify that incoming packets + * on the given connection should be passed to the relevant + * protocol-specific handler. + */ +typedef struct egress_mapping_st +{ + cid_t in_cid; + egress_spec_t egress; +} egress_mapping_t; + + +/* + * SWITCH_MAPPING_T: Associates an input connection with an output connection + * and QoS spec. 
+ */ +typedef struct switch_mapping_st +{ + cid_t in_cid; + cid_t out_cid; + u32 out_tc_index; /* QoS spec index */ +} switch_mapping_t; + + +#endif /* _MPLS_H_ */ diff -urBP kernel-orig/net/Config.in kernel/net/Config.in --- kernel-orig/net/Config.in Tue Aug 22 14:34:47 2000 +++ kernel/net/Config.in Tue Aug 22 14:33:03 2000 @@ -18,6 +18,12 @@ fi bool 'Socket Filtering' CONFIG_FILTER tristate 'Unix domain sockets' CONFIG_UNIX +tristate 'Multiprotocol Label Switching' CONFIG_MPLS +if [ "$CONFIG_MPLS" != "n" ]; then + define_bool CONFIG_IP_ADVANCED_ROUTER y + define_bool CONFIG_NETFILTER y + define_bool CONFIG_NET_CLS_ROUTE y +fi bool 'TCP/IP networking' CONFIG_INET if [ "$CONFIG_INET" = "y" ]; then source net/ipv4/Config.in diff -urBP kernel-orig/net/Makefile kernel/net/Makefile --- kernel-orig/net/Makefile Tue Aug 22 14:34:47 2000 +++ kernel/net/Makefile Tue Aug 22 14:33:09 2000 @@ -34,6 +34,14 @@ endif endif +ifeq ($(CONFIG_MPLS),y) +SUB_DIRS += mpls +else + ifeq ($(CONFIG_MPLS),m) + MOD_SUB_DIRS += mpls + endif +endif + ifeq ($(CONFIG_IPV6),y) SUB_DIRS += ipv6 else diff -urBP kernel-orig/net/core/skbuff.c kernel/net/core/skbuff.c --- kernel-orig/net/core/skbuff.c Tue Aug 22 14:34:46 2000 +++ kernel/net/core/skbuff.c Tue Aug 22 14:33:30 2000 @@ -62,6 +62,8 @@ #include #include +#include /* for LABEL_STACK_DEPTH */ + int sysctl_hot_list_len = 128; static kmem_cache_t *skbuff_head_cache; @@ -187,18 +189,23 @@ /* Get the DATA. Size must match skb_add_mtu(). */ size = ((size + 15) & ~15); - data = kmalloc(size + sizeof(atomic_t), gfp_mask); +/* + * There must be a better way...it will do for now. Label stack of 4 + * means a 4x4 = 16 byte overhead for _every_ skbuff allocated, for example. 
+ */ +#define MPLS_SHIM_SPACE (LABEL_STACK_DEPTH * 4) /* make room for a shim hdr */ + data = kmalloc(size + MPLS_SHIM_SPACE + sizeof(atomic_t), gfp_mask); if (data == NULL) goto nodata; /* XXX: does not include slab overhead */ - skb->truesize = size + sizeof(struct sk_buff); + skb->truesize = size + MPLS_SHIM_SPACE + sizeof(struct sk_buff); /* Load the data pointers. */ skb->head = data; - skb->data = data; - skb->tail = data; - skb->end = data + size; + skb->data = data + MPLS_SHIM_SPACE; + skb->tail = data + MPLS_SHIM_SPACE; + skb->end = data + MPLS_SHIM_SPACE + size; /* Set up other state */ skb->len = 0; diff -urBP kernel-orig/net/mpls/Makefile kernel/net/mpls/Makefile --- kernel-orig/net/mpls/Makefile Thu Jan 1 01:00:00 1970 +++ kernel/net/mpls/Makefile Tue Aug 22 14:33:44 2000 @@ -0,0 +1,4 @@ +O_OBJS := mpls.o +M_OBJS := $(O_OBJS) + +include $(TOPDIR)/Rules.make diff -urBP kernel-orig/net/mpls/mpls.c kernel/net/mpls/mpls.c --- kernel-orig/net/mpls/mpls.c Thu Jan 1 01:00:00 1970 +++ kernel/net/mpls/mpls.c Wed Oct 18 21:40:21 2000 @@ -0,0 +1,2583 @@ +/****************************************************************************** + * mpls.c + * + * An implementation of Multi-Protocol Label Switching (MPLS) for Linux. + * + * Version 2.2 + * + * Copyright (c) 2000, K A Fraser + * + * - Label stacking support added by Phil Quiney + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) +#include +#include +#endif + +/* + * Debugging definitions. + */ + +#if 0 +#define TRC(__f,__a...) if ( mpls_debug ) printk(__f , ## __a) +#define ASSERT(__x) \ + if(!(__x)) { \ + printk(__FUNCTION__": assertion "#__x" failed\n"); \ + cli(); for ( ;; ); } +#else +#define TRC(__f,__a...) ((void)0) +#define ASSERT(__X) ((void)0) +#endif + +#if 0 +#define LSTACK_TRC(__f,__a...) 
if ( mpls_debug ) printk(__f , ## __a) +#else +#define LSTACK_TRC(__f,__a...) ((void)0) +#endif + +static int mpls_debug = 1; + +/* + * The standard Linux linked-list macros, with a couple of definitions + * to make them more palatable when used as per-bucket chains for a hash table. + */ + +#include + +typedef struct list_head bucket_entry_t; +#define bucket_init(__b) INIT_LIST_HEAD((__b)) +#define for_each_in_bucket(__e,__b) \ + list_for_each(((bucket_entry_t *)(__e)),(__b)) +#define add_to_bucket(__e,__b) list_add(((bucket_entry_t *)(__e)),(__b)) +#define del_from_bucket(__e) list_del(((bucket_entry_t *)(__e))) + + +/* + * K_PORT_MAPPING_T: Internal versions of the user-accessible structures. + */ + +typedef struct k_eth_port_st +{ + struct net_device *l_ifdev; /* A direct ptr instead of the name */ + char r_addr[ETH_ALEN]; +} k_eth_port_t; + +typedef struct atm_port_st k_atm_port_t; /* No change to the atm structure */ + +typedef struct k_port_mapping_st +{ + int id; + int type; + union + { + k_eth_port_t eth; + k_atm_port_t atm; + } u; + atomic_t refcnt; +} k_port_mapping_t; + + +/* + * K_CID_T: Includes extra internal state. + */ +typedef struct k_cid_st +{ + int port; + int label; +#ifdef LABEL_STACKING + int label_stack[LABEL_STACK_DEPTH - 1]; + int num_labels; + int num_pops; +#endif + /* XXX TO HERE _MUST_ MATCH `cid_t' XXX */ + struct socket *sock; /* only used by ATM */ +} k_cid_t; + + +/* + * K_EGRESS_SPEC_T: Define internal versions of all the egress specifications. + */ + +typedef struct k_ipv4_egress_spec_st +{ + struct net_device *iif; /* Direct pointer instead of device name. */ +} k_ipv4_egress_spec_t; + +typedef struct k_egress_spec_st +{ + int proto; /* protocol (MPLSPROTO_xxx) to hand off to. */ + union + { + k_ipv4_egress_spec_t ipv4; /* MPLSPROTO_IPV4 */ + } u; +} k_egress_spec_t; + + +/* + * K_SWITCH_MAPPING_T: Although it is hidden from applications, forwarding + * and egress mapping share the same switching table, which we define here. 
+ * + * For a given , we may switch to one of: + * SM_TYPE_NONE: This entry only exists so that other forwarding mappings + * can reference this as their output cid. We do + * this so that each cid only needs to be referenced once. + * SM_TYPE_FORWARD: A forwarding specification is included which tells us + * the output connection and QoS details. + * SM_TYPE_EGRESS: An egress specification is included which tells us which + * protocol-specific handler to hand the packet off to. + */ +#ifdef LABEL_STACKING +enum { SM_TYPE_NONE, SM_TYPE_FORWARD, SM_TYPE_EGRESS, SM_TYPE_LABEL_POP }; +#else +enum { SM_TYPE_NONE, SM_TYPE_FORWARD, SM_TYPE_EGRESS }; +#endif + +typedef struct k_forwarding_spec_st +{ + k_cid_t *out_cid; /* output connection */ + u32 out_tc_index; /* output "QoS" (actually an index for a qdisc) */ +} k_forwarding_spec_t; + +typedef struct k_switch_mapping_st +{ + k_cid_t in_cid; + int type; /* SM_TYPE_xxx */ + union + { + k_forwarding_spec_t fs; /* SM_TYPE_FORWARD */ + k_egress_spec_t es; /* SM_TYPE_EGRESS */ + } u; + atomic_t refcnt; +} k_switch_mapping_t; + + +/* + * K_INGRESS_MAPPING_T: As the user-accessible version, but the field + * is now a reference (see description of k_switch_mapping_t). + */ +typedef struct k_ingress_mapping_st +{ + fec_t fec; + k_cid_t *in_cid; +} k_ingress_mapping_t; + + +/* + * Table entry structures. Just simple wrappers around the 'k_mapping' structs. + */ + +typedef struct switch_table_entry_st +{ + bucket_entry_t __b; + k_switch_mapping_t sm; +} switch_table_entry_t; + +typedef struct port_table_entry_st +{ + bucket_entry_t __b; + k_port_mapping_t pm; +} port_table_entry_t; + + +typedef struct ingress_table_entry_st +{ + bucket_entry_t __b; + k_ingress_mapping_t im; +} ingress_table_entry_t; + + + +/* + * Cache from which we allocate entries for the ingress and switch tables. + * This is why the cache block size is the larger of the two structures. 
+ * Note that the port table entries are preallocated, so their size is + * not taken into account when calculating the cache block size. + */ + +static kmem_cache_t *mpls_cachep = NULL; +#define CACHE_SIZE \ + (sizeof(switch_table_entry_t) > sizeof(ingress_table_entry_t)) ? \ + sizeof(switch_table_entry_t) : sizeof(ingress_table_entry_t) + + +/* + * PORT TABLE DEFINITIONS: We have two ways of looking up a port, depending + * on whether we wish to get the port details (eg. interface pointer) from + * the port identifier, or vice versa. + * + * The former we get by simply indexing into an array of table_entry + * structures (hence port table identifiers are limited to quite small + * integers!!). + * + * For the other direction we set up a hash table in a similar way to the + * ingress mapping and switching hash tables. + */ + +#define MAX_PORT 255 /* The maximum allowed port identifier */ +#define PORT_HASH_SIZE 256 /* # buckets in (local if, rem if) -> id table */ +#define PORT_HASH_BY_ETH_IF(__dev, __mac) (unsigned int) \ + ((((unsigned int)(__dev)) ^ (*(unsigned int*)(__mac))) % PORT_HASH_SIZE) +#define PORT_HASH_BY_ATM_IF(__if) (unsigned int) \ + (((unsigned int)(__if)) % PORT_HASH_SIZE) +static port_table_entry_t ports[MAX_PORT+1]; /* id -> 'details' */ +static bucket_entry_t port_table[PORT_HASH_SIZE]; /* 'details' -> id */ + + +/* + * INGRESS TABLE DEFINITIONS: A hash table for looking up the input connection + * identifier (in_cid) for a given forwarding equivalence class (FEC). + * + * FECs are identified for us by the IP routing code. FECs are registered + * by installing special route table entries with REALMS that are + * registered with this module. + * + * The 'FEC -> cid' lookup therefore reduces to a simple + * '(source realm, dest realm) -> cid' lookup. 
+ */ +#define INGRESS_HASH_SIZE 256 +#define INGRESS_IPV4_HASH(__tclassid) (unsigned int) \ + (((unsigned int)(__tclassid)) % INGRESS_HASH_SIZE) +static bucket_entry_t ingress_table[INGRESS_HASH_SIZE]; /* realm -> cid */ + + +/* + * SWITCH TABLE DEFINITIONS: A hash table for looking up the output port, + * label and QoS details for a given input connection identifier (in_cid). + * + * CIDs are simply (port, label) pairs, as previously discussed. + */ +#define SWITCH_HASH_SIZE 256 +#define SWITCH_HASH(_cid) (unsigned int) \ + ((((unsigned int)(_cid)->label) ^ ((unsigned int)(_cid)->port)) \ + % SWITCH_HASH_SIZE) +static bucket_entry_t switch_table[SWITCH_HASH_SIZE]; + + +/* + * Global lock for the entire module. Coarse-grained, but probably adequate. + */ +static spinlock_t mpls_lock; + + +/* + * Netlink socket used for kernel <-> user communication. + * + * Applications use it to create and destroy port, ingress and switch mappings. + */ +static struct sock *mplsnl = NULL; + +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) + +/* + * Tasklet and frame queue for processing incoming ATM packets. + */ +static struct tasklet_struct recv_atm_tasklet = { 0 }; +static struct sk_buff_head atm_recvq; + + +/* + * The fake ATM device used for packet transmission. + */ +static struct net_device *atm_dev = NULL; +static char *atm_dev_name = "atm_mpls"; + +#endif /* CONFIG_ATM || CONFIG_ATM_MODULE */ + + +/* + * Internal function prototypes. 
+ */ + +/* Ingress table functions */ +static int add_ite(ingress_table_entry_t *ite); +static int del_ite(fec_t *fec); +static ingress_table_entry_t *ite_from_fec(fec_t *fec); + +/* Switch table functions */ +static __inline__ void ste_hold(switch_table_entry_t *ste); +static __inline__ void cid_hold(k_cid_t *cid); +static __inline__ void ste_put(switch_table_entry_t *ste); +static __inline__ void cid_put(k_cid_t *cid); +static switch_table_entry_t *ste_from_cid(k_cid_t *cid); +static switch_table_entry_t *create_ste(k_cid_t *cid); + +/* Port table functions */ +static int add_pte(port_table_entry_t *pte); +static void del_pte(int id); +static port_table_entry_t *pte_from_eth_if( + char *r_addr, struct net_device *l_ifdev); +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) +static port_table_entry_t *pte_from_atm_if(int l_ifindex); +#endif +static __inline__ port_table_entry_t *pte_from_if(k_port_mapping_t *pm); + +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) +/* ATM support functions */ +static int atm_start_xmit(struct sk_buff *skb, struct net_device *dev); +static void atm_push(struct atm_vcc *vcc, struct sk_buff *skb); +static void atm_pop(struct atm_vcc *vcc, struct sk_buff *skb); +#endif + +/* Data-path functions */ +static void mpls_ip_fragment(struct sk_buff *skb, int ttl, k_cid_t *in_cid, + int bos); +static void mpls_output_switch(struct sk_buff *skb, int ttl, k_cid_t *in_cid, + int bos); +static unsigned int mpls_ingress_hook( + unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)); +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) +static void recv_atm_frame(unsigned long data); +#endif +static int recv_eth_frame( + struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt); + +/* Support functions */ +static int flush_all(void); +static __inline__ void netlink_rcv_skb(struct sk_buff *skb); +static void netlink_rcv_sk(struct sock 
*sk, int len); +static int mpls_get_info(char *buffer, char **start, off_t offset, int length); + + + + +/****************************************************************************** + **** INITIALISATION ********************************************************** + */ + + +/* + * mpls_packet_type: a packet_type hook through which we receive all MPLS + * Ethernet frames. The hook is called soon after a packet is + * received at a network interface, and so shouldn't + * introduce much latency. + */ +static struct packet_type mpls_packet_type = +{ + __constant_htons(ETH_P_MPLS_UC), /* packet type that we're interested in */ + NULL, /* we're not device-specific */ + recv_eth_frame, /* func which does receive processing */ + (void *)1, /* new-style handler! (see core/dev.c) */ + NULL +}; + + +/* + * mpls_ingress_ops: a netfilter hook through which we receive all IP packets + * immediately after they have been routed. The hook + * function looks for traffic class identifiers registered + * with the MPLS module, and steals packets that have an + * ingress mapping. + */ +static struct nf_hook_ops mpls_ingress_ops = +{ + { NULL, NULL }, + mpls_ingress_hook, + PF_INET, + NF_IP_POST_ROUTING, + NF_IP_PRI_LAST +}; + + +/* + * mpls_proto_init: + * START OF DAY INITIALISATION ROUTINE. + */ +#ifdef MODULE +int init_module(void) +#else +void __init mpls_proto_init(struct net_proto *pro) +#endif +{ + int i, cleanup=1; + + printk("MPLS version 2.2 (18/10/00) by Keir Fraser (kaf24@cl.cam.ac.uk)\n"); +#ifdef LABEL_STACKING + printk("Label Stacking by Phil Quiney (pquiney@nortelnetworks.com)\n"); + LSTACK_TRC(" -> Label Stacking Debug compiled in\n"); +#endif +#ifdef IP_FRAGMENT + printk("IP Fragmentation enabled (pquiney@nortelnetworks.com)\n"); +#endif +#if defined(CONFIG_NET_CLS_TCINDEX) || defined(CONFIG_NET_CLS_TCINDEX_MODULE) + printk("5-Tuple Classifier Enabled\n"); +#endif + + TRC(" -> init_module at address %p\n", init_module); + + /* Initialise the lookup tables. 
*/ + memset(switch_table, 0, sizeof(switch_table)); + memset(ports, 0, sizeof(ports)); + memset(port_table, 0, sizeof(port_table)); + memset(ingress_table, 0, sizeof(ingress_table)); + for ( i = 0; i < MAX_PORT; i++ ) ports[i].pm.type = UNDEFINED_PORT; + for ( i = 0; i < PORT_HASH_SIZE; i++ ) bucket_init(&port_table[i]); + for ( i = 0; i < SWITCH_HASH_SIZE; i++ ) bucket_init(&switch_table[i]); + for ( i = 0; i < INGRESS_HASH_SIZE; i++ ) bucket_init(&ingress_table[i]); + + spin_lock_init(&mpls_lock); + spin_lock_irq(&mpls_lock); + + mpls_cachep = kmem_cache_create( + "mpls", CACHE_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if ( !mpls_cachep ) + { + printk("mpls: initialisation failed: unable to create slab cache\n"); + goto fail; + } + cleanup <<= 1; + + mplsnl = netlink_kernel_create(NETLINK_MPLS, netlink_rcv_sk); + if ( !mplsnl ) + { + printk("mpls: initialisation failed: unable to " + "create kernel netlink socket\n"); + goto fail; + } + cleanup <<= 1; + + /* Install a hook, after IP routing, to filter MPLS ingress packets. */ + nf_register_hook(&mpls_ingress_ops); + cleanup <<= 1; + + /* Register ourselves to receive incoming MPLS packets. */ + dev_add_pack(&mpls_packet_type); + cleanup <<= 1; + +#ifdef CONFIG_PROC_FS + proc_net_create("mpls", 0, mpls_get_info); +#endif + cleanup <<= 1; + +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) + + /* Initialise our ATM receive queue and receive handler. */ + skb_queue_head_init(&atm_recvq); + tasklet_init(&recv_atm_tasklet, recv_atm_frame, 0); + cleanup <<= 1; + + /* Set up our dummy ATM transmission device. 
*/ + atm_dev = kmalloc(sizeof(*atm_dev), GFP_KERNEL); + if (!atm_dev) goto fail; + cleanup <<= 1; + + memset(atm_dev, 0, sizeof(*atm_dev)); + strcpy(atm_dev->name, atm_dev_name); + atm_dev->hard_start_xmit = atm_start_xmit; + atm_dev->mtu = 1500; + atm_dev->type = ARPHRD_ATM; + + if ( register_netdev(atm_dev) ) goto fail; + cleanup <<= 1; + + rtnl_lock(); + if ( dev_open(atm_dev) ) + { + rtnl_unlock(); + goto fail; + } + rtnl_unlock(); + cleanup <<= 1; + +#endif /* CONFIG_ATM || CONFIG_ATM_MODULE */ + + spin_unlock_irq(&mpls_lock); + +#ifdef MODULE + return(0); +#else + return; +#endif + + fail: + switch ( cleanup ) + { +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) + case 512: dev_close(atm_dev); + case 256: unregister_netdev(atm_dev); + case 128: kfree(atm_dev); + case 64: tasklet_kill(&recv_atm_tasklet); +#endif + case 32: +#ifdef CONFIG_PROC_FS + proc_net_remove("mpls"); +#endif + case 16: dev_remove_pack(&mpls_packet_type); + case 8: nf_unregister_hook(&mpls_ingress_ops); + case 4: sock_release(mplsnl->socket); + case 2: kmem_cache_destroy(mpls_cachep); + case 1: break; + default: ASSERT(0); + } + +#ifdef MODULE + return(-ENOMEM); +#endif +} + + +/* + * cleanup_module: + * FLUSHES STATE AND UNREGISTERS ALL HOOKS. 
+ */ +#ifdef MODULE +void cleanup_module(void) +{ + TRC("+ cleanup_module()\n"); + + spin_lock_irq(&mpls_lock); + + TRC("* cleanup_module: flush all internal state\n"); + flush_all(); + +#ifdef CONFIG_PROC_FS + TRC("* cleanup_module: remove proc fs entry\n"); + proc_net_remove("mpls"); +#endif + +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) + TRC("* cleanup_module: remove dummy ATM device\n"); + unregister_netdev(atm_dev); + TRC("* cleanup_module: remove ATM frame receive function\n"); + tasklet_kill(&recv_atm_tasklet); + { + struct sk_buff *skb; + while ( (skb = skb_dequeue(&atm_recvq)) ) kfree_skb(skb); + } +#endif + + TRC("* cleanup_module: remove Ethernet frame receive function\n"); + dev_remove_pack(&mpls_packet_type); + TRC("* cleanup_module: remove IPv4 post-routing hook\n"); + nf_unregister_hook(&mpls_ingress_ops); + TRC("* cleanup_module: release netlink socket\n"); + sock_release(mplsnl->socket); + TRC("* cleanup_module: destroy slab cache\n"); + kmem_cache_destroy(mpls_cachep); + + spin_unlock_irq(&mpls_lock); + + TRC("- cleanup_module()\n"); +} +#endif + + + + +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) + +/****************************************************************************** + **** ATM SUPPORT FUNCTIONS *************************************************** + */ + +/* + * atm_start_xmit: + * Transmit the given skbuff on the relevant vcc. + */ +static int atm_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct atm_vcc *vcc = *(struct atm_vcc **)(skb->cb); + TRC("* atm_start_xmit()\n"); + (void)vcc->send(vcc, skb); + return(0); +} + + +/* + * atm_push: + * Called by the ATM device interrupt handler when a packet is received on + * a vcc belonging to us. We simply queue the packet up for later processing. 
+ */ +static void atm_push(struct atm_vcc *vcc, struct sk_buff *skb) +{ + if ( !skb ) return; + *(struct atm_vcc **)(skb->cb) = vcc; + skb_queue_tail(&atm_recvq, skb); + tasklet_hi_schedule(&recv_atm_tasklet); +} + + +/* + * atm_pop: + * Called by the ATM device interrupt handler when an skbuff we handed to it + * for transmission has been finished with. + */ +static void atm_pop(struct atm_vcc *vcc, struct sk_buff *skb) +{ + dev_kfree_skb_any(skb); +} + +#endif /* CONFIG_ATM || CONFIG_ATM_MODULE */ + + + + +/****************************************************************************** + **** INGRESS TABLE FUNCTIONS ************************************************* + */ + +static int add_ite(ingress_table_entry_t *ite) +{ + TRC("+ add_ite(): ite=%p\n", ite); + + if ( ite_from_fec(&ite->im.fec) ) + { + TRC("- add_ite(): FAIL -- FEC already in use\n"); + return(-EADDRINUSE); + } + + switch ( ite->im.fec.proto ) + { + case MPLSPROTO_IPV4: + { + int bucket = INGRESS_IPV4_HASH(ite->im.fec.u.ipv4.tclassid); + add_to_bucket(ite, &ingress_table[bucket]); + } + break; + + default: + { + TRC("- add_ite(): FAIL -- invalid protocol '%d'\n", ite->im.fec.proto); + return(-EINVAL); + } + break; + + /* Add more protocols here... 
*/ + } + + TRC("- add_ite(): added successfully\n"); + return(0); +} + + +static int del_ite(fec_t *fec) +{ + ingress_table_entry_t *ite; + int result = -EINVAL; + + TRC("+ del_ite(): fec=%p\n", fec); + + if ( (ite = ite_from_fec(fec)) ) + { + del_from_bucket(ite); + cid_put(ite->im.in_cid); + kmem_cache_free(mpls_cachep, ite); + result = 0; + } + + TRC("- del_ite(): result = %d\n", result); + return(result); +} + + +static ingress_table_entry_t *ite_from_fec(fec_t *fec) +{ + bucket_entry_t *b; + ingress_table_entry_t *ite; + + switch ( fec->proto ) + { + case MPLSPROTO_IPV4: + { + ipv4_fec_t *f = &fec->u.ipv4; + b = &ingress_table[INGRESS_IPV4_HASH(f->tclassid)]; + for_each_in_bucket(ite, b) + { + if ( (ite->im.fec.proto == MPLSPROTO_IPV4) && + (ite->im.fec.u.ipv4.tclassid == f->tclassid) ) return(ite); + } + } + break; + + default: ASSERT(0); + + /* Add more protocols here... */ + } + + return(NULL); +} + + + + +/****************************************************************************** + **** PORT TABLE FUNCTIONS **************************************************** + */ + +/* + * add_pte: + * Adds port
<pte>
to the global port table. A copy of the structure is made,
 * so the parameter can be trashed after this call.
 */
static int add_pte(port_table_entry_t *pte)
{
    TRC("+ add_pte(): pte = %p\n", pte);

    /* Check that identifier is not already in use. */
    if ( ports[pte->pm.id].pm.type != UNDEFINED_PORT )
    {
        TRC("- add_pte(): FAIL -- port identifier '%d' in use\n", pte->pm.id);
        return(-EADDRINUSE);
    }

    /* Check that the interface is not already in use. */
    if ( pte_from_if(&pte->pm) )
    {
        TRC("- add_pte(): FAIL -- interface already in use\n");
        return(-EADDRINUSE);
    }

    /* Add to port array and hash table (if not a local port). */
    memcpy(&ports[pte->pm.id], pte, sizeof(*pte));
    if ( pte->pm.type == ETH_PORT )
    {
        add_to_bucket(
            &ports[pte->pm.id],
            &port_table[PORT_HASH_BY_ETH_IF(pte->pm.u.eth.l_ifdev,
                                            pte->pm.u.eth.r_addr)]);
    }
    else if ( pte->pm.type == ATM_PORT )
    {
        add_to_bucket(
            &ports[pte->pm.id],
            &port_table[PORT_HASH_BY_ATM_IF(pte->pm.u.atm.l_ifindex)]);
    }

    TRC("- add_pte(): success\n");
    return(0);
}


/*
 * del_pte:
 *  Deletes the port table entry with identifier <id>.
 */
static void del_pte(int id)
{
    TRC("+ del_pte(): id = %d\n", id);

    /* No one should refer to a port that is to be deleted. */
    ASSERT(atomic_read(&ports[id].pm.refcnt) == 0);

    /* LOCAL_PORT entries are never hashed (see add_pte), so skip unhash. */
    if ( ports[id].pm.type != LOCAL_PORT ) del_from_bucket(&ports[id]);

    /* Release the device reference taken by dev_get_by_name at add time. */
    if ( ports[id].pm.type == ETH_PORT ) dev_put(ports[id].pm.u.eth.l_ifdev);

    memset(&ports[id], 0, sizeof(port_mapping_t));
    ports[id].pm.type = UNDEFINED_PORT;

    TRC("- del_pte(): success\n");
}


/*
 * pte_from_eth_if:
 *  Look up the Ethernet port entry for the (local device, remote MAC)
 *  pair.  The 6-byte MAC compare is done as a u32 + u16 for speed.
 *  Returns NULL if no matching port exists.
 */
static port_table_entry_t *pte_from_eth_if(
    char *r_addr, struct net_device *l_ifdev)
{
    bucket_entry_t     *b;
    port_table_entry_t *pte;

    b = &port_table[PORT_HASH_BY_ETH_IF(l_ifdev, r_addr)];
    for_each_in_bucket(pte, b)
    {
        if ( (pte->pm.type == ETH_PORT) &&
             (pte->pm.u.eth.l_ifdev == l_ifdev) &&
             (*(u32*)pte->pm.u.eth.r_addr == *(u32*)r_addr) &&
             (*(u16*)(pte->pm.u.eth.r_addr+4) == *(u16*)(r_addr+4)) )
        {
            return(pte);
        }
    }

    return(NULL);
}


#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
/*
 * pte_from_atm_if:
 *  Look up the ATM port entry by local interface index, or NULL.
 */
static port_table_entry_t *pte_from_atm_if(int l_ifindex)
{
    bucket_entry_t     *b;
    port_table_entry_t *pte;

    b = &port_table[PORT_HASH_BY_ATM_IF(l_ifindex)];
    for_each_in_bucket(pte, b)
    {
        if ( (pte->pm.type == ATM_PORT) &&
             (pte->pm.u.atm.l_ifindex == l_ifindex) ) return(pte);
    }

    return(NULL);
}
#endif /* CONFIG_ATM || CONFIG_ATM_MODULE */


/*
 * pte_from_if:
 *  Dispatch to the type-specific port lookup.  LOCAL_PORT entries are
 *  never hashed, so they always report "not in use" here.
 */
static __inline__ port_table_entry_t *pte_from_if(k_port_mapping_t *pm)
{
    switch ( pm->type )
    {
    case ETH_PORT: return(pte_from_eth_if(pm->u.eth.r_addr,pm->u.eth.l_ifdev));
#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
    case ATM_PORT: return(pte_from_atm_if(pm->u.atm.l_ifindex));
#endif
    case LOCAL_PORT: return(NULL);
    default: ASSERT(0);
    }
    return(NULL); /* keep gcc happy */
}




/******************************************************************************
 **** SWITCH TABLE FUNCTIONS **************************************************
 */

/*
 * ste_hold, ste_put, cid_hold, cid_put:
 *  To reduce the complexity of keeping track of all the references between
 *  switch table entries, a reference counting mechanism is used.
 *
 *  Whenever a pointer to a cid structure is stored, `cid_hold' should be
 *  called.  `cid_put' decrements the reference count when the pointer is
 *  destroyed.
 *
 *  Likewise, whenever a new (in_cid -> out_cid) mapping is defined,
 *  `ste_hold' should be called.  `ste_put' is called when the mapping is
 *  destroyed.
 */

static __inline__ void ste_hold(switch_table_entry_t *ste)
{
    TRC("* ste_hold(): ste = %p\n", ste);

    if ( ste ) atomic_inc(&ste->sm.refcnt);
}

/*
 * cid_hold:
 *  A k_cid_t handed out by this module is always the `sm.in_cid' member of
 *  a switch_table_entry_t, so the owning entry is recovered via offsetof.
 *  NOTE(review): `ste' is computed from `cid' before the NULL check; the
 *  arithmetic on a NULL-derived pointer is dodgy, but `ste' is only
 *  dereferenced when `cid' is non-NULL.
 */
static __inline__ void cid_hold(k_cid_t *cid)
{
    switch_table_entry_t *ste = (switch_table_entry_t *)
        ((char *)cid - offsetof(switch_table_entry_t, sm.in_cid));

    TRC("* cid_hold(): cid = %p, ste = %p\n", cid, ste);

    if ( cid ) ste_hold(ste);
}

/*
 * ste_put:
 *  Drop a reference; on the last put the entry is unhashed, its ATM socket
 *  (if any) released, the port refcount dropped, and the entry freed.
 *  Callers must set sm.type to SM_TYPE_NONE before the final put (asserted
 *  below).
 *  NOTE(review): the second TRC reads ste->sm.refcnt *before* the NULL
 *  check -- a NULL `ste' would oops here when tracing is compiled in.
 */
static __inline__ void ste_put(switch_table_entry_t *ste)
{
    TRC("+ ste_put: enter\n");
    TRC("* ste_put(): ste=%p, pre_refcnt=%d\n",
        ste, atomic_read(&ste->sm.refcnt));

    if ( ste && atomic_dec_and_test(&ste->sm.refcnt) )
    {
        TRC("* ste_put(): refcnt==0 -- destroying entry\n");

        /* Table entry has no references!  Destroy it. */
        del_from_bucket(ste);
#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
        if ( ports[ste->sm.in_cid.port].pm.type == ATM_PORT )
        {
            sock_release(ste->sm.in_cid.sock);
        }
#endif
        atomic_dec(&ports[ste->sm.in_cid.port].pm.refcnt);

        ASSERT(ste->sm.type == SM_TYPE_NONE);

        kmem_cache_free(mpls_cachep, ste);
    }
    TRC("- ste_put: exit\n");
}

/*
 * cid_put:
 *  Counterpart of cid_hold -- recover the owning entry and drop one ref.
 */
static __inline__ void cid_put(k_cid_t *cid)
{
    switch_table_entry_t *ste = (switch_table_entry_t *)
        ((char *)cid - offsetof(switch_table_entry_t, sm.in_cid));

    TRC("* cid_put(): cid = %p, ste = %p\n", cid, ste);

    if ( cid ) ste_put(ste);
}


/*
 * ste_from_cid:
 *  Hash lookup of the switch table entry keyed on (port, label).
 *  Returns NULL if no entry exists.
 */
static switch_table_entry_t *ste_from_cid(k_cid_t *cid)
{
    bucket_entry_t       *b;
    switch_table_entry_t *ste;

    b = &switch_table[SWITCH_HASH(cid)];
    for_each_in_bucket(ste, b)
    {
        if ( (cid->port == ste->sm.in_cid.port) &&
             (cid->label == ste->sm.in_cid.label) ) return(ste);
    }

    return(NULL);
}


/*
 * create_ste:
 *  Creates a new table entry if none exists already.  A newly created
 *  entry has type SM_TYPE_NONE, refcnt zero, holds a reference on its
 *  port, and (for ATM ports) owns a bound PVC socket whose push/pop
 *  handlers are redirected into this module.
 */
static switch_table_entry_t *create_ste(k_cid_t *cid)
{
#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
    struct socket          *sock = NULL;
    struct sockaddr_atmpvc  addr;
    struct atm_vcc         *vcc = NULL;
    int                     r;
#endif
    switch_table_entry_t   *ste;
#ifdef LABEL_STACKING
    int                     index;
#endif

    TRC("+ create_ste(): port=%d, label=%d\n", cid->port, cid->label);

    ste = ste_from_cid(cid);

    if ( !ste && (ports[cid->port].pm.type != UNDEFINED_PORT) )
    {
        ste = kmem_cache_alloc(mpls_cachep, GFP_KERNEL);
        if ( ste == NULL )
        {
            TRC("- create_ste(): could not allocate new 'ste' structure\n");
            return(NULL);
        }

        memset(ste, 0, sizeof(*ste));
        ste->sm.type = SM_TYPE_NONE;
        ste->sm.in_cid.port = cid->port;
        ste->sm.in_cid.label = cid->label;

#ifdef LABEL_STACKING
        if(cid->num_labels)
        {
            LSTACK_TRC("create_ste: Copying label stack\n");
            for(index=0;index<cid->num_labels;index++)
            {
                LSTACK_TRC("create_ste: index %d label %d\n",
                           index, cid->label_stack[index]);
                ste->sm.in_cid.label_stack[index] = cid->label_stack[index];
            }
            /* Set number of labels... */
            ste->sm.in_cid.num_labels = cid->num_labels;
        }
#endif

        /* The entry pins its input port until destroyed (see ste_put). */
        atomic_inc(&ports[cid->port].pm.refcnt);

        add_to_bucket(ste, &switch_table[SWITCH_HASH(&ste->sm.in_cid)]);

#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
        if ( ports[cid->port].pm.type == ATM_PORT ) goto create_atm_sock;
#endif
    }

    TRC("- create_ste(): non-atm port created successfully\n");
    return(ste);

#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
 create_atm_sock:

    if ( (r = sock_create(PF_ATMPVC, SOCK_DGRAM, 0, &sock)) )
    {
        TRC("* create_ste(): failed to create socket (%d)\n", -r);
        goto atm_fail;
    }

    /* Set up the QoS specification for this connection endpoint. */
    vcc = ATM_SD(sock);
    memset(&vcc->qos, 0, sizeof(vcc->qos));
    vcc->qos.txtp.traffic_class = ATM_UBR;
    vcc->qos.rxtp.traffic_class = ATM_UBR;
    vcc->qos.aal = ATM_AAL5;
    set_bit(ATM_VF_HASQOS, &vcc->flags);

    /* Bind the endpoint to a (vpi, vci) pair encoded in the label. */
    addr.sap_family = AF_ATMPVC;
    addr.sap_addr.itf = ports[cid->port].pm.u.atm.l_ifindex;
    addr.sap_addr.vpi = GET_VPI(cid->label);
    addr.sap_addr.vci = GET_VCI(cid->label);
    if ( (r = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr))) )
    {
        TRC("* create_ste(): failed to bind socket (%d)\n", -r);
        goto atm_fail;
    }

    /* Swizzle the data-path function pointers to point at us. */
    vcc->push = atm_push;
    vcc->pop = atm_pop;

    ste->sm.in_cid.sock = sock;

    TRC("- create_ste(): atm port created successfully\n");
    return(ste);

    /* Failed to set up ATM socket: free resources. */
 atm_fail:
    if ( sock ) sock_release(sock);
    del_from_bucket(ste);
    atomic_dec(&ports[cid->port].pm.refcnt);
    kmem_cache_free(mpls_cachep, ste);
    TRC("- create_ste(): FAIL -- could not create ATM socket\n");
    return(NULL);
#endif /* CONFIG_ATM || CONFIG_ATM_MODULE */
}




/******************************************************************************
 **** PACKET RECEIVE & SWITCHING FUNCTIONS ************************************
 */

#ifdef IP_FRAGMENT
/*
 * This function is called to fragment the skb so that it can hold a
 * maximum stack of labels & still fit in the MTU.  The initial version
 * can't cope with a packet with labels on it -- hopefully this isn't too
 * much of a problem in the sort of network we are using.  This may bite us
 * in the future...
 *
 * The code for this comes mainly from the ip_fragment function in the
 * system file net/ipv4/ip_output.c and has been tweaked to fulfil this
 * requirement.
 *
 * KAF: This is grim, but there isn't really any easy way of bending
 * the original function to our purpose :-(
 *
 * Called without mpls_lock held; it re-takes the lock before each
 * mpls_output_switch call (which releases it again).
 * NOTE(review): `call_flag' is a plain static, so this re-entry guard is
 * not SMP/irq safe -- confirm the single-caller assumption.
 */
static void mpls_ip_fragment(struct sk_buff *skb, int ttl, k_cid_t *in_cid,
                             int bos)
{
    static int       call_flag = 0;
    struct iphdr    *iph;
    unsigned char   *raw;
    unsigned char   *ptr;
    struct net_device *dev;
    struct sk_buff  *skb2;
    unsigned int     mtu, hlen, left, len;
    int              offset;
    int              not_last_frag;
    struct rtable   *rt = (struct rtable*)skb->dst;

    /* Protect the function against call thrashing */
    if(call_flag)
    {
        printk("mpls_ip_fragment: unexpected call...\n");
        call_flag = 0;
        goto fail;
    }

    call_flag = 1;
    TRC("+ mpls_ip_fragment: skb = %p\n", skb);
    dev = rt->u.dst.dev;

    /*
     * Point into the IP datagram header.
     */

    raw = skb->nh.raw;
    iph = (struct iphdr*)raw;

    /*
     * Setup starting values.
     */

    hlen = iph->ihl * 4;
    left = ntohs(iph->tot_len) - hlen;  /* Space per frame */
    mtu = rt->u.dst.pmtu - hlen;        /* Size of data space */
#ifdef LABEL_STACKING
    mtu -= LABEL_STACK_DEPTH * 4;       /* Allow for max label stack */
#else
    mtu -= 4;                           /* Allow for single label */
#endif

    TRC("* mpls_ip_fragment: mtu = %d\n", mtu);

    ptr = raw + hlen;                   /* Where to start from */

    /*
     * Fragment the datagram.
     */

    offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
    not_last_frag = iph->frag_off & htons(IP_MF);

    /*
     * Keep copying data until we run out.
     */

    while(left > 0)
    {
        len = left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > mtu)
            len = mtu;
        /* IF: we are not sending upto and including the packet end
           then align the next start on an eight byte boundary */
        if (len < left) {
            len &= ~7;
        }
        /*
         * Allocate buffer.
         */

        if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL)
        {
            printk("mpls_ip_fragment: no memory for new fragment!\n");
            goto fail;
        }

        /*
         * Set up data on packet
         */

        skb2->pkt_type = skb->pkt_type;
        skb2->priority = skb->priority;
        skb_reserve(skb2, (dev->hard_header_len+15)&~15);
        skb_put(skb2, len + hlen);
        skb2->nh.raw = skb2->data;
        skb2->h.raw = skb2->data + hlen;

        /*
         * Charge the memory for the fragment to any owner
         * it might possess
         */

        if (skb->sk)
            skb_set_owner_w(skb2, skb->sk);
        skb2->dst = dst_clone(skb->dst);
        skb2->dev = skb->dev;

        /*
         * Copy the packet header into the new buffer.
         */

        memcpy(skb2->nh.raw, raw, hlen);

        /*
         * Copy a block of the IP datagram.
         */
        memcpy(skb2->h.raw, ptr, len);
        left -= len;

        /*
         * Fill in the new header fields.
         */
        iph = skb2->nh.iph;
        iph->frag_off = htons((offset >> 3));

        /* ANK: dirty, but effective trick.  Upgrade options only if
         * the segment to be fragmented was THE FIRST (otherwise,
         * options are already fixed) and make it ONCE
         * on the initial skb, so that all the following fragments
         * will inherit fixed options.
         */
        if (offset == 0)
            ip_options_fragment(skb);

        /*
         * Added AC : If we are fragmenting a fragment that's not the
         * last fragment then keep MF on each bit
         */
        if (left > 0 || not_last_frag)
            iph->frag_off |= htons(IP_MF);
        ptr += len;
        offset += len;

#ifdef CONFIG_NETFILTER
        /* Connection association is same as pre-frag packet */
        skb2->nfct = skb->nfct;
        nf_conntrack_get(skb2->nfct);
#ifdef CONFIG_NETFILTER_DEBUG
        skb2->nf_debug = skb->nf_debug;
#endif
#endif

        /*
         * Put this fragment into the sending queue.
         */


        iph->tot_len = htons(len + hlen);

        ip_send_check(iph);

        /* Must take this before calling mpls_output_switch */
        spin_lock_irq(&mpls_lock);
        mpls_output_switch(skb2, ttl, in_cid, bos);
    }

    kfree_skb(skb);
    TRC("- mpls_ip_fragment: OK\n");
    call_flag = 0;
    return;

fail:
    kfree_skb(skb);
    TRC("- mpls_ip_fragment: FAIL\n");
    call_flag = 0;
    return;
}
#endif /* IP_FRAGMENT */

/*
 * mpls_output_switch:
 *  Given a packet (skb) and an input connection identifier (in_cid),
 *  this function will switch the packet to an output (port, label),
 *  attaching the given ttl field.
 *
 *  If no switch entry exists for the given in_cid, or the packet is too
 *  large for the output interface, the packet is dropped and any
 *  appropriate action taken.  It is therefore unnecessary for this
 *  function to return any error indication to the caller.
 *
 *  NB. The sk_buff *must* have its IP header pointer set correctly.
 *
 *  NB2. This function must be called with the mpls_lock HELD!!  It will be
 *  released before this function exits.
 */
static void mpls_output_switch(struct sk_buff *skb, int ttl, k_cid_t *in_cid,
                               int bos)
{
    struct net_device    *out_dev = NULL;
    switch_table_entry_t *ste;
    k_port_mapping_t     *out_pm;
#ifdef LABEL_STACKING
    int i;
    int first_label;
    int label;
    int new_bos = 0, new_ttl = 0, new_label = 0;
    u32 new_shim;
    k_cid_t new_cid;
#endif

    if ( --ttl < 1 ) goto time_exceeded;

    /* Find the switch table entry for the given connection id. */
    if ( !(ste = ste_from_cid(in_cid)) ||
         (ste->sm.type == SM_TYPE_NONE) ) goto error;

    /* If it's an egress mapping, we deliver locally. */
    if ( ste->sm.type == SM_TYPE_EGRESS ) goto local_deliver;

#ifdef LABEL_STACKING
    /*
     * A new type has been introduced for label stacking which allows
     * a label/port to pop the stack a configured number of times before
     * forwarding.  On entry to this function the top label has already
     * been removed...
     */
    if ( ste->sm.type == SM_TYPE_LABEL_POP )
    {
        if ( bos )
        {
            printk("mpls_output_switch: Error Label "
                   "received at bos when set to SM_TYPE_LABEL_POP\n");
            goto error;
        }

        LSTACK_TRC("mpls_output_switch: POP %d labels\n",
                   ste->sm.in_cid.num_pops);

        new_bos = 0;  /* Preset this */
        for ( i = 0; i < ste->sm.in_cid.num_pops; i++ )
        {
            if ( new_bos )
            {
                LSTACK_TRC("* mpls_output_switch:"
                           " bos hit before pop count expired\n");
                break;
            }

            /* Grab the shim data. (label:20, reserved:3, bos:1, ttl:8) */
            new_shim  = ntohl(*(u32 *)skb->data);
            new_label = (new_shim >> 12) & 0xfffff;
            new_bos   = (new_shim >> 8) & 1;
            new_ttl   = new_shim & 0xff;

            skb_pull(skb, 4);
        }

        if ( new_bos )
        {
            /* This is only true if the label is bos */
            skb->nh.raw = skb->data;
            skb->h.raw  = skb->nh.raw + (skb->nh.iph->ihl << 2);
        }

        /* Prepare to call ourselves again with a new cid value */
        new_cid.port  = in_cid->port;
        new_cid.label = new_label;

        LSTACK_TRC("mpls_output_switch: port %d label %d\n",
                   new_cid.port, new_cid.label);
        LSTACK_TRC("mpls_output_switch: skb %x ttl %d bos %d\n",
                   (unsigned int)skb, new_ttl, new_bos);
        LSTACK_TRC("mpls_output_switch: recursive call.."
                   "watch out for the bits\n");

        /* Recursive call re-enters with mpls_lock still held. */
        mpls_output_switch(skb, new_ttl, &new_cid, new_bos);
        return;
    }
#endif /* LABEL_STACKING */

    /*** From here on we deal with forwarding only... ***/

    out_pm = &ports[ste->sm.u.fs.out_cid->port].pm;

    TRC("+ mpls_output_switch(): forwarding %d/%d -> %d/%d\n",
        in_cid->port, in_cid->label,
        ste->sm.u.fs.out_cid->port, ste->sm.u.fs.out_cid->label);

    if ( bos )
    {
        /*
         * We can only trim the packet when we are at stack bottom as
         * otherwise we can't account for the size of the label stack.
         */
        __skb_trim(skb, ntohs(skb->nh.iph->tot_len));
    }

    /* Set the QoS index, used by a tcindex classifier in the oif qdisc. */
#ifdef CONFIG_NET_SCHED
    skb->tc_index = ste->sm.u.fs.out_tc_index;
#endif

    switch ( out_pm->type )
    {
    case ETH_PORT:
    {
        /* Set the output device interface. */
        skb->dev = out_dev = out_pm->u.eth.l_ifdev;

        /* MTU check. */
        TRC("* mpls_output_switch(): skb->len %d MTU %d\n",
            skb->len, out_dev->mtu);

        /*
         * PJQ: Part of the bodge.  If we are not bottom of stack make the
         * assumption that the MTU check has been done...otherwise it
         * will fail in the LSR when not bos and a full size packet.  That
         * is we will detect the presence of labels & refuse to fragment --
         * but at ingress we will have already done the check and made sure
         * we can hold the maximum label stack, thus we can get away with
         * skipping the test.
         * This will break badly if the MTU is different across the network!
         * For our 'captive' network this is a non-problem as all
         * nodes have the same MTU.  This limitation was acceptable at time
         * of writing -- at least to me ;-)
         */
#ifdef LABEL_STACKING
        if((bos) && (skb->len > (out_dev->mtu - (LABEL_STACK_DEPTH * 4))))
#else
        if (skb->len > (out_dev->mtu - 4))
#endif
            goto too_large;

#ifdef LABEL_STACKING
        first_label = ste->sm.u.fs.out_cid->label;

        if ( bos || first_label )
        {
            LSTACK_TRC("mpls_output_switch: First label %d\n", first_label);

            /* Construct the new shim header. */
            skb_push(skb, 4);
            *(u32 *)skb->data = htonl(first_label << 12 | bos << 8 | ttl);

            /* Check to see if there is a stack to push */
            if(ste->sm.u.fs.out_cid->num_labels)
            {
                LSTACK_TRC("mpls_output_switch: Stacking %d labels\n",
                           ste->sm.u.fs.out_cid->num_labels);

                for ( i = 0; i < ste->sm.u.fs.out_cid->num_labels; i++)
                {
                    label = ste->sm.u.fs.out_cid->label_stack[i];

                    LSTACK_TRC("mpls_output_switch: index %d label %d\n",
                               i, label);

                    /*
                     * Not sure if ttl field is correct - probably not
                     * that critical for our lash up
                     */
                    skb_push(skb, 4);

                    /* We can't be bos here as we already pushed one ;-) */
                    *(u32 *)skb->data = htonl(label << 12 | ttl);
                }
            }
        }
        else
        {
            /*
             * We are not bos but label is zero.  This is a good time
             * to do PHP ie do nothing so we send with next label as 'top'
             */
            LSTACK_TRC("mpls_output_switch: PHP\n");
        }
#else
        /* Construct the new shim header. */
        skb_push(skb, 4);
        *(u32 *)skb->data =
            htonl(ste->sm.u.fs.out_cid->label << 12 | bos << 8 | ttl);
#endif /* not ( LABEL_STACKING ) */

        /* Construct the link layer header. */
        if ( skb_headroom(skb) < ETH_HLEN )
        {
            struct sk_buff *nskb;
            nskb = skb_copy_expand(skb, ETH_HLEN, 0, GFP_ATOMIC);
            if ( !nskb ) goto error;
            kfree_skb(skb);
            skb = nskb;
        }
        skb->mac.raw = skb_push(skb, ETH_HLEN);
        memcpy(skb->mac.ethernet->h_source, out_dev->dev_addr, ETH_ALEN);
        memcpy(skb->mac.ethernet->h_dest, out_pm->u.eth.r_addr, ETH_ALEN);
        skb->mac.ethernet->h_proto = __constant_htons(ETH_P_MPLS_UC);
        TRC("- mpls_output_switch(): switched to Ethernet interface\n");
    }
    break;

#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
    case ATM_PORT:
    {
        /* NOTE(review): out_dev is assigned atm_dev twice in this case;
         * the first assignment is redundant. */
        out_dev = atm_dev;
        if ( skb->len > out_dev->mtu ) goto too_large;
        TRC("- mpls_output_switch(): switched to ATM interface\n");
        *(struct atm_vcc **)(skb->cb) = ATM_SD(ste->sm.u.fs.out_cid->sock);
        out_dev = skb->dev = atm_dev;
    }
    break;
#endif

    default: ASSERT(0);
    }

    /* Send the packet.  No need for MPLS lock here! */
    skb->protocol = __constant_htons(ETH_P_MPLS_UC);
    dev_hold(out_dev);
    spin_unlock_irq(&mpls_lock);
    (void)dev_queue_xmit(skb);
    dev_put(out_dev);
    return;


    /*
     * Deliver to a local protocol-specific handler for further processing.
     */
 local_deliver:
    ASSERT(ste->sm.u.es.proto == MPLSPROTO_IPV4); /* IPv4 only for now! */
    TRC("* mpls_output_switch(): delivering locally\n");

#ifdef LABEL_STACKING
    if(!bos)
    {
        /* NOTE: this inner `bos' deliberately shadows the parameter. */
        int bos;
#ifdef DEBUG_LABEL_STACKING
        int label;
#endif
        u32 shim;
        int done = 0;
        int count = 0;

        /*
         * Interesting - need to cope with case where not bos but input
         * label set to 'egress'
         */

        while(!done)
        {
            count++; /* Track number of labels we strip off */

            shim = ntohl(*(u32 *)skb->data);
#ifdef DEBUG_LABEL_STACKING
            label = (shim >> 12) & 0xfffff;
#endif
            bos = (shim >> 8) & 1;
            skb_pull(skb, 4);

#ifdef DEBUG_LABEL_STACKING
            printk("mpls_output_switch: Count %d label %d bos %d\n",
                   count, label, bos);
#endif
            done = bos; /* Dirty way of terminating the while */

            /* Prevent infinite loop */
            if(count>= LABEL_STACK_DEPTH)
                done = 1;
        }


        /* Not sure if this is right - pinched it from elsewhere */
        skb->nh.raw = skb->data;
        skb->h.raw = skb->nh.raw + (skb->nh.iph->ihl << 2);

        if(!bos)
        {
            printk("mpls_output_switch: stuffed packet - can't find bos\n");
            printk("mpls_output_switch: packet dropped\n");
            kfree_skb(skb);
            return;
        }
    }
#endif /* LABEL_STACKING */

    skb->dev = ste->sm.u.es.u.ipv4.iif;
    dev_hold(skb->dev);
    spin_unlock_irq(&mpls_lock);
    skb->protocol = __constant_htons(ETH_P_IP);
    skb->nh.iph->ttl = ttl;
    skb->nh.iph->check = 0;
    skb->nh.iph->check =
        ip_fast_csum((unsigned char *)skb->nh.iph, skb->nh.iph->ihl);
    (void)ip_rcv(skb,skb->dev,NULL);
    return;


    /*
     * Packet is too large for output interface.  Take appropriate action
     * and drop the packet.
     */
 too_large:
    TRC("- mpls_output_switch(): packet too large for interface\n");
    if ( skb->nh.iph->frag_off & __constant_htons(IP_DF) )
    {
        /*
         * We are not allowed to fragment this packet.  We therefore
         * (attempt to) send an ICMP error message to the sender.  However,
         * there's no guarantee that we know a route back to the source (the
         * packet got here by tunnelling through an LSP, after all).  A more
         * robust strategy would be the following (from IETF draft):
         *  (1) Create ICMP packet.
         *  (2) Prepend the current MPLS label stack.
         *  (3) Send the packet on its way to the _destination_ of the LSP.
         * We can expect that the destination will be a router who knows
         * about the sender.
         *
         * For now, note that we _must_ release the mpls_lock before calling
         * icmp_send(), as the ICMP packet may also end up being tunnelled!
         */
        spin_unlock_irq(&mpls_lock);
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                  htonl(out_dev->mtu));
        kfree_skb(skb);
        return;
    }
    else
    {
#ifdef IP_FRAGMENT
        if(bos)
        {
            /*
             * call to fragment packet...there are no labels & ip header
             * ptrs are set up....most of parameters are to allow
             * 'mpls_output_switch' to be called again
             */
            spin_unlock_irq(&mpls_lock);
            mpls_ip_fragment(skb, ttl, in_cid, bos);
            return;
        }
        else
            printk("Attempting to fragment packet with labels...this is "
                   "not supposed to happen\n");
#endif
    }
    goto error;

    /* Packet lifetime in network exceeded (TTL decremented to zero) */
 time_exceeded:
    TRC("* recv_eth_frame(): packet lifetime exceeded\n");
    /* NOTE(review): unlike the too_large path above, icmp_send() is called
     * here with mpls_lock still held (it is released in `error' below) --
     * by this function's own reasoning that risks re-entry/deadlock if the
     * ICMP packet is tunnelled.  Confirm and consider unlocking first. */
    icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);

    /* On an error, we drop the packet. */
 error:
    TRC("- recv_eth_frame(): ERROR PROCESSING PKT\n");
    spin_unlock_irq(&mpls_lock);
    kfree_skb(skb);
    return;
}


/*
 * mpls_ingress_hook:
 *  This is the ingress point for all IPv4 packets entering the "MPLS
 *  cloud".  All packets are passed to this function after routing.  If the
 *  route lookup attached to the packet a traffic class identifier
 *  (_tclassid_) that we know about, the packet will be stolen from IP and
 *  turned into an MPLS packet.
 */
static unsigned int mpls_ingress_hook(
    unsigned int hooknum,
    struct sk_buff **pskb,
    const struct net_device *in,
    const struct net_device *out,
    int (*okfn)(struct sk_buff *))
{
    struct sk_buff        *skb = *pskb;
    ingress_table_entry_t *ite;
    fec_t                  fec;

    spin_lock_irq(&mpls_lock);

    /*
     * Find the input (port, label) for this packet's traffic class
     * identifier, if it has one.  Otherwise we give the packet back to the
     * IPv4 stack for conventional forwarding.
     */
    fec.proto = MPLSPROTO_IPV4;
    fec.u.ipv4.tclassid = skb->dst->tclassid;

#if defined(CONFIG_NET_CLS_TCINDEX) || defined(CONFIG_NET_CLS_TCINDEX_MODULE)
    /*
     * Added by Barry Hill to enable 5 tuple match filtering using tc_index
     * from Jamal's ingress filter
     */
    if (skb->tc_index)
    {
        TRC("mpls_ingress_hook: Classifier = %d\n", skb->tc_index);
        fec.u.ipv4.tclassid = skb->tc_index;
    }
#endif

    if ( !(ite = ite_from_fec(&fec)) )
    {
        spin_unlock_irq(&mpls_lock);
        return(NF_ACCEPT);
    }

    /* Okay, it belongs to an MPLS port: pass to the main switch routine.
     * mpls_output_switch consumes the skb and releases mpls_lock. */
    mpls_output_switch(skb, skb->nh.iph->ttl, ite->im.in_cid, 1);

    /* Stolen by MPLS! */
    return(NF_STOLEN);
}


#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
/*
 * recv_atm_frame:
 *  Process all outstanding ATM frames on the `atm_recvq'.
 */
static void recv_atm_frame(unsigned long data)
{
    struct sk_buff     *skb;
    struct atm_vcc     *vcc;
    port_table_entry_t *pte;
    k_cid_t             cid;

    while ( (skb = skb_dequeue(&atm_recvq)) )
    {
        /* Lock is released per-iteration: by mpls_output_switch on the
         * success path, explicitly on the failure path. */
        spin_lock_irq(&mpls_lock);
        vcc = *(struct atm_vcc **)(skb->cb);

        atm_return(vcc, skb->truesize);

        pte = pte_from_atm_if(vcc->dev->number);
        if ( !pte ) goto fail;

        /* Reconstruct the connection id from the VC's (vpi, vci). */
        cid.port = pte->pm.id;
        SET_VPI(cid.label, vcc->vpi);
        SET_VCI(cid.label, vcc->vci);
        TRC("* recv_atm_frame(): received frame on %d,%d\n",
            cid.port, cid.label);

        skb->dev = atm_dev;
        skb->nh.raw = skb->data;
        skb->h.raw = skb->nh.raw + (skb->nh.iph->ihl << 2);

        mpls_output_switch(skb, skb->nh.iph->ttl, &cid, 1);
        continue;

    fail:
        TRC("* recv_atm_frame(): failed to process ATM frame\n");
        spin_unlock_irq(&mpls_lock);
        kfree_skb(skb);
    }
}
#endif /* CONFIG_ATM || CONFIG_ATM_MODULE */


/*
 * recv_eth_frame:
 *  This is called as the receive handler for packets of data-link
 *  type `ETH_P_MPLS_UC'.  In the normal case, packets will either be
 *  forwarded with a new label, or delivered to the local IP stack.
 */
static int recv_eth_frame(
    struct sk_buff *skb,
    struct net_device *dev,
    struct packet_type *pt)
{
    int bos, ttl, label;
    u32 shim;
    port_table_entry_t *pte;
    k_cid_t cid;

    TRC("+ recv_eth_frame(), skb=%p, len=%d\n", skb, skb->len);

    /* This function only deals with Ethernet frames. */
    if ( dev->type != ARPHRD_ETHER ) goto free_skb;

    /* Grab the shim data. (label:20, reserved:3, bos:1, ttl:8) */
    shim  = ntohl(*(u32 *)skb->data);
    label = (shim >> 12) & 0xfffff;
    bos   = (shim >> 8) & 1;
    ttl   = shim & 0xff;

    skb_pull(skb, 4);

    if ( bos )
    {
        /* This is only true if the label is bos */
        skb->nh.raw = skb->data;
        skb->h.raw  = skb->nh.raw + (skb->nh.iph->ihl << 2);
    }
#ifndef LABEL_STACKING
    else
    {
        TRC("Multiple labels in stack with no support compiled in! :-/\n");
        goto free_skb;
    }
#endif

    /* Labels 0-15 are reserved (handled below, without the lock). */
    if ( label < 16 ) goto special_label;

    spin_lock_irq(&mpls_lock);

    /* Find the port that this packet arrived on. */
    pte = pte_from_eth_if(skb->mac.ethernet->h_source, dev);
    if ( !pte ) goto no_port;

    /* Pass to the main switch routine. */
    cid.port = pte->pm.id;
    cid.label = label;

    LSTACK_TRC("recv_eth_frame: port %d label %d\n", cid.port, cid.label);
    LSTACK_TRC("recv_eth_frame: skb %x ttl %d bos %d\n",
               (unsigned int)skb, ttl, bos);

    /* mpls_output_switch consumes the skb and releases mpls_lock. */
    mpls_output_switch(skb, ttl, &cid, bos);
    goto finish;


    /* Special label processing -- we only understand IPv4 NULL for now... */
#define LBL_IPV4_EXPLICIT_NULL 0
 special_label:
    TRC("* recv_eth_frame(): special label %d\n", label);

    if ( label == LBL_IPV4_EXPLICIT_NULL && bos ) goto local_deliver;

    TRC("* recv_eth_frame(): unknown special label, or EXPLICIT_NULL was"
        "not at stack bottom\n");
    goto free_skb; /* Terminate with extreme prejudice :) */


    /* Deliver to the local IP stack for further processing. */
 local_deliver:
    TRC("* recv_eth_frame(): ipv4 delivery\n");
    skb->protocol = __constant_htons(ETH_P_IP);
    skb->nh.iph->ttl = ttl;
    skb->nh.iph->check = 0;
    skb->nh.iph->check = /* prob. not worth coding a "faster" solution. */
        ip_fast_csum((unsigned char *)skb->nh.iph, skb->nh.iph->ihl);
    (void)ip_rcv(skb,skb->dev,NULL);
    goto finish;


    /* MPLS frame did not arrive on a known input port. */
 no_port:
    TRC("* recv_eth_frame: packet received on unregistered port!\n");
    spin_unlock_irq(&mpls_lock);
    /* fall through to drop the frame */

 free_skb:
    TRC("* recv_eth_frame(): dropping skb\n");
    kfree_skb(skb);


 finish:
    TRC("- recv_eth_frame()\n");
    return(0); /* The error value is discarded anyway, so... */
}




/******************************************************************************
 **** MISCELLANEOUS SUPPORT FUNCTIONS *****************************************
 */

/*
 * flush_all:
 *  Flush all internal MPLS state.  This includes port, ingress, and switch
 *  mappings.  Ordering matters: ingress mappings first, then switch
 *  mappings, then ports (del_pte asserts a zero port refcount).
 */
static int flush_all(void)
{
    int i;
    bucket_entry_t *entry;

    TRC("+ flush_all: Start\n");
    TRC("* flush_all: flushing ingress mapping\n");
    for ( i = 0; i < INGRESS_HASH_SIZE; i++ )
    {
        ingress_table_entry_t *ite;

        entry = ingress_table[i].next;
        while ( entry != &ingress_table[i] )
        {
            ite = (ingress_table_entry_t *)entry;
            /* Point to next entry *before* releasing memory */
            entry = entry->next;
            del_ite(&ite->im.fec);
        }

    }

    TRC("* flush_all: flushing switch mappings\n");
    for ( i = 0; i < SWITCH_HASH_SIZE; i++ )
    {
        switch_table_entry_t *ste;

        entry = switch_table[i].next;
        while(entry != &switch_table[i])
        {
            ste = (switch_table_entry_t *)entry;
            /* Point to next entry *before* releasing memory */
            entry = entry->next;

            if(!ste)
            {
                printk("flush_all: NULL ste ptr\n");
                continue;
            }

            if ( ste->sm.type == SM_TYPE_NONE )
            {
                continue;
            }
            else if ( ste->sm.type == SM_TYPE_FORWARD )
            {
                cid_put(ste->sm.u.fs.out_cid);
                ste->sm.u.fs.out_cid = NULL; /* paranoia */
            }
            else if ( ste->sm.type == SM_TYPE_EGRESS )
            {
                switch ( ste->sm.u.es.proto )
                {
                case MPLSPROTO_IPV4:
                {
                    TRC("* flush_all: Call dev_put\n");
                    dev_put(ste->sm.u.es.u.ipv4.iif);
                    ste->sm.u.es.u.ipv4.iif = NULL; /* paranoia */
                }
                break;

                default: ASSERT(0);

                /* Add more protocols here... */
                }
            }

            /* SM_TYPE_LABEL_POP entries need no extra teardown and fall
             * straight through to the final put. */
            TRC("* flush_all: Call ste_put ste = %p\n", ste);
            ste->sm.type = SM_TYPE_NONE;
            ste_put(ste);
        }
    }

    TRC("* flush_all: flushing port mappings\n");
    for ( i = 0; i < MAX_PORT; i++ )
    {
        if ( ports[i].pm.type == UNDEFINED_PORT ) continue;
        del_pte(i);
    }

    TRC("- flush_all: Exit\n");
    return(0);
}




/******************************************************************************
 **** NETLINK USER <-> KERNEL INTERFACE ROUTINES ******************************
 */

/* Ack the request with an error and bail out of netlink_rcv_skb. */
#define NETLINK_ERR(err)             \
    do {                             \
        netlink_ack(skb, nlh, err);  \
        TRC("- netlink_rcv_skb(), ERROR=%d\n", err); \
        return;                      \
    } while (0);

/* NOTE(review): this accepts __p == MAX_PORT while flush_all iterates
 * i < MAX_PORT -- verify ports[] really has MAX_PORT+1 entries, otherwise
 * this is an off-by-one. */
#define PORT_OUT_OF_RANGE(__p) (((__p) < 0) || ((__p) > MAX_PORT))

/*
 * netlink_rcv_skb:
 *  Decode and execute one user-space control request (add/delete port,
 *  switch, ingress, egress, and label-pop mappings; flush; debug toggles).
 *  Takes mpls_lock for the duration of the table manipulation.
 */
static __inline__ void netlink_rcv_skb(struct sk_buff *skb)
{
    int result = 0;
    struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;

    TRC("+ netlink_rcv_skb(), skb=%p\n", skb);

    /* Sanity check the netlink message header. */
    if ( (nlh->nlmsg_len < sizeof(*nlh)) ||
         (skb->len < nlh->nlmsg_len) ||
         !(nlh->nlmsg_flags & NLM_F_REQUEST) ||
         (nlh->nlmsg_flags & ~(NLM_F_REQUEST|NLM_F_ACK)) )
    {
        TRC("* netlink_rcv_skb(), flags=%04x\n", nlh->nlmsg_flags);
        NETLINK_ERR(-EINVAL);
    }

    spin_lock_irq(&mpls_lock);

    switch ( nlh->nlmsg_type )
    {

    /*
     * Add SWITCH ENTRY
     */
    case MPLSMSG_ADD_SWITCH_MAPPING:
    {
        switch_mapping_t      sm;
        switch_table_entry_t *in_ste, *out_ste;

        TRC("* netlink_rcv_skb(), ADD SWITCH MAPPING\n");

        memcpy(&sm, NLMSG_DATA(nlh), sizeof(sm));

        /* Validate port identifiers. */
        if ( PORT_OUT_OF_RANGE(sm.in_cid.port) ||
             PORT_OUT_OF_RANGE(sm.out_cid.port) )
        {
            result = -EINVAL;
            break;
        }

        /* Grab the input CID. */

        in_ste = create_ste((k_cid_t *)&sm.in_cid);
        if ( in_ste == NULL )
        {
            result = -ENOBUFS;
            break;
        }

        if ( in_ste->sm.type != SM_TYPE_NONE )
        {
            result = -EADDRINUSE;
            break;
        }

        ste_hold(in_ste);

        /* Map to the output CID.
*/ + + if ( (ports[sm.out_cid.port].pm.type == LOCAL_PORT) || + ((out_ste = create_ste((k_cid_t *)&sm.out_cid)) == NULL) ) + { + ste_put(in_ste); + result = -ENOBUFS; + break; + } + + in_ste->sm.type = SM_TYPE_FORWARD; + in_ste->sm.u.fs.out_cid = &out_ste->sm.in_cid; + in_ste->sm.u.fs.out_tc_index = sm.out_tc_index; + cid_hold(in_ste->sm.u.fs.out_cid); + } + break; + + + /* + * Delete SWITCH ENTRY + */ + case MPLSMSG_DEL_SWITCH_MAPPING: + { + cid_t cid; + switch_table_entry_t *ste; + + TRC("* netlink_rcv_skb(), DEL SWITCH MAPPING\n"); + + memcpy(&cid, NLMSG_DATA(nlh), sizeof(cid)); + + if ( PORT_OUT_OF_RANGE(cid.port) ) + { + result = -EINVAL; + break; + } + + if ( !(ste = ste_from_cid((k_cid_t *)&cid)) || + (ste->sm.type != SM_TYPE_FORWARD) ) + { + result = -EINVAL; + break; + } + cid_put(ste->sm.u.fs.out_cid); + ste->sm.type = SM_TYPE_NONE; + ste->sm.u.fs.out_cid = NULL; /* paranoia */ + ste_put(ste); + } + break; + +#ifdef LABEL_STACKING + case MPLSMSG_ADD_LABEL_POP: + { + cid_t cid; + switch_table_entry_t *ste; + + TRC("* netlink_rcv_skb: ADD LABEL POP\n"); + + memcpy(&cid, NLMSG_DATA(nlh), sizeof(cid)); + + /* Validate port identifiers. */ + if ( PORT_OUT_OF_RANGE(cid.port) ) + { + result = -EINVAL; + break; + } + + /* Grab the input CID. 
*/ + + ste = create_ste((k_cid_t *)&cid); + if ( ste == NULL ) + { + result = -ENOBUFS; + break; + } + + if ( ste->sm.type != SM_TYPE_NONE ) + { + result = -EADDRINUSE; + break; + } + + ste_hold(ste); + + ste->sm.type = SM_TYPE_LABEL_POP; + ste->sm.in_cid.num_pops = cid.num_pops; + + TRC("* netlink_rcv_skb: port=%d, label=%d, num_pops=%d\n", + ste->sm.in_cid.port, ste->sm.in_cid.label, + ste->sm.in_cid.num_pops); + + } + break; + + case MPLSMSG_DEL_LABEL_POP: + { + cid_t cid; + switch_table_entry_t *ste; + + TRC("* netlink_rcv_skb(), DEL LABEL POP\n"); + + memcpy(&cid, NLMSG_DATA(nlh), sizeof(cid)); + + if ( PORT_OUT_OF_RANGE(cid.port) ) + { + result = -EINVAL; + break; + } + + if ( !(ste = ste_from_cid((k_cid_t *)&cid)) || + (ste->sm.type != SM_TYPE_LABEL_POP) ) + { + result = -EINVAL; + break; + } + ste->sm.type = SM_TYPE_NONE; + ste_put(ste); + } + break; +#endif /* LABEL_STACKING */ + + /* + * Add PORT + */ + case MPLSMSG_ADD_PORT_MAPPING: + { + port_mapping_t pm; + port_table_entry_t pte; + TRC("* netlink_rcv_skb(), ADD PORT MAPPING\n"); + memset(&pte.pm, 0, sizeof(k_port_mapping_t)); + memcpy(&pm, NLMSG_DATA(nlh), sizeof(pm)); + + pte.pm.id = pm.id; + pte.pm.type = pm.type; + + if ( PORT_OUT_OF_RANGE(pm.id) ) + { + result = -EINVAL; + break; + } + + switch ( pm.type ) + { + case ETH_PORT: + { + memcpy(pte.pm.u.eth.r_addr, pm.u.eth.r_addr, ETH_ALEN); + pm.u.eth.l_ifname[IFNAMSIZ-1] = '\0'; + pte.pm.u.eth.l_ifdev = dev_get_by_name(pm.u.eth.l_ifname); + if ( !pte.pm.u.eth.l_ifdev ) result = -ENODEV; + } + break; + + case ATM_PORT: + { +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) + pte.pm.u.atm.l_ifindex = pm.u.atm.l_ifindex; + if ( !atm_find_dev(pm.u.atm.l_ifindex) ) result = -ENODEV; +#else + result = -ENODEV; +#endif + } + break; + + case LOCAL_PORT: + { + } + break; + + default: + { + result = -EINVAL; + } + break; + } + + if ( !result ) result = add_pte(&pte); + } + break; + + + /* + * Delete PORT + */ + case MPLSMSG_DEL_PORT_MAPPING: + { + int id 
= *(int *)NLMSG_DATA(nlh); + TRC("* netlink_rcv_skb(), DEL PORT MAPPING\n"); + + if ( PORT_OUT_OF_RANGE(id) || (ports[id].pm.type == UNDEFINED_PORT) ) + { + result = -EINVAL; + break; + } + + if ( atomic_read(&ports[id].pm.refcnt) != 0 ) + { + result = -EADDRINUSE; + break; + } + + del_pte(id); + } + break; + + + /* + * Add INGRESS MAPPING + */ + case MPLSMSG_ADD_INGRESS_MAPPING: + { + ingress_mapping_t im; + ingress_table_entry_t *ite; + switch_table_entry_t *ste; + + TRC("* netlink_rcv_skb(), ADD INGRESS MAPPING\n"); + + memcpy(&im, NLMSG_DATA(nlh), sizeof(im)); + + if ( PORT_OUT_OF_RANGE(im.in_cid.port) ) + { + result = -EINVAL; + break; + } + + if ( !(ite = kmem_cache_alloc(mpls_cachep, GFP_KERNEL)) ) + { + result = -ENOBUFS; + break; + } + ite->im.fec = im.fec; + + ste = create_ste((k_cid_t *)&im.in_cid); + if ( !ste ) + { + kmem_cache_free(mpls_cachep, ite); + result = -ENOBUFS; + break; + } + ite->im.in_cid = &ste->sm.in_cid; + + if ( (result = add_ite(ite)) ) + { + kmem_cache_free(mpls_cachep, ite); + break; + } + + cid_hold(ite->im.in_cid); + } + break; + + + /* + * Delete INGRESS MAPPING + */ + case MPLSMSG_DEL_INGRESS_MAPPING: + { + fec_t fec; + TRC("* netlink_rcv_skb(), DEL INGRESS MAPPING\n"); + memcpy(&fec, NLMSG_DATA(nlh), sizeof(fec)); + result = del_ite(&fec); + } + break; + + + /* + * Add EGRESS MAPPING + */ + case MPLSMSG_ADD_EGRESS_MAPPING: + { + egress_mapping_t em; + switch_table_entry_t *ste; + TRC("* netlink_rcv_skb(), ADD EGRESS MAPPING\n"); + memcpy(&em, NLMSG_DATA(nlh), sizeof(em)); + + if ( PORT_OUT_OF_RANGE(em.in_cid.port) ) + { + result = -EINVAL; + break; + } + + /* Grab the input CID. 
*/ + + ste = create_ste((k_cid_t *)&em.in_cid); + if ( ste == NULL ) + { + result = -ENOBUFS; + break; + } + + if ( ste->sm.type != SM_TYPE_NONE ) + { + result = -EADDRINUSE; + break; + } + + ste_hold(ste); + + switch ( em.egress.proto ) + { + case MPLSPROTO_IPV4: + { + TRC("* netlink_rcv_skb(): ifname %s\n", em.egress.u.ipv4.ifname); + ste->sm.u.es.u.ipv4.iif = dev_get_by_name(em.egress.u.ipv4.ifname); + if ( !ste->sm.u.es.u.ipv4.iif ) + { + result = -ENODEV; + ste_put(ste); + } + } + /* Moved this as it broke assertion in ste_put */ + ste->sm.type = SM_TYPE_EGRESS; + break; + + default: + { + result = -EINVAL; + ste_put(ste); + } + + /* Add more protocols here... */ + } + } + break; + + + /* + * Delete EGRESS MAPPING + */ + case MPLSMSG_DEL_EGRESS_MAPPING: + { + cid_t cid; + switch_table_entry_t *ste; + + TRC("* netlink_rcv_skb(), DEL EGRESS MAPPING\n"); + + memcpy(&cid, NLMSG_DATA(nlh), sizeof(cid)); + + if ( PORT_OUT_OF_RANGE(cid.port) ) + { + result = -EINVAL; + break; + } + + if ( !(ste = ste_from_cid((k_cid_t *)&cid)) || + (ste->sm.type != SM_TYPE_EGRESS) ) + { + result = -EINVAL; + break; + } + + switch ( ste->sm.u.es.proto ) + { + case MPLSPROTO_IPV4: + { + dev_put(ste->sm.u.es.u.ipv4.iif); + ste->sm.u.es.u.ipv4.iif = NULL; /* paranoia */ + } + break; + + default: ASSERT(0); + + /* Add more protocols here... 
*/ + } + + ste->sm.type = SM_TYPE_NONE; + ste_put(ste); + } + break; + + + /* + * FLUSH + */ + case MPLSMSG_FLUSH_ALL: + { + TRC("* netlink_rcv_skb(), FLUSH ALL\n"); + result = flush_all(); + } + break; + + + /* + * DEBUG + */ + case MPLSMSG_DEBUG_ON: + case MPLSMSG_DEBUG_OFF: + { + TRC("* netlink_rcv_skb(), DEBUG ON/OFF\n"); + mpls_debug = (nlh->nlmsg_type == MPLSMSG_DEBUG_ON); + result = 0; + } + break; + + + default: + TRC("* netlink_rcv_skb(), UNKNOWN CMD\n"); + result = -EINVAL; + break; + } + + spin_unlock_irq(&mpls_lock); + + if ( result ) NETLINK_ERR(result); + + if ( nlh->nlmsg_flags & NLM_F_ACK ) netlink_ack(skb, nlh, 0); + + TRC("- netlink_rcv_skb(), OKAY\n"); +} + +static void netlink_rcv_sk(struct sock *sk, int len) +{ + struct sk_buff *skb; + + while ( (skb = skb_dequeue(&sk->receive_queue)) ) + { + netlink_rcv_skb(skb); + kfree_skb(skb); + } +} + + + + +/****************************************************************************** + **** PROC FILESYSTEM INFORMATION FUNCTIONS *********************************** + */ + +#ifdef CONFIG_PROC_FS + +#define MAX_LINE 128 /* Must be an overestimate, or badness _will_ happen! */ +/* + * (a) we sprintf into `buffer', resetting our pointer whenever the next line + * _could_ overflow the buffer. + * (a) when we first detect a line that _could_ be in the range + * [offset, offset+length], we reset our print buffer pointer. + * (c) the first buffer overflow (see (a)) after we come into range (see (b)) + * causes us to jump out of the print loop -- we've printed all we can! + */ +#define PROC_PRINT(__f, __a...) 
\ +do { \ + if ( (totsize <= offset && (totsize+MAX_LINE) > offset) ) size = 0; \ + if ( (size + MAX_LINE) > length ) \ + { \ + if ( totsize >= offset ) goto out; \ + size = 0; \ + } \ + j = sprintf(buffer + size, __f , ## __a); \ + size += j; totsize += j; \ +} while(0) + +static int mpls_get_info(char *buffer, char **start, off_t offset, int length) +{ + int i, i1, j, size = 0, totsize = 0; + ingress_table_entry_t *ite; + switch_table_entry_t *ste; + + TRC("+ mpls_get_info(), buf=%p, off=%ld, len=%d\n",buffer,offset,length); + + spin_lock_irq(&mpls_lock); + + PROC_PRINT("PORT TABLE (Local, Ethernet, ATM)\n" + "port -> (local port)\n"); + for ( i = 0; i < MAX_PORT; i++ ) + { + if ( ports[i].pm.type != LOCAL_PORT ) continue; + + PROC_PRINT("%4lu\n", (unsigned long)i); + } + + PROC_PRINT("port -> l_ifname remote_address\n"); + for ( i = 0; i < MAX_PORT; i++ ) + { + if ( ports[i].pm.type != ETH_PORT ) continue; + + PROC_PRINT("%4lu -> %-8s %02x:%02x:%02x:%02x:%02x:%02x\n", + (unsigned long)i, ports[i].pm.u.eth.l_ifdev->name, + (u8)ports[i].pm.u.eth.r_addr[0], + (u8)ports[i].pm.u.eth.r_addr[1], + (u8)ports[i].pm.u.eth.r_addr[2], + (u8)ports[i].pm.u.eth.r_addr[3], + (u8)ports[i].pm.u.eth.r_addr[4], + (u8)ports[i].pm.u.eth.r_addr[5]); + } + +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) + PROC_PRINT("\nport -> atm_index\n"); + for ( i = 0; i < MAX_PORT; i++ ) + { + if ( ports[i].pm.type != ATM_PORT ) continue; + + PROC_PRINT("%4lu -> %9d\n", (unsigned long)i, + ports[i].pm.u.atm.l_ifindex); + } +#endif /* CONFIG_ATM || CONFIG_ATM_MODULE */ + +#ifdef LABEL_STACKING + PROC_PRINT("\nFORWARDING TABLE\nport label -> port label tc_index\n"); + for ( i = 0; i < SWITCH_HASH_SIZE; i++ ) + { + for_each_in_bucket(ste, &switch_table[i]) + { + /* We have broken the nice output format by adding the stack */ + if ( ste->sm.type == SM_TYPE_FORWARD ) + { + TRC("* mpls_get_info: Num labels %d\n", + ste->sm.u.fs.out_cid->num_labels + 1); + PROC_PRINT("%4lu %5lu -> %4lu %5lu", + 
(unsigned long)ste->sm.in_cid.port, + (unsigned long)ste->sm.in_cid.label, + (unsigned long)ste->sm.u.fs.out_cid->port, + (unsigned long)ste->sm.u.fs.out_cid->label); + + for ( i1 = 0; i1 < ste->sm.u.fs.out_cid->num_labels; i1++ ) + { + PROC_PRINT(":%lu", + (unsigned long)ste->sm.u.fs.out_cid-> + label_stack[i1]); + } + PROC_PRINT(" %8lu\n", + (unsigned long)ste->sm.u.fs.out_tc_index); + } + else if ( ste->sm.type == SM_TYPE_LABEL_POP ) + { + /* This bit does the 'Label Pop' labels */ + PROC_PRINT("%4lu %5lu Pop %d Labels\n", + (unsigned long)ste->sm.in_cid.port, + (unsigned long)ste->sm.in_cid.label, + ste->sm.in_cid.num_pops); + } + } + } +#else + PROC_PRINT("\nFORWARDING TABLE\nport label -> port label tc_index\n"); + for ( i = 0; i < SWITCH_HASH_SIZE; i++ ) + { + for_each_in_bucket(ste, &switch_table[i]) + { + if ( ste->sm.type != SM_TYPE_FORWARD ) continue; + + PROC_PRINT("%4lu %5lu -> %4lu %5lu %8lu\n", + (unsigned long)ste->sm.in_cid.port, + (unsigned long)ste->sm.in_cid.label, + (unsigned long)ste->sm.u.fs.out_cid->port, + (unsigned long)ste->sm.u.fs.out_cid->label, + (unsigned long)ste->sm.u.fs.out_tc_index); + } + } +#endif /* LABEL_STACKING */ + + PROC_PRINT("\nINGRESS TABLE (IPv4)\nsrc_realm dst_realm -> port label\n"); + for ( i = 0; i < INGRESS_HASH_SIZE; i++ ) + { + for_each_in_bucket(ite, &ingress_table[i]) + { + if ( ite->im.fec.proto != MPLSPROTO_IPV4 ) continue; + + PROC_PRINT("%9lu %9lu -> %4lu %5lu\n", + (unsigned long)ite->im.fec.u.ipv4.tclassid >> 16, + (unsigned long)ite->im.fec.u.ipv4.tclassid & 0xffff, + (unsigned long)ite->im.in_cid->port, + (unsigned long)ite->im.in_cid->label); + } + } + + PROC_PRINT("\nEGRESS TABLE (IPv4)\nport label -> l_ifname\n"); + for ( i = 0; i < SWITCH_HASH_SIZE; i++ ) + { + for_each_in_bucket(ste, &switch_table[i]) + { + if ( ste->sm.type != SM_TYPE_EGRESS ) continue; + + PROC_PRINT("%4lu %5lu -> %-8s\n", + (unsigned long)ste->sm.in_cid.port, + (unsigned long)ste->sm.in_cid.label, + 
ste->sm.u.es.u.ipv4.iif->name); + } + } + + out: + spin_unlock_irq(&mpls_lock); + length = (totsize < offset) ? 0 : (totsize - offset); + *start = buffer + size - length; + TRC("- mpls_get_info(), start=%p, size=%d, len=%d\n", *start,size,length); + return(length); +} + +#endif diff -urBP kernel-orig/net/netsyms.c kernel/net/netsyms.c --- kernel-orig/net/netsyms.c Mon Sep 4 13:11:36 2000 +++ kernel/net/netsyms.c Mon Sep 4 11:23:43 2000 @@ -230,6 +230,7 @@ EXPORT_SYMBOL(icmp_reply); EXPORT_SYMBOL(ip_options_compile); EXPORT_SYMBOL(ip_options_undo); +EXPORT_SYMBOL(ip_options_fragment); EXPORT_SYMBOL(arp_send); EXPORT_SYMBOL(arp_broken_ops); EXPORT_SYMBOL(__ip_select_ident); diff -urBP kernel-orig/net/protocols.c kernel/net/protocols.c --- kernel-orig/net/protocols.c Tue Aug 22 14:34:47 2000 +++ kernel/net/protocols.c Tue Aug 22 14:33:21 2000 @@ -15,6 +15,10 @@ #include #endif +#ifdef CONFIG_MPLS +extern void mpls_proto_init(struct net_proto *); +#endif + #ifdef CONFIG_INET #include #ifdef CONFIG_IPV6 @@ -120,6 +124,10 @@ #ifdef CONFIG_UNIX { "UNIX", unix_proto_init }, /* Unix domain socket family */ +#endif + +#ifdef CONFIG_MPLS + { "MPLS", mpls_proto_init }, /* MPLS */ #endif #ifdef NEED_802 diff -urBP kernel-orig/net/sched/cls_tcindex.c kernel/net/sched/cls_tcindex.c --- kernel-orig/net/sched/cls_tcindex.c Tue Aug 22 14:34:46 2000 +++ kernel/net/sched/cls_tcindex.c Tue Aug 22 14:34:01 2000 @@ -170,14 +170,16 @@ int i; struct tcindex_filter **walk = NULL; - for (i = 0; !f && i < p->hash; i++) { - for (walk = p->h+i; !f && *walk; walk = &(*walk)->next) { - if (&(*walk)->result == r) + for (i = 0; i < p->hash; i++) { + for (walk = p->h+i; *walk; walk = &(*walk)->next) { + if (&(*walk)->result == r) { f = *walk; + goto found; + } } } - if (!f) return -ENOENT; + found: /* @@@ OK? -- No (jhs) Look more into it