blob: ac4d6b3fa9c9943ed5c74f3c264630ec0ff2ce83 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Joe Perchesafd465032012-03-12 07:03:32 +000065#define pr_fmt(fmt) "IPv4: " fmt
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070069#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070072#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070073#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070083#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090094#include <linux/slab.h>
Stephen Rothwellb9eda062011-12-22 17:03:29 +110095#include <linux/prefetch.h>
Herbert Xu352e5122007-11-13 21:34:06 -080096#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020097#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070098#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700108#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700109#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
112#endif
David S. Miller6e5714e2011-08-03 20:50:44 -0700113#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700114
/* Mask a flowi4 key's TOS field down to the IPTOS_RT_MASK bits plus RTO_ONLINK. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* NOTE(review): presumably an upper bound applied to route MTUs - confirm at the use sites. */
#define IP_MAX_MTU	0xFFF0

/* Initial value of ip_rt_gc_timeout, expressed as a multiple of HZ. */
#define RT_GC_TIMEOUT	(300*HZ)
121
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700123static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500124static int ip_rt_gc_interval __read_mostly = 60 * HZ;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700125static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
126static int ip_rt_redirect_number __read_mostly = 9;
127static int ip_rt_redirect_load __read_mostly = HZ / 50;
128static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
129static int ip_rt_error_cost __read_mostly = HZ;
130static int ip_rt_error_burst __read_mostly = 5 * HZ;
131static int ip_rt_gc_elasticity __read_mostly = 8;
132static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
133static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
134static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700135static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500137static struct delayed_work expires_work;
138static unsigned long expires_ljiffies;
139
Linus Torvalds1da177e2005-04-16 15:20:36 -0700140/*
141 * Interface to generic destination cache.
142 */
143
144static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800145static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
Steffen Klassertebb762f2011-11-23 02:12:51 +0000146static unsigned int ipv4_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149static void ipv4_link_failure(struct sk_buff *skb);
150static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800151static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152
/*
 * .ifdown callback of ipv4_dst_ops.  The body is empty: no per-route work
 * is performed here.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	/* nothing to do */
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700157
David S. Miller62fa8a82011-01-26 20:51:05 -0800158static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159{
David S. Miller06582542011-01-27 14:58:42 -0800160 struct rtable *rt = (struct rtable *) dst;
161 struct inet_peer *peer;
162 u32 *p = NULL;
David S. Miller62fa8a82011-01-26 20:51:05 -0800163
David S. Miller06582542011-01-27 14:58:42 -0800164 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -0400165 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller06582542011-01-27 14:58:42 -0800166
167 peer = rt->peer;
168 if (peer) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800169 u32 *old_p = __DST_METRICS_PTR(old);
170 unsigned long prev, new;
171
David S. Miller06582542011-01-27 14:58:42 -0800172 p = peer->metrics;
173 if (inet_metrics_new(peer))
174 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
David S. Miller62fa8a82011-01-26 20:51:05 -0800175
176 new = (unsigned long) p;
177 prev = cmpxchg(&dst->_metrics, old, new);
178
179 if (prev != old) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800180 p = __DST_METRICS_PTR(prev);
181 if (prev & DST_METRICS_READ_ONLY)
182 p = NULL;
183 } else {
David S. Miller62fa8a82011-01-26 20:51:05 -0800184 if (rt->fi) {
185 fib_info_put(rt->fi);
186 rt->fi = NULL;
187 }
188 }
189 }
190 return p;
191}
192
David S. Millerd3aaeb32011-07-18 00:40:17 -0700193static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195static struct dst_ops ipv4_dst_ops = {
196 .family = AF_INET,
Harvey Harrison09640e632009-02-01 00:45:17 -0800197 .protocol = cpu_to_be16(ETH_P_IP),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700198 .gc = rt_garbage_collect,
199 .check = ipv4_dst_check,
David S. Miller0dbaee32010-12-13 12:52:14 -0800200 .default_advmss = ipv4_default_advmss,
Steffen Klassertebb762f2011-11-23 02:12:51 +0000201 .mtu = ipv4_mtu,
David S. Miller62fa8a82011-01-26 20:51:05 -0800202 .cow_metrics = ipv4_cow_metrics,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700203 .destroy = ipv4_dst_destroy,
204 .ifdown = ipv4_dst_ifdown,
205 .negative_advice = ipv4_negative_advice,
206 .link_failure = ipv4_link_failure,
207 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu1ac06e02008-05-20 14:32:14 -0700208 .local_out = __ip_local_out,
David S. Millerd3aaeb32011-07-18 00:40:17 -0700209 .neigh_lookup = ipv4_neigh_lookup,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700210};
211
212#define ECN_OR_COST(class) TC_PRIO_##class
213
Philippe De Muyter4839c522007-07-09 15:32:57 -0700214const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700215 TC_PRIO_BESTEFFORT,
Dan Siemon4a2b9c32011-03-15 13:56:07 +0000216 ECN_OR_COST(BESTEFFORT),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700217 TC_PRIO_BESTEFFORT,
218 ECN_OR_COST(BESTEFFORT),
219 TC_PRIO_BULK,
220 ECN_OR_COST(BULK),
221 TC_PRIO_BULK,
222 ECN_OR_COST(BULK),
223 TC_PRIO_INTERACTIVE,
224 ECN_OR_COST(INTERACTIVE),
225 TC_PRIO_INTERACTIVE,
226 ECN_OR_COST(INTERACTIVE),
227 TC_PRIO_INTERACTIVE_BULK,
228 ECN_OR_COST(INTERACTIVE_BULK),
229 TC_PRIO_INTERACTIVE_BULK,
230 ECN_OR_COST(INTERACTIVE_BULK)
231};
Amir Vadaid4a96862012-04-04 21:33:28 +0000232EXPORT_SYMBOL(ip_tos2prio);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700233
234/*
235 * Route cache.
236 */
237
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
247
248struct rt_hash_bucket {
Eric Dumazet1c317202010-10-25 21:02:07 +0000249 struct rtable __rcu *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700250};
Neil Horman1080d702008-10-27 12:28:25 -0700251
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs (with lockdep spinlock_t is quite big, so the size is kept
 * down there).
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock covering hash slot @slot.  The expansion is
 * fully parenthesised so that it stays a single operand when the macro is
 * used inside a larger expression (e.g. followed by "->").
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/* Allocate and initialise the shared spinlock table; panics on allocation failure. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: the lock address is NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700297
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700298static struct rt_hash_bucket *rt_hash_table __read_mostly;
Eric Dumazet95c96172012-04-15 05:58:06 +0000299static unsigned int rt_hash_mask __read_mostly;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700300static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700301
Eric Dumazet2f970d82006-01-17 02:54:36 -0800302static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c73e2010-05-19 22:07:23 +0000303#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700304
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700305static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700306 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307{
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700308 return jhash_3words((__force u32)daddr, (__force u32)saddr,
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700309 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800310 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700311}
312
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700313static inline int rt_genid(struct net *net)
314{
315 return atomic_read(&net->ipv4.rt_genid);
316}
317
Linus Torvalds1da177e2005-04-16 15:20:36 -0700318#ifdef CONFIG_PROC_FS
319struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800320 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800322 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700323};
324
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900325static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700326{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900327 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700328 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329
330 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
Eric Dumazet33d480c2011-08-11 19:30:52 +0000331 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
Eric Dumazeta6272662008-08-28 01:11:25 -0700332 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700333 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -0800334 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Eric Dumazet29e75252008-01-31 17:05:09 -0800335 while (r) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700336 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800337 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800338 return r;
Changli Gaod8d1f302010-06-10 23:31:35 -0700339 r = rcu_dereference_bh(r->dst.rt_next);
Eric Dumazet29e75252008-01-31 17:05:09 -0800340 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700341 rcu_read_unlock_bh();
342 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800343 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344}
345
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900346static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800347 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900349 struct rt_cache_iter_state *st = seq->private;
Eric Dumazeta6272662008-08-28 01:11:25 -0700350
Eric Dumazet1c317202010-10-25 21:02:07 +0000351 r = rcu_dereference_bh(r->dst.rt_next);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352 while (!r) {
353 rcu_read_unlock_bh();
Eric Dumazeta6272662008-08-28 01:11:25 -0700354 do {
355 if (--st->bucket < 0)
356 return NULL;
Eric Dumazet33d480c2011-08-11 19:30:52 +0000357 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358 rcu_read_lock_bh();
Eric Dumazet1c317202010-10-25 21:02:07 +0000359 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700360 }
Eric Dumazet1c317202010-10-25 21:02:07 +0000361 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700362}
363
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900364static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800365 struct rtable *r)
366{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900367 struct rt_cache_iter_state *st = seq->private;
368 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700369 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800370 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800371 if (r->rt_genid == st->genid)
372 break;
373 }
374 return r;
375}
376
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900377static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700378{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900379 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380
381 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900382 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383 --pos;
384 return pos ? NULL : r;
385}
386
387static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
388{
Eric Dumazet29e75252008-01-31 17:05:09 -0800389 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800390 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900391 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700392 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800393 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700394}
395
396static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397{
Eric Dumazet29e75252008-01-31 17:05:09 -0800398 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700399
400 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900401 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700402 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900403 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700404 ++*pos;
405 return r;
406}
407
408static void rt_cache_seq_stop(struct seq_file *seq, void *v)
409{
410 if (v && v != SEQ_START_TOKEN)
411 rcu_read_unlock_bh();
412}
413
414static int rt_cache_seq_show(struct seq_file *seq, void *v)
415{
416 if (v == SEQ_START_TOKEN)
417 seq_printf(seq, "%-127s\n",
418 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
420 "HHUptod\tSpecDst");
421 else {
422 struct rtable *r = v;
David S. Miller69cce1d2011-07-17 23:09:49 -0700423 struct neighbour *n;
Eric Dumazet218fa902011-11-29 20:05:55 +0000424 int len, HHUptod;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700425
Eric Dumazet218fa902011-11-29 20:05:55 +0000426 rcu_read_lock();
David Miller27217452011-12-02 16:52:08 +0000427 n = dst_get_neighbour_noref(&r->dst);
Eric Dumazet218fa902011-11-29 20:05:55 +0000428 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
429 rcu_read_unlock();
430
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700431 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
Changli Gaod8d1f302010-06-10 23:31:35 -0700433 r->dst.dev ? r->dst.dev->name : "*",
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700434 (__force u32)r->rt_dst,
435 (__force u32)r->rt_gateway,
Changli Gaod8d1f302010-06-10 23:31:35 -0700436 r->rt_flags, atomic_read(&r->dst.__refcnt),
437 r->dst.__use, 0, (__force u32)r->rt_src,
David S. Miller0dbaee32010-12-13 12:52:14 -0800438 dst_metric_advmss(&r->dst) + 40,
Changli Gaod8d1f302010-06-10 23:31:35 -0700439 dst_metric(&r->dst, RTAX_WINDOW),
440 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441 dst_metric(&r->dst, RTAX_RTTVAR)),
David S. Miller475949d2011-05-03 19:45:15 -0700442 r->rt_key_tos,
David S. Millerf6b72b622011-07-14 07:53:20 -0700443 -1,
Eric Dumazet218fa902011-11-29 20:05:55 +0000444 HHUptod,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700445 r->rt_spec_dst, &len);
446
447 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900448 }
449 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450}
451
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700452static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700453 .start = rt_cache_seq_start,
454 .next = rt_cache_seq_next,
455 .stop = rt_cache_seq_stop,
456 .show = rt_cache_seq_show,
457};
458
459static int rt_cache_seq_open(struct inode *inode, struct file *file)
460{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800461 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700462 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700463}
464
Arjan van de Ven9a321442007-02-12 00:55:35 -0800465static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700466 .owner = THIS_MODULE,
467 .open = rt_cache_seq_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800470 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700471};
472
473
474static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
475{
476 int cpu;
477
478 if (*pos == 0)
479 return SEQ_START_TOKEN;
480
Rusty Russell0f23174a2008-12-29 12:23:42 +0000481 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700482 if (!cpu_possible(cpu))
483 continue;
484 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800485 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486 }
487 return NULL;
488}
489
490static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491{
492 int cpu;
493
Rusty Russell0f23174a2008-12-29 12:23:42 +0000494 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700495 if (!cpu_possible(cpu))
496 continue;
497 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800498 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499 }
500 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900501
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502}
503
/* seq_file .stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* empty on purpose: .start/.next take no locks in this iterator */
}
508
509static int rt_cpu_seq_show(struct seq_file *seq, void *v)
510{
511 struct rt_cache_stat *st = v;
512
513 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700514 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700515 return 0;
516 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900517
Linus Torvalds1da177e2005-04-16 15:20:36 -0700518 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
519 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
Eric Dumazetfc66f952010-10-08 06:37:34 +0000520 dst_entries_get_slow(&ipv4_dst_ops),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700521 st->in_hit,
522 st->in_slow_tot,
523 st->in_slow_mc,
524 st->in_no_route,
525 st->in_brd,
526 st->in_martian_dst,
527 st->in_martian_src,
528
529 st->out_hit,
530 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900531 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700532
533 st->gc_total,
534 st->gc_ignored,
535 st->gc_goal_miss,
536 st->gc_dst_overflow,
537 st->in_hlist_search,
538 st->out_hlist_search
539 );
540 return 0;
541}
542
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700543static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700544 .start = rt_cpu_seq_start,
545 .next = rt_cpu_seq_next,
546 .stop = rt_cpu_seq_stop,
547 .show = rt_cpu_seq_show,
548};
549
550
551static int rt_cpu_seq_open(struct inode *inode, struct file *file)
552{
553 return seq_open(file, &rt_cpu_seq_ops);
554}
555
Arjan van de Ven9a321442007-02-12 00:55:35 -0800556static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700557 .owner = THIS_MODULE,
558 .open = rt_cpu_seq_open,
559 .read = seq_read,
560 .llseek = seq_lseek,
561 .release = seq_release,
562};
563
Patrick McHardyc7066f72011-01-14 13:36:42 +0100564#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800565static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800566{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800567 struct ip_rt_acct *dst, *src;
568 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800569
Alexey Dobriyana661c412009-11-25 15:40:35 -0800570 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
571 if (!dst)
572 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800573
Alexey Dobriyana661c412009-11-25 15:40:35 -0800574 for_each_possible_cpu(i) {
575 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576 for (j = 0; j < 256; j++) {
577 dst[j].o_bytes += src[j].o_bytes;
578 dst[j].o_packets += src[j].o_packets;
579 dst[j].i_bytes += src[j].i_bytes;
580 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800581 }
582 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800583
584 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
585 kfree(dst);
586 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800587}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800588
589static int rt_acct_proc_open(struct inode *inode, struct file *file)
590{
591 return single_open(file, rt_acct_proc_show, NULL);
592}
593
594static const struct file_operations rt_acct_proc_fops = {
595 .owner = THIS_MODULE,
596 .open = rt_acct_proc_open,
597 .read = seq_read,
598 .llseek = seq_lseek,
599 .release = single_release,
600};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800601#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800602
Denis V. Lunev73b38712008-02-28 20:51:18 -0800603static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800604{
605 struct proc_dir_entry *pde;
606
607 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
608 &rt_cache_seq_fops);
609 if (!pde)
610 goto err1;
611
Wang Chen77020722008-02-28 14:14:25 -0800612 pde = proc_create("rt_cache", S_IRUGO,
613 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800614 if (!pde)
615 goto err2;
616
Patrick McHardyc7066f72011-01-14 13:36:42 +0100617#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800618 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800619 if (!pde)
620 goto err3;
621#endif
622 return 0;
623
Patrick McHardyc7066f72011-01-14 13:36:42 +0100624#ifdef CONFIG_IP_ROUTE_CLASSID
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800625err3:
626 remove_proc_entry("rt_cache", net->proc_net_stat);
627#endif
628err2:
629 remove_proc_entry("rt_cache", net->proc_net);
630err1:
631 return -ENOMEM;
632}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800633
634static void __net_exit ip_rt_do_proc_exit(struct net *net)
635{
636 remove_proc_entry("rt_cache", net->proc_net_stat);
637 remove_proc_entry("rt_cache", net->proc_net);
Patrick McHardyc7066f72011-01-14 13:36:42 +0100638#ifdef CONFIG_IP_ROUTE_CLASSID
Denis V. Lunev73b38712008-02-28 20:51:18 -0800639 remove_proc_entry("rt_acct", net->proc_net);
Alexey Dobriyan0a931ac2010-01-17 03:32:50 +0000640#endif
Denis V. Lunev73b38712008-02-28 20:51:18 -0800641}
642
643static struct pernet_operations ip_rt_proc_ops __net_initdata = {
644 .init = ip_rt_do_proc_init,
645 .exit = ip_rt_do_proc_exit,
646};
647
648static int __init ip_rt_proc_init(void)
649{
650 return register_pernet_subsys(&ip_rt_proc_ops);
651}
652
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800653#else
/* Without CONFIG_PROC_FS there is nothing to register: always succeeds. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700658#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900659
Stephen Hemminger5969f712008-04-10 01:52:09 -0700660static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700661{
Changli Gaod8d1f302010-06-10 23:31:35 -0700662 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700663}
664
/* Drop a reference with ip_rt_put(), then queue @rt for freeing exactly as rt_free() does. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
670
Stephen Hemminger5969f712008-04-10 01:52:09 -0700671static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700672{
673 /* Kill broadcast/multicast entries very aggresively, if they
674 collide in hash table with more useful entries */
675 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
David S. Millerc7537962010-11-11 17:07:48 -0800676 rt_is_input_route(rth) && rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700677}
678
Stephen Hemminger5969f712008-04-10 01:52:09 -0700679static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680{
681 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
David S. Miller2c8cec52011-02-09 20:42:07 -0800682 (rth->peer && rth->peer->pmtu_expires);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700683}
684
685static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686{
687 unsigned long age;
688 int ret = 0;
689
Changli Gaod8d1f302010-06-10 23:31:35 -0700690 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700691 goto out;
692
Changli Gaod8d1f302010-06-10 23:31:35 -0700693 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700694 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695 (age <= tmo2 && rt_valuable(rth)))
696 goto out;
697 ret = 1;
698out: return ret;
699}
700
701/* Bits of score are:
702 * 31: very valuable
703 * 30: not quite useless
704 * 29..0: usage counter
705 */
706static inline u32 rt_score(struct rtable *rt)
707{
Changli Gaod8d1f302010-06-10 23:31:35 -0700708 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700709
710 score = ~score & ~(3<<30);
711
712 if (rt_valuable(rt))
713 score |= (1<<31);
714
David S. Millerc7537962010-11-11 17:07:48 -0800715 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700716 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717 score |= (1<<30);
718
719 return score;
720}
721
Neil Horman1080d702008-10-27 12:28:25 -0700722static inline bool rt_caching(const struct net *net)
723{
724 return net->ipv4.current_rt_cache_rebuild_count <=
725 net->ipv4.sysctl_rt_cache_rebuild_count;
726}
727
David S. Miller5e2b61f2011-03-04 21:47:09 -0800728static inline bool compare_hash_inputs(const struct rtable *rt1,
729 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700730{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800731 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000733 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700734}
735
David S. Miller5e2b61f2011-03-04 21:47:09 -0800736static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700737{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800738 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700741 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
Julian Anastasovd547f722011-08-07 22:20:20 -0700742 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000743 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700744}
745
Denis V. Lunevb5921912008-01-22 23:50:25 -0800746static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
747{
Changli Gaod8d1f302010-06-10 23:31:35 -0700748 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800749}
750
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700751static inline int rt_is_expired(struct rtable *rth)
752{
Changli Gaod8d1f302010-06-10 23:31:35 -0700753 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700754}
755
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800756/*
757 * Perform a full scan of hash table and free all entries.
758 * Can be called by a softirq or a process.
759 * In the later case, we want to be reschedule if necessary
760 */
David S. Miller6561a3b2010-12-19 21:11:20 -0800761static void rt_do_flush(struct net *net, int process_context)
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800762{
763 unsigned int i;
764 struct rtable *rth, *next;
765
766 for (i = 0; i <= rt_hash_mask; i++) {
David S. Miller6561a3b2010-12-19 21:11:20 -0800767 struct rtable __rcu **pprev;
768 struct rtable *list;
769
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800770 if (process_context && need_resched())
771 cond_resched();
Eric Dumazet33d480c2011-08-11 19:30:52 +0000772 rth = rcu_access_pointer(rt_hash_table[i].chain);
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800773 if (!rth)
774 continue;
775
776 spin_lock_bh(rt_hash_lock_addr(i));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700777
David S. Miller6561a3b2010-12-19 21:11:20 -0800778 list = NULL;
779 pprev = &rt_hash_table[i].chain;
780 rth = rcu_dereference_protected(*pprev,
Eric Dumazet1c317202010-10-25 21:02:07 +0000781 lockdep_is_held(rt_hash_lock_addr(i)));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700782
David S. Miller6561a3b2010-12-19 21:11:20 -0800783 while (rth) {
784 next = rcu_dereference_protected(rth->dst.rt_next,
785 lockdep_is_held(rt_hash_lock_addr(i)));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700786
David S. Miller6561a3b2010-12-19 21:11:20 -0800787 if (!net ||
788 net_eq(dev_net(rth->dst.dev), net)) {
789 rcu_assign_pointer(*pprev, next);
790 rcu_assign_pointer(rth->dst.rt_next, list);
791 list = rth;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700792 } else {
David S. Miller6561a3b2010-12-19 21:11:20 -0800793 pprev = &rth->dst.rt_next;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700794 }
David S. Miller6561a3b2010-12-19 21:11:20 -0800795 rth = next;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700796 }
David S. Miller6561a3b2010-12-19 21:11:20 -0800797
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800798 spin_unlock_bh(rt_hash_lock_addr(i));
799
David S. Miller6561a3b2010-12-19 21:11:20 -0800800 for (; list; list = next) {
801 next = rcu_dereference_protected(list->dst.rt_next, 1);
802 rt_free(list);
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800803 }
804 }
805}
806
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is done to obtain an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */
814
/* Number of fractional bits in the fixed-point chain length values. */
#define FRACT_BITS 3
/* The value 1 in that fixed-point representation. */
#define ONE (1UL << FRACT_BITS)
817
Eric Dumazet98376382010-03-08 03:20:00 +0000818/*
819 * Given a hash chain and an item in this hash chain,
820 * find if a previous entry has the same hash_inputs
821 * (but differs on tos, mark or oif)
822 * Returns 0 if an alias is found.
823 * Returns ONE if rth has no alias before itself.
824 */
825static int has_noalias(const struct rtable *head, const struct rtable *rth)
826{
827 const struct rtable *aux = head;
828
829 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800830 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000831 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000832 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000833 }
834 return ONE;
835}
836
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500837static void rt_check_expire(void)
838{
839 static unsigned int rover;
840 unsigned int i = rover, goal;
841 struct rtable *rth;
842 struct rtable __rcu **rthp;
843 unsigned long samples = 0;
844 unsigned long sum = 0, sum2 = 0;
845 unsigned long delta;
846 u64 mult;
847
848 delta = jiffies - expires_ljiffies;
849 expires_ljiffies = jiffies;
850 mult = ((u64)delta) << rt_hash_log;
851 if (ip_rt_gc_timeout > 1)
852 do_div(mult, ip_rt_gc_timeout);
853 goal = (unsigned int)mult;
854 if (goal > rt_hash_mask)
855 goal = rt_hash_mask + 1;
856 for (; goal > 0; goal--) {
857 unsigned long tmo = ip_rt_gc_timeout;
858 unsigned long length;
859
860 i = (i + 1) & rt_hash_mask;
861 rthp = &rt_hash_table[i].chain;
862
863 if (need_resched())
864 cond_resched();
865
866 samples++;
867
868 if (rcu_dereference_raw(*rthp) == NULL)
869 continue;
870 length = 0;
871 spin_lock_bh(rt_hash_lock_addr(i));
872 while ((rth = rcu_dereference_protected(*rthp,
873 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874 prefetch(rth->dst.rt_next);
875 if (rt_is_expired(rth)) {
876 *rthp = rth->dst.rt_next;
877 rt_free(rth);
878 continue;
879 }
880 if (rth->dst.expires) {
881 /* Entry is expired even if it is in use */
882 if (time_before_eq(jiffies, rth->dst.expires)) {
883nofree:
884 tmo >>= 1;
885 rthp = &rth->dst.rt_next;
886 /*
887 * We only count entries on
888 * a chain with equal hash inputs once
889 * so that entries for different QOS
890 * levels, and other non-hash input
891 * attributes don't unfairly skew
892 * the length computation
893 */
894 length += has_noalias(rt_hash_table[i].chain, rth);
895 continue;
896 }
897 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
898 goto nofree;
899
900 /* Cleanup aged off entries. */
901 *rthp = rth->dst.rt_next;
902 rt_free(rth);
903 }
904 spin_unlock_bh(rt_hash_lock_addr(i));
905 sum += length;
906 sum2 += length*length;
907 }
908 if (samples) {
909 unsigned long avg = sum / samples;
910 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911 rt_chain_length_max = max_t(unsigned long,
912 ip_rt_gc_elasticity,
913 (avg + 4*sd) >> FRACT_BITS);
914 }
915 rover = i;
916}
917
918/*
919 * rt_worker_func() is run in process context.
920 * we call rt_check_expire() to scan part of the hash table
921 */
922static void rt_worker_func(struct work_struct *work)
923{
924 rt_check_expire();
925 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
926}
927
Eric Dumazet29e75252008-01-31 17:05:09 -0800928/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300929 * Perturbation of rt_genid by a small quantity [1..256]
Eric Dumazet29e75252008-01-31 17:05:09 -0800930 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
931 * many times (2^24) without giving recent rt_genid.
932 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700934static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700935{
Eric Dumazet29e75252008-01-31 17:05:09 -0800936 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700937
Eric Dumazet29e75252008-01-31 17:05:09 -0800938 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700939 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Steffen Klassert5faa5df2012-03-06 21:20:26 +0000940 inetpeer_invalidate_tree(AF_INET);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700941}
942
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(net, !in_softirq());
}
953
/* Flush previously invalidated entries from the cache. */
void rt_cache_flush_batch(struct net *net)
{
	int process_context = !in_softirq();

	rt_do_flush(net, process_context);
}
959
/*
 * Emit a rate-limited warning about an over-long hash chain and
 * invalidate the route cache of @net.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		pr_warn("Route hash chain too long!\n");
	}

	rt_cache_invalidate(net);
}
966
Linus Torvalds1da177e2005-04-16 15:20:36 -0700967/*
968 Short description of GC goals.
969
970 We want to build algorithm, which will keep routing cache
971 at some equilibrium point, when number of aged off entries
972 is kept approximately equal to newly generated ones.
973
974 Current expiration strength is variable "expire".
975 We try to adjust it dynamically, so that if networking
976 is idle expires is large enough to keep enough of warm entries,
977 and when load increases it reduces to limit cache size.
978 */
979
Daniel Lezcano569d3642008-01-18 03:56:57 -0800980static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700981{
982 static unsigned long expire = RT_GC_TIMEOUT;
983 static unsigned long last_gc;
984 static int rover;
985 static int equilibrium;
Eric Dumazet1c317202010-10-25 21:02:07 +0000986 struct rtable *rth;
987 struct rtable __rcu **rthp;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700988 unsigned long now = jiffies;
989 int goal;
Eric Dumazetfc66f952010-10-08 06:37:34 +0000990 int entries = dst_entries_get_fast(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700991
992 /*
993 * Garbage collection is pretty expensive,
994 * do not make it too frequently.
995 */
996
997 RT_CACHE_STAT_INC(gc_total);
998
999 if (now - last_gc < ip_rt_gc_min_interval &&
Eric Dumazetfc66f952010-10-08 06:37:34 +00001000 entries < ip_rt_max_size) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001001 RT_CACHE_STAT_INC(gc_ignored);
1002 goto out;
1003 }
1004
Eric Dumazetfc66f952010-10-08 06:37:34 +00001005 entries = dst_entries_get_slow(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001006 /* Calculate number of entries, which we want to expire now. */
Eric Dumazetfc66f952010-10-08 06:37:34 +00001007 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001008 if (goal <= 0) {
1009 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010 equilibrium = ipv4_dst_ops.gc_thresh;
Eric Dumazetfc66f952010-10-08 06:37:34 +00001011 goal = entries - equilibrium;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001012 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -08001013 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Eric Dumazetfc66f952010-10-08 06:37:34 +00001014 goal = entries - equilibrium;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001015 }
1016 } else {
1017 /* We are in dangerous area. Try to reduce cache really
1018 * aggressively.
1019 */
Eric Dumazetb790ced2007-12-21 01:49:07 -08001020 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Eric Dumazetfc66f952010-10-08 06:37:34 +00001021 equilibrium = entries - goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001022 }
1023
1024 if (now - last_gc >= ip_rt_gc_min_interval)
1025 last_gc = now;
1026
1027 if (goal <= 0) {
1028 equilibrium += goal;
1029 goto work_done;
1030 }
1031
1032 do {
1033 int i, k;
1034
1035 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036 unsigned long tmo = expire;
1037
1038 k = (k + 1) & rt_hash_mask;
1039 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001040 spin_lock_bh(rt_hash_lock_addr(k));
Eric Dumazet1c317202010-10-25 21:02:07 +00001041 while ((rth = rcu_dereference_protected(*rthp,
1042 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001043 if (!rt_is_expired(rth) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08001044 !rt_may_expire(rth, tmo, expire)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001045 tmo >>= 1;
Changli Gaod8d1f302010-06-10 23:31:35 -07001046 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001047 continue;
1048 }
Changli Gaod8d1f302010-06-10 23:31:35 -07001049 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001050 rt_free(rth);
1051 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001052 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001053 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001054 if (goal <= 0)
1055 break;
1056 }
1057 rover = k;
1058
1059 if (goal <= 0)
1060 goto work_done;
1061
1062 /* Goal is not achieved. We stop process if:
1063
1064 - if expire reduced to zero. Otherwise, expire is halfed.
1065 - if table is not full.
1066 - if we are called from interrupt.
1067 - jiffies check is just fallback/debug loop breaker.
1068 We will not spin here for long time in any case.
1069 */
1070
1071 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073 if (expire == 0)
1074 break;
1075
1076 expire >>= 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001077
Eric Dumazetfc66f952010-10-08 06:37:34 +00001078 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001079 goto out;
1080 } while (!in_softirq() && time_before_eq(jiffies, now));
1081
Eric Dumazetfc66f952010-10-08 06:37:34 +00001082 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083 goto out;
1084 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001085 goto out;
1086 if (net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001087 pr_warn("dst cache overflow\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001088 RT_CACHE_STAT_INC(gc_dst_overflow);
1089 return 1;
1090
1091work_done:
1092 expire += ip_rt_gc_min_interval;
1093 if (expire > ip_rt_gc_timeout ||
Eric Dumazetfc66f952010-10-08 06:37:34 +00001094 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001096 expire = ip_rt_gc_timeout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001097out: return 0;
1098}
1099
Eric Dumazet98376382010-03-08 03:20:00 +00001100/*
1101 * Returns number of entries in a hash chain that have different hash_inputs
1102 */
1103static int slow_chain_length(const struct rtable *head)
1104{
1105 int length = 0;
1106 const struct rtable *rth = head;
1107
1108 while (rth) {
1109 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001110 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001111 }
1112 return length >> FRACT_BITS;
1113}
1114
David S. Millerd3aaeb32011-07-18 00:40:17 -07001115static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
David Miller3769cff2011-07-11 22:44:24 +00001116{
David S. Millerd3aaeb32011-07-18 00:40:17 -07001117 static const __be32 inaddr_any = 0;
1118 struct net_device *dev = dst->dev;
1119 const __be32 *pkey = daddr;
David S. Miller39232972012-01-26 15:22:32 -05001120 const struct rtable *rt;
David Miller3769cff2011-07-11 22:44:24 +00001121 struct neighbour *n;
1122
David S. Miller39232972012-01-26 15:22:32 -05001123 rt = (const struct rtable *) dst;
1124
David Miller3769cff2011-07-11 22:44:24 +00001125 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
David S. Millerd3aaeb32011-07-18 00:40:17 -07001126 pkey = &inaddr_any;
David S. Miller39232972012-01-26 15:22:32 -05001127 else if (rt->rt_gateway)
1128 pkey = (const __be32 *) &rt->rt_gateway;
David S. Millerd3aaeb32011-07-18 00:40:17 -07001129
David S. Miller80703d22012-02-15 17:48:35 -05001130 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001131 if (n)
1132 return n;
David Miller32092ec2011-07-25 00:01:41 +00001133 return neigh_create(&arp_tbl, pkey, dev);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001134}
1135
1136static int rt_bind_neighbour(struct rtable *rt)
1137{
1138 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
David Miller3769cff2011-07-11 22:44:24 +00001139 if (IS_ERR(n))
1140 return PTR_ERR(n);
David S. Miller69cce1d2011-07-17 23:09:49 -07001141 dst_set_neighbour(&rt->dst, n);
David Miller3769cff2011-07-11 22:44:24 +00001142
1143 return 0;
1144}
1145
/*
 * Insert @rt into hash bucket @hash of the route cache, or reuse an
 * equivalent entry that is already there.
 *
 * @hash:    bucket index; recomputed from @rt's keys and @ifindex if an
 *           emergency rebuild is triggered.
 * @rt:      the new route entry.
 * @skb:     optional; when non-NULL its dst is set to the returned route.
 * @ifindex: interface index fed to rt_hash() when @hash is recomputed.
 *
 * Returns:
 *  - an existing entry with matching keys and netns (moved to the head of
 *    the chain; @rt is dropped with rt_drop()),
 *  - @rt itself after insertion, or uncached when caching is disabled,
 *  - ERR_PTR(err) when binding the neighbour fails.
 */
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	/* One garbage-collect-and-retry is allowed, but not in softirq. */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just hand rt back, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * NOTE(review): no rt_free() call is visible on this path;
		 * the remark above may be historical - confirm.
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					pr_warn("Neighbour table failure & not caching routes\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Opportunistically reap entries from an older generation. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* The cached entry wins; the new one is not needed. */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the lowest-scoring unreferenced entry as eviction candidate. */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/*
		 * Nothing evictable and the chain is too long even when
		 * aliases are counted once: bump the rebuild counter,
		 * invalidate the cache and retry with a fresh hash.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* rt_genid changed, so the bucket must be recomputed. */
			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
				       ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/*
				 * Temporarily force the most aggressive GC
				 * settings, then restore the saved values.
				 */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				pr_warn("Neighbour table overflow\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1324
David S. Miller6431cbc2011-02-07 20:38:06 -08001325static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1326
1327static u32 rt_peer_genid(void)
1328{
1329 return atomic_read(&__rt_peer_genid);
1330}
1331
David S. Millera48eff12011-05-18 18:42:43 -04001332void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001333{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001334 struct inet_peer *peer;
1335
David S. Millera48eff12011-05-18 18:42:43 -04001336 peer = inet_getpeer_v4(daddr, create);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001337
Eric Dumazet49e8ab02010-08-19 06:10:45 +00001338 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001339 inet_putpeer(peer);
David S. Miller6431cbc2011-02-07 20:38:06 -08001340 else
1341 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001342}
1343
1344/*
1345 * Peer allocation may fail only in serious out-of-memory conditions. However
1346 * we still can generate some output.
1347 * Random ID selection looks a bit dangerous because we have no chances to
1348 * select ID being unique in a reasonable period of time.
1349 * But broken packet identifier may be better than no packet at all.
1350 */
1351static void ip_select_fb_ident(struct iphdr *iph)
1352{
1353 static DEFINE_SPINLOCK(ip_fb_id_lock);
1354 static u32 ip_fallback_id;
1355 u32 salt;
1356
1357 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001358 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001359 iph->id = htons(salt & 0xFFFF);
1360 ip_fallback_id = salt;
1361 spin_unlock_bh(&ip_fb_id_lock);
1362}
1363
1364void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1365{
1366 struct rtable *rt = (struct rtable *) dst;
1367
Eric Dumazete688a602011-12-22 04:15:53 +00001368 if (rt && !(rt->dst.flags & DST_NOPEER)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001369 if (rt->peer == NULL)
David S. Millera48eff12011-05-18 18:42:43 -04001370 rt_bind_peer(rt, rt->rt_dst, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001371
1372 /* If peer is attached to destination, it is never detached,
1373 so that we need not to grab a lock to dereference it.
1374 */
1375 if (rt->peer) {
1376 iph->id = htons(inet_getid(rt->peer, more));
1377 return;
1378 }
Eric Dumazete688a602011-12-22 04:15:53 +00001379 } else if (!rt)
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001380 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001381 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382
1383 ip_select_fb_ident(iph);
1384}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001385EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386
Eric Dumazet95c96172012-04-15 05:58:06 +00001387static void rt_del(unsigned int hash, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001388{
Eric Dumazet1c317202010-10-25 21:02:07 +00001389 struct rtable __rcu **rthp;
1390 struct rtable *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001391
Eric Dumazet29e75252008-01-31 17:05:09 -08001392 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001393 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001394 ip_rt_put(rt);
Eric Dumazet1c317202010-10-25 21:02:07 +00001395 while ((aux = rcu_dereference_protected(*rthp,
1396 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001397 if (aux == rt || rt_is_expired(aux)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001398 *rthp = aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001399 rt_free(aux);
1400 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001401 }
Changli Gaod8d1f302010-06-10 23:31:35 -07001402 rthp = &aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001403 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001404 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405}
1406
David S. Millerde398fb2011-12-05 13:21:42 -05001407static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001408{
1409 struct rtable *rt = (struct rtable *) dst;
1410 __be32 orig_gw = rt->rt_gateway;
1411 struct neighbour *n, *old_n;
1412
1413 dst_confirm(&rt->dst);
1414
1415 rt->rt_gateway = peer->redirect_learned.a4;
1416
1417 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
David S. Millerde398fb2011-12-05 13:21:42 -05001418 if (IS_ERR(n)) {
1419 rt->rt_gateway = orig_gw;
1420 return;
1421 }
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001422 old_n = xchg(&rt->dst._neighbour, n);
1423 if (old_n)
1424 neigh_release(old_n);
David S. Millerde398fb2011-12-05 13:21:42 -05001425 if (!(n->nud_state & NUD_VALID)) {
1426 neigh_event_send(n, NULL);
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001427 } else {
1428 rt->rt_flags |= RTCF_REDIRECTED;
1429 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1430 }
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001431}
1432
Eric Dumazeted7865a42010-06-07 21:49:44 -07001433/* called in rcu_read_lock() section */
Al Virof7655222006-09-26 21:25:43 -07001434void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1435 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436{
Flavio Leitner7cc91502011-10-24 02:56:38 -04001437 int s, i;
Eric Dumazeted7865a42010-06-07 21:49:44 -07001438 struct in_device *in_dev = __in_dev_get_rcu(dev);
Flavio Leitner7cc91502011-10-24 02:56:38 -04001439 __be32 skeys[2] = { saddr, 0 };
1440 int ikeys[2] = { dev->ifindex, 0 };
David S. Millerf39925d2011-02-09 22:00:16 -08001441 struct inet_peer *peer;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001442 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001443
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444 if (!in_dev)
1445 return;
1446
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001447 net = dev_net(dev);
Joe Perches9d4fb272009-11-23 10:41:23 -08001448 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1449 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1450 ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001451 goto reject_redirect;
1452
1453 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1454 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1455 goto reject_redirect;
1456 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1457 goto reject_redirect;
1458 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001459 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001460 goto reject_redirect;
1461 }
1462
Flavio Leitner7cc91502011-10-24 02:56:38 -04001463 for (s = 0; s < 2; s++) {
1464 for (i = 0; i < 2; i++) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001465 unsigned int hash;
1466 struct rtable __rcu **rthp;
1467 struct rtable *rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001468
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001469 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1470
1471 rthp = &rt_hash_table[hash].chain;
1472
1473 while ((rt = rcu_dereference(*rthp)) != NULL) {
1474 rthp = &rt->dst.rt_next;
1475
1476 if (rt->rt_key_dst != daddr ||
1477 rt->rt_key_src != skeys[s] ||
1478 rt->rt_oif != ikeys[i] ||
1479 rt_is_input_route(rt) ||
1480 rt_is_expired(rt) ||
1481 !net_eq(dev_net(rt->dst.dev), net) ||
1482 rt->dst.error ||
1483 rt->dst.dev != dev ||
1484 rt->rt_gateway != old_gw)
1485 continue;
1486
1487 if (!rt->peer)
1488 rt_bind_peer(rt, rt->rt_dst, 1);
1489
1490 peer = rt->peer;
1491 if (peer) {
Steffen Klassertac3f48d2012-03-06 21:21:10 +00001492 if (peer->redirect_learned.a4 != new_gw) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001493 peer->redirect_learned.a4 = new_gw;
1494 atomic_inc(&__rt_peer_genid);
1495 }
1496 check_peer_redir(&rt->dst, peer);
1497 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001498 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001499 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001500 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001501 return;
1502
1503reject_redirect:
1504#ifdef CONFIG_IP_ROUTE_VERBOSE
1505 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001506 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
Harvey Harrison673d57e2008-10-31 00:53:57 -07001507 " Advised path = %pI4 -> %pI4\n",
Joe Perches058bd4d2012-03-11 18:36:11 +00001508 &old_gw, dev->name, &new_gw,
1509 &saddr, &daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001510#endif
Eric Dumazeted7865a42010-06-07 21:49:44 -07001511 ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001512}
1513
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001514static bool peer_pmtu_expired(struct inet_peer *peer)
1515{
1516 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1517
1518 return orig &&
1519 time_after_eq(jiffies, orig) &&
1520 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1521}
1522
1523static bool peer_pmtu_cleaned(struct inet_peer *peer)
1524{
1525 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1526
1527 return orig &&
1528 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1529}
1530
Linus Torvalds1da177e2005-04-16 15:20:36 -07001531static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1532{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001533 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001534 struct dst_entry *ret = dst;
1535
1536 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001537 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538 ip_rt_put(rt);
1539 ret = NULL;
David S. Miller2c8cec52011-02-09 20:42:07 -08001540 } else if (rt->rt_flags & RTCF_REDIRECTED) {
Eric Dumazet95c96172012-04-15 05:58:06 +00001541 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001542 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001543 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544 rt_del(hash, rt);
1545 ret = NULL;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001546 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1547 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001548 }
1549 }
1550 return ret;
1551}
1552
1553/*
1554 * Algorithm:
1555 * 1. The first ip_rt_redirect_number redirects are sent
1556 * with exponential backoff, then we stop sending them at all,
1557 * assuming that the host ignores our redirects.
1558 * 2. If we did not see packets requiring redirects
1559 * during ip_rt_redirect_silence, we assume that the host
1560 * forgot redirected route and start to send redirects again.
1561 *
1562 * This algorithm is much cheaper and more intelligent than dumb load limiting
1563 * in icmp.c.
1564 *
1565 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1566 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1567 */
1568
1569void ip_rt_send_redirect(struct sk_buff *skb)
1570{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001571 struct rtable *rt = skb_rtable(skb);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001572 struct in_device *in_dev;
David S. Miller92d86822011-02-04 15:55:25 -08001573 struct inet_peer *peer;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001574 int log_martians;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001575
Eric Dumazet30038fc2009-08-28 23:52:01 -07001576 rcu_read_lock();
Changli Gaod8d1f302010-06-10 23:31:35 -07001577 in_dev = __in_dev_get_rcu(rt->dst.dev);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001578 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1579 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001580 return;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001581 }
1582 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1583 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584
David S. Miller92d86822011-02-04 15:55:25 -08001585 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001586 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001587 peer = rt->peer;
1588 if (!peer) {
1589 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1590 return;
1591 }
1592
Linus Torvalds1da177e2005-04-16 15:20:36 -07001593 /* No redirected packets during ip_rt_redirect_silence;
1594 * reset the algorithm.
1595 */
David S. Miller92d86822011-02-04 15:55:25 -08001596 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1597 peer->rate_tokens = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598
1599 /* Too many ignored redirects; do not send anything
Changli Gaod8d1f302010-06-10 23:31:35 -07001600 * set dst.rate_last to the last seen redirected packet.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001601 */
David S. Miller92d86822011-02-04 15:55:25 -08001602 if (peer->rate_tokens >= ip_rt_redirect_number) {
1603 peer->rate_last = jiffies;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001604 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001605 }
1606
1607 /* Check for load limit; set rate_last to the latest sent
1608 * redirect.
1609 */
David S. Miller92d86822011-02-04 15:55:25 -08001610 if (peer->rate_tokens == 0 ||
Li Yewang14fb8a72006-12-18 00:26:35 -08001611 time_after(jiffies,
David S. Miller92d86822011-02-04 15:55:25 -08001612 (peer->rate_last +
1613 (ip_rt_redirect_load << peer->rate_tokens)))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001614 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
David S. Miller92d86822011-02-04 15:55:25 -08001615 peer->rate_last = jiffies;
1616 ++peer->rate_tokens;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617#ifdef CONFIG_IP_ROUTE_VERBOSE
Eric Dumazet30038fc2009-08-28 23:52:01 -07001618 if (log_martians &&
David S. Miller92d86822011-02-04 15:55:25 -08001619 peer->rate_tokens == ip_rt_redirect_number &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001620 net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001621 pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1622 &ip_hdr(skb)->saddr, rt->rt_iif,
Harvey Harrison673d57e2008-10-31 00:53:57 -07001623 &rt->rt_dst, &rt->rt_gateway);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624#endif
1625 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001626}
1627
1628static int ip_error(struct sk_buff *skb)
1629{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001630 struct rtable *rt = skb_rtable(skb);
David S. Miller92d86822011-02-04 15:55:25 -08001631 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632 unsigned long now;
David S. Miller92d86822011-02-04 15:55:25 -08001633 bool send;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001634 int code;
1635
Changli Gaod8d1f302010-06-10 23:31:35 -07001636 switch (rt->dst.error) {
Joe Perches4500ebf2011-07-01 09:43:07 +00001637 case EINVAL:
1638 default:
1639 goto out;
1640 case EHOSTUNREACH:
1641 code = ICMP_HOST_UNREACH;
1642 break;
1643 case ENETUNREACH:
1644 code = ICMP_NET_UNREACH;
1645 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1646 IPSTATS_MIB_INNOROUTES);
1647 break;
1648 case EACCES:
1649 code = ICMP_PKT_FILTERED;
1650 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001651 }
1652
David S. Miller92d86822011-02-04 15:55:25 -08001653 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001654 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001655 peer = rt->peer;
1656
1657 send = true;
1658 if (peer) {
1659 now = jiffies;
1660 peer->rate_tokens += now - peer->rate_last;
1661 if (peer->rate_tokens > ip_rt_error_burst)
1662 peer->rate_tokens = ip_rt_error_burst;
1663 peer->rate_last = now;
1664 if (peer->rate_tokens >= ip_rt_error_cost)
1665 peer->rate_tokens -= ip_rt_error_cost;
1666 else
1667 send = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001668 }
David S. Miller92d86822011-02-04 15:55:25 -08001669 if (send)
1670 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671
1672out: kfree_skb(skb);
1673 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001674}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001675
/*
 * MTU plateau values, largest first.  The final two entries are not
 * from the RFC but are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};
1683
Stephen Hemminger5969f712008-04-10 01:52:09 -07001684static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001685{
1686 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001687
Linus Torvalds1da177e2005-04-16 15:20:36 -07001688 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1689 if (old_mtu > mtu_plateau[i])
1690 return mtu_plateau[i];
1691 return 68;
1692}
1693
Eric Dumazetb71d1d42011-04-22 04:53:02 +00001694unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001695 unsigned short new_mtu,
1696 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698 unsigned short old_mtu = ntohs(iph->tot_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699 unsigned short est_mtu = 0;
David S. Miller2c8cec52011-02-09 20:42:07 -08001700 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701
David S. Miller2c8cec52011-02-09 20:42:07 -08001702 peer = inet_getpeer_v4(iph->daddr, 1);
1703 if (peer) {
1704 unsigned short mtu = new_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001705
David S. Miller2c8cec52011-02-09 20:42:07 -08001706 if (new_mtu < 68 || new_mtu >= old_mtu) {
1707 /* BSD 4.2 derived systems incorrectly adjust
1708 * tot_len by the IP header length, and report
1709 * a zero MTU in the ICMP message.
1710 */
1711 if (mtu == 0 &&
1712 old_mtu >= 68 + (iph->ihl << 2))
1713 old_mtu -= iph->ihl << 2;
1714 mtu = guess_mtu(old_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715 }
David S. Miller2c8cec52011-02-09 20:42:07 -08001716
1717 if (mtu < ip_rt_min_pmtu)
1718 mtu = ip_rt_min_pmtu;
1719 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001720 unsigned long pmtu_expires;
1721
1722 pmtu_expires = jiffies + ip_rt_mtu_expires;
1723 if (!pmtu_expires)
1724 pmtu_expires = 1UL;
1725
David S. Miller2c8cec52011-02-09 20:42:07 -08001726 est_mtu = mtu;
1727 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001728 peer->pmtu_expires = pmtu_expires;
Gao feng59445b62011-10-19 15:34:09 +00001729 atomic_inc(&__rt_peer_genid);
David S. Miller2c8cec52011-02-09 20:42:07 -08001730 }
1731
1732 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001733 }
1734 return est_mtu ? : new_mtu;
1735}
1736
David S. Miller2c8cec52011-02-09 20:42:07 -08001737static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1738{
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001739 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
David S. Miller2c8cec52011-02-09 20:42:07 -08001740
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001741 if (!expires)
1742 return;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001743 if (time_before(jiffies, expires)) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001744 u32 orig_dst_mtu = dst_mtu(dst);
1745 if (peer->pmtu_learned < orig_dst_mtu) {
1746 if (!peer->pmtu_orig)
1747 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1748 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1749 }
1750 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1751 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1752}
1753
Linus Torvalds1da177e2005-04-16 15:20:36 -07001754static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1755{
David S. Miller2c8cec52011-02-09 20:42:07 -08001756 struct rtable *rt = (struct rtable *) dst;
1757 struct inet_peer *peer;
1758
1759 dst_confirm(dst);
1760
1761 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001762 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller2c8cec52011-02-09 20:42:07 -08001763 peer = rt->peer;
1764 if (peer) {
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001765 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1766
David S. Miller2c8cec52011-02-09 20:42:07 -08001767 if (mtu < ip_rt_min_pmtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001768 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001769 if (!pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001770
1771 pmtu_expires = jiffies + ip_rt_mtu_expires;
1772 if (!pmtu_expires)
1773 pmtu_expires = 1UL;
1774
David S. Miller2c8cec52011-02-09 20:42:07 -08001775 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001776 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001777
1778 atomic_inc(&__rt_peer_genid);
1779 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001780 }
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001781 check_peer_pmtu(dst, peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001782 }
1783}
1784
David S. Millerf39925d2011-02-09 22:00:16 -08001785
David S. Millerde398fb2011-12-05 13:21:42 -05001786static void ipv4_validate_peer(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001787{
David S. Miller6431cbc2011-02-07 20:38:06 -08001788 if (rt->rt_peer_genid != rt_peer_genid()) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001789 struct inet_peer *peer;
1790
David S. Miller6431cbc2011-02-07 20:38:06 -08001791 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001792 rt_bind_peer(rt, rt->rt_dst, 0);
David S. Miller6431cbc2011-02-07 20:38:06 -08001793
David S. Miller2c8cec52011-02-09 20:42:07 -08001794 peer = rt->peer;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001795 if (peer) {
David S. Millerefbc368d2011-12-01 13:38:59 -05001796 check_peer_pmtu(&rt->dst, peer);
David S. Miller2c8cec52011-02-09 20:42:07 -08001797
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001798 if (peer->redirect_learned.a4 &&
David S. Millerde398fb2011-12-05 13:21:42 -05001799 peer->redirect_learned.a4 != rt->rt_gateway)
1800 check_peer_redir(&rt->dst, peer);
David S. Millerf39925d2011-02-09 22:00:16 -08001801 }
1802
David S. Miller6431cbc2011-02-07 20:38:06 -08001803 rt->rt_peer_genid = rt_peer_genid();
1804 }
David S. Millerefbc368d2011-12-01 13:38:59 -05001805}
1806
1807static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1808{
1809 struct rtable *rt = (struct rtable *) dst;
1810
1811 if (rt_is_expired(rt))
1812 return NULL;
David S. Millerde398fb2011-12-05 13:21:42 -05001813 ipv4_validate_peer(rt);
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001814 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815}
1816
1817static void ipv4_dst_destroy(struct dst_entry *dst)
1818{
1819 struct rtable *rt = (struct rtable *) dst;
1820 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821
David S. Miller62fa8a82011-01-26 20:51:05 -08001822 if (rt->fi) {
1823 fib_info_put(rt->fi);
1824 rt->fi = NULL;
1825 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826 if (peer) {
1827 rt->peer = NULL;
1828 inet_putpeer(peer);
1829 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001830}
1831
Linus Torvalds1da177e2005-04-16 15:20:36 -07001832
1833static void ipv4_link_failure(struct sk_buff *skb)
1834{
1835 struct rtable *rt;
1836
1837 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1838
Eric Dumazet511c3f92009-06-02 05:14:27 +00001839 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001840 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1841 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001842}
1843
1844static int ip_rt_bug(struct sk_buff *skb)
1845{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001846 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1847 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848 skb->dev ? skb->dev->name : "?");
1849 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001850 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001851 return 0;
1852}
1853
1854/*
1855 We do not cache source address of outgoing interface,
1856 because it is used only by IP RR, TS and SRR options,
1857 so that it out of fast path.
1858
1859 BTW remember: "addr" is allowed to be not aligned
1860 in IP options!
1861 */
1862
David S. Miller8e363602011-05-13 17:29:41 -04001863void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864{
Al Viroa61ced52006-09-26 21:27:54 -07001865 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866
David S. Millerc7537962010-11-11 17:07:48 -08001867 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001868 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001869 else {
David S. Miller8e363602011-05-13 17:29:41 -04001870 struct fib_result res;
1871 struct flowi4 fl4;
1872 struct iphdr *iph;
1873
1874 iph = ip_hdr(skb);
1875
1876 memset(&fl4, 0, sizeof(fl4));
1877 fl4.daddr = iph->daddr;
1878 fl4.saddr = iph->saddr;
Julian Anastasovb0fe4a32011-07-23 02:00:41 +00001879 fl4.flowi4_tos = RT_TOS(iph->tos);
David S. Miller8e363602011-05-13 17:29:41 -04001880 fl4.flowi4_oif = rt->dst.dev->ifindex;
1881 fl4.flowi4_iif = skb->dev->ifindex;
1882 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001883
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001884 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001885 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001886 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001887 else
1888 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001889 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001890 rcu_read_unlock();
1891 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001892 memcpy(addr, &src, 4);
1893}
1894
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time; a
 * half is taken from @tag only if it is still zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if ((rt->dst.tclassid & 0xFFFF) == 0)
		rt->dst.tclassid |= tag & 0xFFFF;

	if ((rt->dst.tclassid & 0xFFFF0000) == 0)
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1904
David S. Miller0dbaee32010-12-13 12:52:14 -08001905static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1906{
1907 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1908
1909 if (advmss == 0) {
1910 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1911 ip_rt_min_advmss);
1912 if (advmss > 65535 - 40)
1913 advmss = 65535 - 40;
1914 }
1915 return advmss;
1916}
1917
Steffen Klassertebb762f2011-11-23 02:12:51 +00001918static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001919{
Steffen Klassert261663b2011-11-23 02:14:50 +00001920 const struct rtable *rt = (const struct rtable *) dst;
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001921 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1922
Steffen Klassert261663b2011-11-23 02:14:50 +00001923 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001924 return mtu;
1925
1926 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001927
1928 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001929
1930 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1931 mtu = 576;
1932 }
1933
1934 if (mtu > IP_MAX_MTU)
1935 mtu = IP_MAX_MTU;
1936
1937 return mtu;
1938}
1939
David S. Miller813b3b52011-04-28 14:48:42 -07001940static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001941 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001942{
David S. Miller0131ba42011-02-04 14:37:30 -08001943 struct inet_peer *peer;
1944 int create = 0;
1945
1946 /* If a peer entry exists for this destination, we must hook
1947 * it up in order to get at cached metrics.
1948 */
David S. Miller813b3b52011-04-28 14:48:42 -07001949 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
David S. Miller0131ba42011-02-04 14:37:30 -08001950 create = 1;
1951
David S. Miller3c0afdc2011-03-04 21:26:07 -08001952 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
David S. Miller0131ba42011-02-04 14:37:30 -08001953 if (peer) {
David S. Miller3c0afdc2011-03-04 21:26:07 -08001954 rt->rt_peer_genid = rt_peer_genid();
David S. Miller0131ba42011-02-04 14:37:30 -08001955 if (inet_metrics_new(peer))
1956 memcpy(peer->metrics, fi->fib_metrics,
1957 sizeof(u32) * RTAX_MAX);
1958 dst_init_metrics(&rt->dst, peer->metrics, false);
David S. Miller2c8cec52011-02-09 20:42:07 -08001959
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001960 check_peer_pmtu(&rt->dst, peer);
Steffen Klassertac3f48d2012-03-06 21:21:10 +00001961
David S. Millerf39925d2011-02-09 22:00:16 -08001962 if (peer->redirect_learned.a4 &&
1963 peer->redirect_learned.a4 != rt->rt_gateway) {
1964 rt->rt_gateway = peer->redirect_learned.a4;
1965 rt->rt_flags |= RTCF_REDIRECTED;
1966 }
David S. Miller0131ba42011-02-04 14:37:30 -08001967 } else {
David S. Millerb8dad612011-01-28 14:07:16 -08001968 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1969 rt->fi = fi;
1970 atomic_inc(&fi->fib_clntref);
1971 }
David S. Millera4daad62011-01-27 22:01:53 -08001972 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001973 }
1974}
1975
David S. Miller813b3b52011-04-28 14:48:42 -07001976static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001977 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001978 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001979{
David S. Millerdefb3512010-12-08 21:16:57 -08001980 struct dst_entry *dst = &rt->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001981
1982 if (fi) {
1983 if (FIB_RES_GW(*res) &&
1984 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1985 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001986 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001987#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerdefb3512010-12-08 21:16:57 -08001988 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001989#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001990 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001991
David S. Millerdefb3512010-12-08 21:16:57 -08001992 if (dst_mtu(dst) > IP_MAX_MTU)
1993 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
David S. Miller0dbaee32010-12-13 12:52:14 -08001994 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
David S. Millerdefb3512010-12-08 21:16:57 -08001995 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001996
Patrick McHardyc7066f72011-01-14 13:36:42 +01001997#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998#ifdef CONFIG_IP_MULTIPLE_TABLES
1999 set_class_tag(rt, fib_rules_tclass(res));
2000#endif
2001 set_class_tag(rt, itag);
2002#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002003}
2004
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002005static struct rtable *rt_dst_alloc(struct net_device *dev,
2006 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08002007{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002008 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2009 DST_HOST |
2010 (nopolicy ? DST_NOPOLICY : 0) |
2011 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08002012}
2013
Eric Dumazet96d36222010-06-02 19:21:31 +00002014/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07002015static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002016 u8 tos, struct net_device *dev, int our)
2017{
Eric Dumazet96d36222010-06-02 19:21:31 +00002018 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002019 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07002020 __be32 spec_dst;
Eric Dumazet96d36222010-06-02 19:21:31 +00002021 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002022 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002023 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002024
2025 /* Primary sanity checks. */
2026
2027 if (in_dev == NULL)
2028 return -EINVAL;
2029
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002030 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002031 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032 goto e_inval;
2033
Joe Perchesf97c1e02007-12-16 13:45:43 -08002034 if (ipv4_is_zeronet(saddr)) {
2035 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002036 goto e_inval;
2037 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002038 } else {
Michael Smith5c04c812011-04-07 04:51:50 +00002039 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2040 &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002041 if (err < 0)
2042 goto e_err;
2043 }
Benjamin LaHaise4e7b2f12012-03-27 15:55:32 +00002044 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002045 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002046 if (!rth)
2047 goto e_nobufs;
2048
Patrick McHardyc7066f72011-01-14 13:36:42 +01002049#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002050 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002051#endif
David S. Millercf911662011-04-28 14:31:47 -07002052 rth->dst.output = ip_rt_bug;
2053
2054 rth->rt_key_dst = daddr;
2055 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002056 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002057 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08002058 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07002059 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002060 rth->rt_dst = daddr;
2061 rth->rt_src = saddr;
2062 rth->rt_route_iif = dev->ifindex;
2063 rth->rt_iif = dev->ifindex;
2064 rth->rt_oif = 0;
2065 rth->rt_mark = skb->mark;
2066 rth->rt_gateway = daddr;
2067 rth->rt_spec_dst= spec_dst;
2068 rth->rt_peer_genid = 0;
2069 rth->peer = NULL;
2070 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002071 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002072 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002073 rth->rt_flags |= RTCF_LOCAL;
2074 }
2075
2076#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002077 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07002078 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079#endif
2080 RT_CACHE_STAT_INC(in_slow_mc);
2081
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002082 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08002083 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07002084 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002085
2086e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00002089 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002090e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002091 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002092}
2093
2094
2095static void ip_handle_martian_source(struct net_device *dev,
2096 struct in_device *in_dev,
2097 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07002098 __be32 daddr,
2099 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002100{
2101 RT_CACHE_STAT_INC(in_martian_src);
2102#ifdef CONFIG_IP_ROUTE_VERBOSE
2103 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2104 /*
2105 * RFC1812 recommendation, if source is martian,
2106 * the only hint is MAC header.
2107 */
Joe Perches058bd4d2012-03-11 18:36:11 +00002108 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002109 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002110 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00002111 print_hex_dump(KERN_WARNING, "ll header: ",
2112 DUMP_PREFIX_OFFSET, 16, 1,
2113 skb_mac_header(skb),
2114 dev->hard_header_len, true);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002115 }
2116 }
2117#endif
2118}
2119
Eric Dumazet47360222010-06-03 04:13:21 +00002120/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07002121static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08002122 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002123 struct in_device *in_dev,
2124 __be32 daddr, __be32 saddr, u32 tos,
2125 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002127 struct rtable *rth;
2128 int err;
2129 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00002130 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07002131 __be32 spec_dst;
2132 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133
2134 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00002135 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002136 if (out_dev == NULL) {
2137 if (net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00002138 pr_crit("Bug in ip_route_input_slow(). Please report.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002139 return -EINVAL;
2140 }
2141
2142
Michael Smith5c04c812011-04-07 04:51:50 +00002143 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2144 in_dev->dev, &spec_dst, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002145 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002146 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002147 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002148
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149 goto cleanup;
2150 }
2151
2152 if (err)
2153 flags |= RTCF_DIRECTSRC;
2154
Thomas Graf51b77ca2008-06-03 16:36:01 -07002155 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002156 (IN_DEV_SHARED_MEDIA(out_dev) ||
2157 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2158 flags |= RTCF_DOREDIRECT;
2159
2160 if (skb->protocol != htons(ETH_P_IP)) {
2161 /* Not IP (i.e. ARP). Do not create route, if it is
2162 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002163 *
2164 * Proxy arp feature have been extended to allow, ARP
2165 * replies back to the same interface, to support
2166 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002167 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002168 if (out_dev == in_dev &&
2169 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002170 err = -EINVAL;
2171 goto cleanup;
2172 }
2173 }
2174
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002175 rth = rt_dst_alloc(out_dev->dev,
2176 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002177 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002178 if (!rth) {
2179 err = -ENOBUFS;
2180 goto cleanup;
2181 }
2182
David S. Miller5e2b61f2011-03-04 21:47:09 -08002183 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002184 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002185 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2186 rth->rt_flags = flags;
2187 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07002188 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002189 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002190 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002191 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002192 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002193 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07002194 rth->rt_mark = skb->mark;
2195 rth->rt_gateway = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002197 rth->rt_peer_genid = 0;
2198 rth->peer = NULL;
2199 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002200
Changli Gaod8d1f302010-06-10 23:31:35 -07002201 rth->dst.input = ip_forward;
2202 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002203
David S. Miller5e2b61f2011-03-04 21:47:09 -08002204 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205
Linus Torvalds1da177e2005-04-16 15:20:36 -07002206 *result = rth;
2207 err = 0;
2208 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002209 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002210}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002211
Stephen Hemminger5969f712008-04-10 01:52:09 -07002212static int ip_mkroute_input(struct sk_buff *skb,
2213 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002214 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002215 struct in_device *in_dev,
2216 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002217{
Daniel Baluta5e73ea12012-04-15 01:34:41 +00002218 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002219 int err;
Eric Dumazet95c96172012-04-15 05:58:06 +00002220 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221
2222#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002223 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002224 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225#endif
2226
2227 /* create a routing cache entry */
2228 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2229 if (err)
2230 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231
2232 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002233 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002234 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002235 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002236 if (IS_ERR(rth))
2237 return PTR_ERR(rth);
2238 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239}
2240
Linus Torvalds1da177e2005-04-16 15:20:36 -07002241/*
2242 * NOTE. We drop all the packets that has local source
2243 * addresses, because every properly looped back packet
2244 * must have correct destination already attached by output routine.
2245 *
2246 * Such approach solves two big problems:
2247 * 1. Not simplex devices are handled properly.
2248 * 2. IP spoofing attempts are filtered with 100% of guarantee.
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002249 * called with rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07002250 */
2251
Al Viro9e12bb22006-09-26 21:25:20 -07002252static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002253 u8 tos, struct net_device *dev)
2254{
2255 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002256 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002257 struct flowi4 fl4;
Eric Dumazet95c96172012-04-15 05:58:06 +00002258 unsigned int flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002259 u32 itag = 0;
Eric Dumazet95c96172012-04-15 05:58:06 +00002260 struct rtable *rth;
2261 unsigned int hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002262 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002263 int err = -EINVAL;
Daniel Baluta5e73ea12012-04-15 01:34:41 +00002264 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002265
2266 /* IP on this device is disabled. */
2267
2268 if (!in_dev)
2269 goto out;
2270
2271 /* Check for the most weird martians, which can be not detected
2272 by fib_lookup.
2273 */
2274
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002275 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002276 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002277 goto martian_source;
2278
Andy Walls27a954b2010-10-17 15:11:22 +00002279 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002280 goto brd_input;
2281
2282 /* Accept zero addresses only to limited broadcast;
2283 * I even do not know to fix it or not. Waiting for complains :-)
2284 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002285 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002286 goto martian_source;
2287
Andy Walls27a954b2010-10-17 15:11:22 +00002288 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002289 goto martian_destination;
2290
2291 /*
2292 * Now we are ready to route packet.
2293 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002294 fl4.flowi4_oif = 0;
2295 fl4.flowi4_iif = dev->ifindex;
2296 fl4.flowi4_mark = skb->mark;
2297 fl4.flowi4_tos = tos;
2298 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2299 fl4.daddr = daddr;
2300 fl4.saddr = saddr;
2301 err = fib_lookup(net, &fl4, &res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002302 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002303 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002304 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002305 goto no_route;
2306 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002307
2308 RT_CACHE_STAT_INC(in_slow_tot);
2309
2310 if (res.type == RTN_BROADCAST)
2311 goto brd_input;
2312
2313 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002314 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002315 net->loopback_dev->ifindex,
Michael Smith5c04c812011-04-07 04:51:50 +00002316 dev, &spec_dst, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002317 if (err < 0)
2318 goto martian_source_keep_err;
2319 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002320 flags |= RTCF_DIRECTSRC;
2321 spec_dst = daddr;
2322 goto local_input;
2323 }
2324
2325 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002326 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327 if (res.type != RTN_UNICAST)
2328 goto martian_destination;
2329
David S. Miller68a5e3d2011-03-11 20:07:33 -05002330 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002331out: return err;
2332
2333brd_input:
2334 if (skb->protocol != htons(ETH_P_IP))
2335 goto e_inval;
2336
Joe Perchesf97c1e02007-12-16 13:45:43 -08002337 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002338 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2339 else {
Michael Smith5c04c812011-04-07 04:51:50 +00002340 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2341 &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002342 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002343 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002344 if (err)
2345 flags |= RTCF_DIRECTSRC;
2346 }
2347 flags |= RTCF_BROADCAST;
2348 res.type = RTN_BROADCAST;
2349 RT_CACHE_STAT_INC(in_brd);
2350
2351local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002352 rth = rt_dst_alloc(net->loopback_dev,
2353 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354 if (!rth)
2355 goto e_nobufs;
2356
David S. Millercf911662011-04-28 14:31:47 -07002357 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002358 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002359#ifdef CONFIG_IP_ROUTE_CLASSID
2360 rth->dst.tclassid = itag;
2361#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002362
David S. Miller5e2b61f2011-03-04 21:47:09 -08002363 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002364 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002365 rth->rt_genid = rt_genid(net);
2366 rth->rt_flags = flags|RTCF_LOCAL;
2367 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002368 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002369 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002370 rth->rt_src = saddr;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002371#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002372 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002373#endif
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002374 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002375 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002376 rth->rt_oif = 0;
2377 rth->rt_mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002378 rth->rt_gateway = daddr;
2379 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002380 rth->rt_peer_genid = 0;
2381 rth->peer = NULL;
2382 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002383 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002384 rth->dst.input= ip_error;
2385 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002386 rth->rt_flags &= ~RTCF_LOCAL;
2387 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002388 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2389 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002390 err = 0;
2391 if (IS_ERR(rth))
2392 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002393 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002394
2395no_route:
2396 RT_CACHE_STAT_INC(in_no_route);
2397 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2398 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002399 if (err == -ESRCH)
2400 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002401 goto local_input;
2402
2403 /*
2404 * Do not cache martian addresses: they should be logged (RFC1812)
2405 */
2406martian_destination:
2407 RT_CACHE_STAT_INC(in_martian_dst);
2408#ifdef CONFIG_IP_ROUTE_VERBOSE
2409 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00002410 pr_warn("martian destination %pI4 from %pI4, dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002411 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002412#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002413
2414e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002415 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002416 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002417
Linus Torvalds1da177e2005-04-16 15:20:36 -07002418e_inval:
2419 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002420 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002421
2422e_nobufs:
2423 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002424 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425
2426martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002427 err = -EINVAL;
2428martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002430 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002431}
2432
Eric Dumazet407eadd2010-05-10 11:32:55 +00002433int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2434 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002435{
Eric Dumazet95c96172012-04-15 05:58:06 +00002436 struct rtable *rth;
2437 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002438 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002439 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002440 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002441
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002442 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002443
Eric Dumazet96d36222010-06-02 19:21:31 +00002444 rcu_read_lock();
2445
Neil Horman1080d702008-10-27 12:28:25 -07002446 if (!rt_caching(net))
2447 goto skip_cache;
2448
Linus Torvalds1da177e2005-04-16 15:20:36 -07002449 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002450 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002451
Linus Torvalds1da177e2005-04-16 15:20:36 -07002452 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002453 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002454 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2455 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
Julian Anastasov97a80412011-08-09 04:01:16 +00002456 (rth->rt_route_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002457 (rth->rt_key_tos ^ tos)) == 0 &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002458 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002459 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002460 !rt_is_expired(rth)) {
David S. Millerde398fb2011-12-05 13:21:42 -05002461 ipv4_validate_peer(rth);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002462 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002463 dst_use_noref(&rth->dst, jiffies);
2464 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002465 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002466 dst_use(&rth->dst, jiffies);
2467 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002468 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002469 RT_CACHE_STAT_INC(in_hit);
2470 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002471 return 0;
2472 }
2473 RT_CACHE_STAT_INC(in_hlist_search);
2474 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002475
Neil Horman1080d702008-10-27 12:28:25 -07002476skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002477 /* Multicast recognition logic is moved from route cache to here.
2478 The problem was that too many Ethernet cards have broken/missing
2479 hardware multicast filters :-( As result the host on multicasting
2480 network acquires a lot of useless route cache entries, sort of
2481 SDR messages from all the world. Now we try to get rid of them.
2482 Really, provided software IP multicast filter is organized
2483 reasonably (at least, hashed), it does not result in a slowdown
2484 comparing with route cache reject entries.
2485 Note, that multicast routers are not affected, because
2486 route cache entry is created eventually.
2487 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002488 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002489 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002490
Eric Dumazet96d36222010-06-02 19:21:31 +00002491 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002492 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2493 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002494 if (our
2495#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002496 ||
2497 (!ipv4_is_local_multicast(daddr) &&
2498 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002499#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002500 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002501 int res = ip_route_input_mc(skb, daddr, saddr,
2502 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002503 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002504 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002505 }
2506 }
2507 rcu_read_unlock();
2508 return -EINVAL;
2509 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002510 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2511 rcu_read_unlock();
2512 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002513}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002514EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002515
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002516/* called with rcu_read_lock() */
David S. Miller982721f2011-02-16 21:44:24 -08002517static struct rtable *__mkroute_output(const struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002518 const struct flowi4 *fl4,
David S. Miller813b3b52011-04-28 14:48:42 -07002519 __be32 orig_daddr, __be32 orig_saddr,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002520 int orig_oif, __u8 orig_rtos,
2521 struct net_device *dev_out,
David S. Miller5ada5522011-02-17 15:29:00 -08002522 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002523{
David S. Miller982721f2011-02-16 21:44:24 -08002524 struct fib_info *fi = res->fi;
David S. Miller5ada5522011-02-17 15:29:00 -08002525 struct in_device *in_dev;
David S. Miller982721f2011-02-16 21:44:24 -08002526 u16 type = res->type;
David S. Miller5ada5522011-02-17 15:29:00 -08002527 struct rtable *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002528
David S. Miller68a5e3d2011-03-11 20:07:33 -05002529 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
David S. Miller5ada5522011-02-17 15:29:00 -08002530 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002531
David S. Miller68a5e3d2011-03-11 20:07:33 -05002532 if (ipv4_is_lbcast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002533 type = RTN_BROADCAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002534 else if (ipv4_is_multicast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002535 type = RTN_MULTICAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002536 else if (ipv4_is_zeronet(fl4->daddr))
David S. Miller5ada5522011-02-17 15:29:00 -08002537 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002538
2539 if (dev_out->flags & IFF_LOOPBACK)
2540 flags |= RTCF_LOCAL;
2541
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002542 in_dev = __in_dev_get_rcu(dev_out);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002543 if (!in_dev)
David S. Miller5ada5522011-02-17 15:29:00 -08002544 return ERR_PTR(-EINVAL);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002545
David S. Miller982721f2011-02-16 21:44:24 -08002546 if (type == RTN_BROADCAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002547 flags |= RTCF_BROADCAST | RTCF_LOCAL;
David S. Miller982721f2011-02-16 21:44:24 -08002548 fi = NULL;
2549 } else if (type == RTN_MULTICAST) {
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002550 flags |= RTCF_MULTICAST | RTCF_LOCAL;
David S. Miller813b3b52011-04-28 14:48:42 -07002551 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2552 fl4->flowi4_proto))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002553 flags &= ~RTCF_LOCAL;
2554 /* If multicast route do not exist use
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002555 * default one, but do not gateway in this case.
2556 * Yes, it is hack.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002557 */
David S. Miller982721f2011-02-16 21:44:24 -08002558 if (fi && res->prefixlen < 4)
2559 fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002560 }
2561
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002562 rth = rt_dst_alloc(dev_out,
2563 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002564 IN_DEV_CONF_GET(in_dev, NOXFRM));
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002565 if (!rth)
David S. Miller5ada5522011-02-17 15:29:00 -08002566 return ERR_PTR(-ENOBUFS);
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002567
David S. Millercf911662011-04-28 14:31:47 -07002568 rth->dst.output = ip_output;
2569
David S. Miller813b3b52011-04-28 14:48:42 -07002570 rth->rt_key_dst = orig_daddr;
2571 rth->rt_key_src = orig_saddr;
David S. Millercf911662011-04-28 14:31:47 -07002572 rth->rt_genid = rt_genid(dev_net(dev_out));
2573 rth->rt_flags = flags;
2574 rth->rt_type = type;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002575 rth->rt_key_tos = orig_rtos;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002576 rth->rt_dst = fl4->daddr;
2577 rth->rt_src = fl4->saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002578 rth->rt_route_iif = 0;
David S. Miller813b3b52011-04-28 14:48:42 -07002579 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2580 rth->rt_oif = orig_oif;
2581 rth->rt_mark = fl4->flowi4_mark;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002582 rth->rt_gateway = fl4->daddr;
2583 rth->rt_spec_dst= fl4->saddr;
David S. Millercf911662011-04-28 14:31:47 -07002584 rth->rt_peer_genid = 0;
2585 rth->peer = NULL;
2586 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002587
2588 RT_CACHE_STAT_INC(out_slow_tot);
2589
2590 if (flags & RTCF_LOCAL) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002591 rth->dst.input = ip_local_deliver;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002592 rth->rt_spec_dst = fl4->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593 }
2594 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
David S. Miller68a5e3d2011-03-11 20:07:33 -05002595 rth->rt_spec_dst = fl4->saddr;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002596 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002597 !(dev_out->flags & IFF_LOOPBACK)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002598 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002599 RT_CACHE_STAT_INC(out_slow_mc);
2600 }
2601#ifdef CONFIG_IP_MROUTE
David S. Miller982721f2011-02-16 21:44:24 -08002602 if (type == RTN_MULTICAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002603 if (IN_DEV_MFORWARD(in_dev) &&
David S. Miller813b3b52011-04-28 14:48:42 -07002604 !ipv4_is_local_multicast(fl4->daddr)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002605 rth->dst.input = ip_mr_input;
2606 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002607 }
2608 }
2609#endif
2610 }
2611
David S. Miller813b3b52011-04-28 14:48:42 -07002612 rt_set_nexthop(rth, fl4, res, fi, type, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002613
David S. Miller5ada5522011-02-17 15:29:00 -08002614 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002615}
2616
Linus Torvalds1da177e2005-04-16 15:20:36 -07002617/*
2618 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002619 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002620 */
2621
David S. Miller813b3b52011-04-28 14:48:42 -07002622static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002623{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002624 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002625 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002626 unsigned int flags = 0;
2627 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002628 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002629 __be32 orig_daddr;
2630 __be32 orig_saddr;
2631 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002632
2633 res.fi = NULL;
2634#ifdef CONFIG_IP_MULTIPLE_TABLES
2635 res.r = NULL;
2636#endif
2637
David S. Miller813b3b52011-04-28 14:48:42 -07002638 orig_daddr = fl4->daddr;
2639 orig_saddr = fl4->saddr;
2640 orig_oif = fl4->flowi4_oif;
2641
2642 fl4->flowi4_iif = net->loopback_dev->ifindex;
2643 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2644 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2645 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002646
David S. Miller010c2702011-02-17 15:37:09 -08002647 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002648 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002649 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002650 if (ipv4_is_multicast(fl4->saddr) ||
2651 ipv4_is_lbcast(fl4->saddr) ||
2652 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002653 goto out;
2654
Linus Torvalds1da177e2005-04-16 15:20:36 -07002655 /* I removed check for oif == dev_out->oif here.
2656 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002657 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2658 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002659 2. Moreover, we are allowed to send packets with saddr
2660 of another iface. --ANK
2661 */
2662
David S. Miller813b3b52011-04-28 14:48:42 -07002663 if (fl4->flowi4_oif == 0 &&
2664 (ipv4_is_multicast(fl4->daddr) ||
2665 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002666 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002667 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002668 if (dev_out == NULL)
2669 goto out;
2670
Linus Torvalds1da177e2005-04-16 15:20:36 -07002671 /* Special hack: user can direct multicasts
2672 and limited broadcast via necessary interface
2673 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2674 This hack is not just for fun, it allows
2675 vic,vat and friends to work.
2676 They bind socket to loopback, set ttl to zero
2677 and expect that it will work.
2678 From the viewpoint of routing cache they are broken,
2679 because we are not allowed to build multicast path
2680 with loopback source addr (look, routing cache
2681 cannot know, that ttl is zero, so that packet
2682 will not leave this host and route is valid).
2683 Luckily, this hack is good workaround.
2684 */
2685
David S. Miller813b3b52011-04-28 14:48:42 -07002686 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002687 goto make_route;
2688 }
Julian Anastasova210d012008-10-01 07:28:28 -07002689
David S. Miller813b3b52011-04-28 14:48:42 -07002690 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002691 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002692 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002693 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002694 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002695 }
2696
2697
David S. Miller813b3b52011-04-28 14:48:42 -07002698 if (fl4->flowi4_oif) {
2699 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002700 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002701 if (dev_out == NULL)
2702 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002703
2704 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002705 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002706 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002707 goto out;
2708 }
David S. Miller813b3b52011-04-28 14:48:42 -07002709 if (ipv4_is_local_multicast(fl4->daddr) ||
2710 ipv4_is_lbcast(fl4->daddr)) {
2711 if (!fl4->saddr)
2712 fl4->saddr = inet_select_addr(dev_out, 0,
2713 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002714 goto make_route;
2715 }
David S. Miller813b3b52011-04-28 14:48:42 -07002716 if (fl4->saddr) {
2717 if (ipv4_is_multicast(fl4->daddr))
2718 fl4->saddr = inet_select_addr(dev_out, 0,
2719 fl4->flowi4_scope);
2720 else if (!fl4->daddr)
2721 fl4->saddr = inet_select_addr(dev_out, 0,
2722 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002723 }
2724 }
2725
David S. Miller813b3b52011-04-28 14:48:42 -07002726 if (!fl4->daddr) {
2727 fl4->daddr = fl4->saddr;
2728 if (!fl4->daddr)
2729 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002730 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002731 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002732 res.type = RTN_LOCAL;
2733 flags |= RTCF_LOCAL;
2734 goto make_route;
2735 }
2736
David S. Miller813b3b52011-04-28 14:48:42 -07002737 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002738 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002739 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002740 /* Apparently, routing tables are wrong. Assume,
2741 that the destination is on link.
2742
2743 WHY? DW.
2744 Because we are allowed to send to iface
2745 even if it has NO routes and NO assigned
2746 addresses. When oif is specified, routing
2747 tables are looked up with only one purpose:
2748 to catch if destination is gatewayed, rather than
2749 direct. Moreover, if MSG_DONTROUTE is set,
2750 we send packet, ignoring both routing tables
2751 and ifaddr state. --ANK
2752
2753
2754 We could make it even if oif is unknown,
2755 likely IPv6, but we do not.
2756 */
2757
David S. Miller813b3b52011-04-28 14:48:42 -07002758 if (fl4->saddr == 0)
2759 fl4->saddr = inet_select_addr(dev_out, 0,
2760 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002761 res.type = RTN_UNICAST;
2762 goto make_route;
2763 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002764 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002765 goto out;
2766 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002767
2768 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002769 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002770 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002771 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002772 else
David S. Miller813b3b52011-04-28 14:48:42 -07002773 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002774 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002775 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002776 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777 res.fi = NULL;
2778 flags |= RTCF_LOCAL;
2779 goto make_route;
2780 }
2781
2782#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002783 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002784 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002785 else
2786#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002787 if (!res.prefixlen &&
2788 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002789 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002790 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002791
David S. Miller813b3b52011-04-28 14:48:42 -07002792 if (!fl4->saddr)
2793 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794
Linus Torvalds1da177e2005-04-16 15:20:36 -07002795 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002796 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002797
2798
2799make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002800 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002801 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002802 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002803 unsigned int hash;
2804
David S. Miller813b3b52011-04-28 14:48:42 -07002805 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002806 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002807 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002808 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809
David S. Miller010c2702011-02-17 15:37:09 -08002810out:
2811 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002812 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813}
2814
David S. Miller813b3b52011-04-28 14:48:42 -07002815struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002816{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817 struct rtable *rth;
David S. Miller010c2702011-02-17 15:37:09 -08002818 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002819
Neil Horman1080d702008-10-27 12:28:25 -07002820 if (!rt_caching(net))
2821 goto slow_output;
2822
David S. Miller9d6ec932011-03-12 01:12:47 -05002823 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824
2825 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08002826 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002827 rth = rcu_dereference_bh(rth->dst.rt_next)) {
David S. Miller9d6ec932011-03-12 01:12:47 -05002828 if (rth->rt_key_dst == flp4->daddr &&
2829 rth->rt_key_src == flp4->saddr &&
David S. Millerc7537962010-11-11 17:07:48 -08002830 rt_is_output_route(rth) &&
David S. Miller9d6ec932011-03-12 01:12:47 -05002831 rth->rt_oif == flp4->flowi4_oif &&
2832 rth->rt_mark == flp4->flowi4_mark &&
David S. Miller475949d2011-05-03 19:45:15 -07002833 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002834 (IPTOS_RT_MASK | RTO_ONLINK)) &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002835 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002836 !rt_is_expired(rth)) {
David S. Millerde398fb2011-12-05 13:21:42 -05002837 ipv4_validate_peer(rth);
Changli Gaod8d1f302010-06-10 23:31:35 -07002838 dst_use(&rth->dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002839 RT_CACHE_STAT_INC(out_hit);
2840 rcu_read_unlock_bh();
David S. Miller56157872011-05-02 14:37:45 -07002841 if (!flp4->saddr)
2842 flp4->saddr = rth->rt_src;
2843 if (!flp4->daddr)
2844 flp4->daddr = rth->rt_dst;
David S. Millerb23dd4f2011-03-02 14:31:35 -08002845 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846 }
2847 RT_CACHE_STAT_INC(out_hlist_search);
2848 }
2849 rcu_read_unlock_bh();
2850
Neil Horman1080d702008-10-27 12:28:25 -07002851slow_output:
David S. Miller9d6ec932011-03-12 01:12:47 -05002852 return ip_route_output_slow(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002854EXPORT_SYMBOL_GPL(__ip_route_output_key);
2855
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002856static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2857{
2858 return NULL;
2859}
2860
Steffen Klassertebb762f2011-11-23 02:12:51 +00002861static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002862{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002863 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2864
2865 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002866}
2867
David S. Miller14e50e52007-05-24 18:17:54 -07002868static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2869{
2870}
2871
Held Bernhard0972ddb2011-04-24 22:07:32 +00002872static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2873 unsigned long old)
2874{
2875 return NULL;
2876}
2877
David S. Miller14e50e52007-05-24 18:17:54 -07002878static struct dst_ops ipv4_dst_blackhole_ops = {
2879 .family = AF_INET,
Harvey Harrison09640e632009-02-01 00:45:17 -08002880 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002881 .destroy = ipv4_dst_destroy,
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002882 .check = ipv4_blackhole_dst_check,
Steffen Klassertebb762f2011-11-23 02:12:51 +00002883 .mtu = ipv4_blackhole_mtu,
Eric Dumazet214f45c2011-02-18 11:39:01 -08002884 .default_advmss = ipv4_default_advmss,
David S. Miller14e50e52007-05-24 18:17:54 -07002885 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
Held Bernhard0972ddb2011-04-24 22:07:32 +00002886 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
David S. Millerd3aaeb32011-07-18 00:40:17 -07002887 .neigh_lookup = ipv4_neigh_lookup,
David S. Miller14e50e52007-05-24 18:17:54 -07002888};
2889
David S. Miller2774c132011-03-01 14:59:04 -08002890struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
David S. Miller14e50e52007-05-24 18:17:54 -07002891{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002892 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
David S. Miller2774c132011-03-01 14:59:04 -08002893 struct rtable *ort = (struct rtable *) dst_orig;
David S. Miller14e50e52007-05-24 18:17:54 -07002894
2895 if (rt) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002896 struct dst_entry *new = &rt->dst;
David S. Miller14e50e52007-05-24 18:17:54 -07002897
David S. Miller14e50e52007-05-24 18:17:54 -07002898 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002899 new->input = dst_discard;
2900 new->output = dst_discard;
David S. Millerdefb3512010-12-08 21:16:57 -08002901 dst_copy_metrics(new, &ort->dst);
David S. Miller14e50e52007-05-24 18:17:54 -07002902
Changli Gaod8d1f302010-06-10 23:31:35 -07002903 new->dev = ort->dst.dev;
David S. Miller14e50e52007-05-24 18:17:54 -07002904 if (new->dev)
2905 dev_hold(new->dev);
2906
David S. Miller5e2b61f2011-03-04 21:47:09 -08002907 rt->rt_key_dst = ort->rt_key_dst;
2908 rt->rt_key_src = ort->rt_key_src;
David S. Miller475949d2011-05-03 19:45:15 -07002909 rt->rt_key_tos = ort->rt_key_tos;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002910 rt->rt_route_iif = ort->rt_route_iif;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002911 rt->rt_iif = ort->rt_iif;
2912 rt->rt_oif = ort->rt_oif;
2913 rt->rt_mark = ort->rt_mark;
David S. Miller14e50e52007-05-24 18:17:54 -07002914
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002915 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002916 rt->rt_flags = ort->rt_flags;
2917 rt->rt_type = ort->rt_type;
2918 rt->rt_dst = ort->rt_dst;
2919 rt->rt_src = ort->rt_src;
David S. Miller14e50e52007-05-24 18:17:54 -07002920 rt->rt_gateway = ort->rt_gateway;
2921 rt->rt_spec_dst = ort->rt_spec_dst;
2922 rt->peer = ort->peer;
2923 if (rt->peer)
2924 atomic_inc(&rt->peer->refcnt);
David S. Miller62fa8a82011-01-26 20:51:05 -08002925 rt->fi = ort->fi;
2926 if (rt->fi)
2927 atomic_inc(&rt->fi->fib_clntref);
David S. Miller14e50e52007-05-24 18:17:54 -07002928
2929 dst_free(new);
2930 }
2931
David S. Miller2774c132011-03-01 14:59:04 -08002932 dst_release(dst_orig);
2933
2934 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
David S. Miller14e50e52007-05-24 18:17:54 -07002935}
2936
David S. Miller9d6ec932011-03-12 01:12:47 -05002937struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002938 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002939{
David S. Miller9d6ec932011-03-12 01:12:47 -05002940 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002941
David S. Millerb23dd4f2011-03-02 14:31:35 -08002942 if (IS_ERR(rt))
2943 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002944
David S. Miller56157872011-05-02 14:37:45 -07002945 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002946 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2947 flowi4_to_flowi(flp4),
2948 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002949
David S. Millerb23dd4f2011-03-02 14:31:35 -08002950 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002951}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002952EXPORT_SYMBOL_GPL(ip_route_output_flow);
2953
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002954static int rt_fill_info(struct net *net,
2955 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002956 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002957{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002958 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002959 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002960 struct nlmsghdr *nlh;
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00002961 unsigned long expires = 0;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002962 const struct inet_peer *peer = rt->peer;
Thomas Grafe3703b32006-11-27 09:27:07 -08002963 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002964
2965 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2966 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002967 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002968
2969 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002970 r->rtm_family = AF_INET;
2971 r->rtm_dst_len = 32;
2972 r->rtm_src_len = 0;
David S. Miller475949d2011-05-03 19:45:15 -07002973 r->rtm_tos = rt->rt_key_tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002974 r->rtm_table = RT_TABLE_MAIN;
David S. Millerf3756b72012-04-01 20:39:02 -04002975 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2976 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002977 r->rtm_type = rt->rt_type;
2978 r->rtm_scope = RT_SCOPE_UNIVERSE;
2979 r->rtm_protocol = RTPROT_UNSPEC;
2980 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2981 if (rt->rt_flags & RTCF_NOTIFY)
2982 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002983
David S. Millerf3756b72012-04-01 20:39:02 -04002984 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2985 goto nla_put_failure;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002986 if (rt->rt_key_src) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002987 r->rtm_src_len = 32;
David S. Millerf3756b72012-04-01 20:39:02 -04002988 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2989 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002990 }
David S. Millerf3756b72012-04-01 20:39:02 -04002991 if (rt->dst.dev &&
2992 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2993 goto nla_put_failure;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002994#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerf3756b72012-04-01 20:39:02 -04002995 if (rt->dst.tclassid &&
2996 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2997 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002998#endif
David S. Millerf3756b72012-04-01 20:39:02 -04002999 if (rt_is_input_route(rt)) {
3000 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
3001 goto nla_put_failure;
3002 } else if (rt->rt_src != rt->rt_key_src) {
3003 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
3004 goto nla_put_failure;
3005 }
3006 if (rt->rt_dst != rt->rt_gateway &&
3007 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3008 goto nla_put_failure;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003009
David S. Millerdefb3512010-12-08 21:16:57 -08003010 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003011 goto nla_put_failure;
3012
David S. Millerf3756b72012-04-01 20:39:02 -04003013 if (rt->rt_mark &&
3014 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
3015 goto nla_put_failure;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003016
Changli Gaod8d1f302010-06-10 23:31:35 -07003017 error = rt->dst.error;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003018 if (peer) {
Eric Dumazet317fe0e2010-06-16 04:52:13 +00003019 inet_peer_refcheck(rt->peer);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003020 id = atomic_read(&peer->ip_id_count) & 0xffff;
3021 if (peer->tcp_ts_stamp) {
3022 ts = peer->tcp_ts;
3023 tsage = get_seconds() - peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003024 }
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003025 expires = ACCESS_ONCE(peer->pmtu_expires);
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00003026 if (expires) {
3027 if (time_before(jiffies, expires))
3028 expires -= jiffies;
3029 else
3030 expires = 0;
3031 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003032 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07003033
David S. Millerc7537962010-11-11 17:07:48 -08003034 if (rt_is_input_route(rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003035#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07003036 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003037
Joe Perchesf97c1e02007-12-16 13:45:43 -08003038 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003039 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
David S. Miller9a1b9492011-05-04 12:18:54 -07003040 int err = ipmr_get_route(net, skb,
3041 rt->rt_src, rt->rt_dst,
3042 r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003043 if (err <= 0) {
3044 if (!nowait) {
3045 if (err == 0)
3046 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003047 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003048 } else {
3049 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003050 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08003051 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003052 }
3053 }
3054 } else
3055#endif
David S. Millerf3756b72012-04-01 20:39:02 -04003056 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3057 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003058 }
3059
Changli Gaod8d1f302010-06-10 23:31:35 -07003060 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
Thomas Grafe3703b32006-11-27 09:27:07 -08003061 expires, error) < 0)
3062 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003063
Thomas Grafbe403ea2006-08-17 18:15:17 -07003064 return nlmsg_end(skb, nlh);
3065
3066nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08003067 nlmsg_cancel(skb, nlh);
3068 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003069}
3070
Daniel Baluta5e73ea12012-04-15 01:34:41 +00003071static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003072{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09003073 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07003074 struct rtmsg *rtm;
3075 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003076 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07003077 __be32 dst = 0;
3078 __be32 src = 0;
3079 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07003080 int err;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003081 int mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003082 struct sk_buff *skb;
3083
Thomas Grafd889ce32006-08-17 18:15:44 -07003084 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3085 if (err < 0)
3086 goto errout;
3087
3088 rtm = nlmsg_data(nlh);
3089
Linus Torvalds1da177e2005-04-16 15:20:36 -07003090 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07003091 if (skb == NULL) {
3092 err = -ENOBUFS;
3093 goto errout;
3094 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003095
3096 /* Reserve room for dummy headers, this skb can pass
3097 through good chunk of routing engine.
3098 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07003099 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003100 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07003101
3102 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07003103 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003104 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3105
Al Viro17fb2c62006-09-26 22:15:25 -07003106 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3107 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07003108 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003109 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003110
3111 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07003112 struct net_device *dev;
3113
Denis V. Lunev19375042008-02-28 20:52:04 -08003114 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07003115 if (dev == NULL) {
3116 err = -ENODEV;
3117 goto errout_free;
3118 }
3119
Linus Torvalds1da177e2005-04-16 15:20:36 -07003120 skb->protocol = htons(ETH_P_IP);
3121 skb->dev = dev;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003122 skb->mark = mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003123 local_bh_disable();
3124 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3125 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07003126
Eric Dumazet511c3f92009-06-02 05:14:27 +00003127 rt = skb_rtable(skb);
Changli Gaod8d1f302010-06-10 23:31:35 -07003128 if (err == 0 && rt->dst.error)
3129 err = -rt->dst.error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003130 } else {
David S. Miller68a5e3d2011-03-11 20:07:33 -05003131 struct flowi4 fl4 = {
3132 .daddr = dst,
3133 .saddr = src,
3134 .flowi4_tos = rtm->rtm_tos,
3135 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3136 .flowi4_mark = mark,
Thomas Grafd889ce32006-08-17 18:15:44 -07003137 };
David S. Miller9d6ec932011-03-12 01:12:47 -05003138 rt = ip_route_output_key(net, &fl4);
David S. Millerb23dd4f2011-03-02 14:31:35 -08003139
3140 err = 0;
3141 if (IS_ERR(rt))
3142 err = PTR_ERR(rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003143 }
Thomas Grafd889ce32006-08-17 18:15:44 -07003144
Linus Torvalds1da177e2005-04-16 15:20:36 -07003145 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07003146 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003147
Changli Gaod8d1f302010-06-10 23:31:35 -07003148 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003149 if (rtm->rtm_flags & RTM_F_NOTIFY)
3150 rt->rt_flags |= RTCF_NOTIFY;
3151
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003152 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08003153 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07003154 if (err <= 0)
3155 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003156
Denis V. Lunev19375042008-02-28 20:52:04 -08003157 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07003158errout:
Thomas Graf2942e902006-08-15 00:30:25 -07003159 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003160
Thomas Grafd889ce32006-08-17 18:15:44 -07003161errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003162 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07003163 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003164}
3165
3166int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3167{
3168 struct rtable *rt;
3169 int h, s_h;
3170 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08003171 struct net *net;
3172
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09003173 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003174
3175 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08003176 if (s_h < 0)
3177 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003178 s_idx = idx = cb->args[1];
Eric Dumazeta6272662008-08-28 01:11:25 -07003179 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3180 if (!rt_hash_table[h].chain)
3181 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003182 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08003183 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
Changli Gaod8d1f302010-06-10 23:31:35 -07003184 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3185 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003186 continue;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003187 if (rt_is_expired(rt))
Eric Dumazet29e75252008-01-31 17:05:09 -08003188 continue;
Changli Gaod8d1f302010-06-10 23:31:35 -07003189 skb_dst_set_noref(skb, &rt->dst);
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003190 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003191 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07003192 1, NLM_F_MULTI) <= 0) {
Eric Dumazetadf30902009-06-02 05:19:30 +00003193 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003194 rcu_read_unlock_bh();
3195 goto done;
3196 }
Eric Dumazetadf30902009-06-02 05:19:30 +00003197 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003198 }
3199 rcu_read_unlock_bh();
3200 }
3201
3202done:
3203 cb->args[0] = h;
3204 cb->args[1] = idx;
3205 return skb->len;
3206}
3207
3208void ip_rt_multicast_event(struct in_device *in_dev)
3209{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07003210 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003211}
3212
3213#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003214static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003215 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003216 size_t *lenp, loff_t *ppos)
3217{
3218 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003219 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003220 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003221 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003222
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003223 memcpy(&ctl, __ctl, sizeof(ctl));
3224 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003225 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003226
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003227 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003228 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003229 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003230 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003231
3232 return -EINVAL;
3233}
3234
Al Viroeeb61f72008-07-27 08:59:33 +01003235static ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003236 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003237 .procname = "gc_thresh",
3238 .data = &ipv4_dst_ops.gc_thresh,
3239 .maxlen = sizeof(int),
3240 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003241 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003242 },
3243 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003244 .procname = "max_size",
3245 .data = &ip_rt_max_size,
3246 .maxlen = sizeof(int),
3247 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003248 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003249 },
3250 {
3251 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003252
Linus Torvalds1da177e2005-04-16 15:20:36 -07003253 .procname = "gc_min_interval",
3254 .data = &ip_rt_gc_min_interval,
3255 .maxlen = sizeof(int),
3256 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003257 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003258 },
3259 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003260 .procname = "gc_min_interval_ms",
3261 .data = &ip_rt_gc_min_interval,
3262 .maxlen = sizeof(int),
3263 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003264 .proc_handler = proc_dointvec_ms_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003265 },
3266 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003267 .procname = "gc_timeout",
3268 .data = &ip_rt_gc_timeout,
3269 .maxlen = sizeof(int),
3270 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003271 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003272 },
3273 {
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003274 .procname = "gc_interval",
3275 .data = &ip_rt_gc_interval,
3276 .maxlen = sizeof(int),
3277 .mode = 0644,
3278 .proc_handler = proc_dointvec_jiffies,
3279 },
3280 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003281 .procname = "redirect_load",
3282 .data = &ip_rt_redirect_load,
3283 .maxlen = sizeof(int),
3284 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003285 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003286 },
3287 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003288 .procname = "redirect_number",
3289 .data = &ip_rt_redirect_number,
3290 .maxlen = sizeof(int),
3291 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003292 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003293 },
3294 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003295 .procname = "redirect_silence",
3296 .data = &ip_rt_redirect_silence,
3297 .maxlen = sizeof(int),
3298 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003299 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003300 },
3301 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003302 .procname = "error_cost",
3303 .data = &ip_rt_error_cost,
3304 .maxlen = sizeof(int),
3305 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003306 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003307 },
3308 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003309 .procname = "error_burst",
3310 .data = &ip_rt_error_burst,
3311 .maxlen = sizeof(int),
3312 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003313 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003314 },
3315 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003316 .procname = "gc_elasticity",
3317 .data = &ip_rt_gc_elasticity,
3318 .maxlen = sizeof(int),
3319 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003320 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003321 },
3322 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003323 .procname = "mtu_expires",
3324 .data = &ip_rt_mtu_expires,
3325 .maxlen = sizeof(int),
3326 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003327 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003328 },
3329 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003330 .procname = "min_pmtu",
3331 .data = &ip_rt_min_pmtu,
3332 .maxlen = sizeof(int),
3333 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003334 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003335 },
3336 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003337 .procname = "min_adv_mss",
3338 .data = &ip_rt_min_advmss,
3339 .maxlen = sizeof(int),
3340 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003341 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003342 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003343 { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003344};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003345
Al Viro2f4520d2008-08-25 15:17:44 -07003346static struct ctl_table empty[1];
3347
3348static struct ctl_table ipv4_skeleton[] =
3349{
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003350 { .procname = "route",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003351 .mode = 0555, .child = ipv4_route_table},
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003352 { .procname = "neigh",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003353 .mode = 0555, .child = empty},
Al Viro2f4520d2008-08-25 15:17:44 -07003354 { }
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003355};
3356
Al Viro2f4520d2008-08-25 15:17:44 -07003357static __net_initdata struct ctl_path ipv4_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003358 { .procname = "net", },
3359 { .procname = "ipv4", },
Al Viro2f4520d2008-08-25 15:17:44 -07003360 { },
3361};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003362
3363static struct ctl_table ipv4_route_flush_table[] = {
3364 {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003365 .procname = "flush",
3366 .maxlen = sizeof(int),
3367 .mode = 0200,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003368 .proc_handler = ipv4_sysctl_rtcache_flush,
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003369 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003370 { },
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003371};
3372
Al Viro2f4520d2008-08-25 15:17:44 -07003373static __net_initdata struct ctl_path ipv4_route_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003374 { .procname = "net", },
3375 { .procname = "ipv4", },
3376 { .procname = "route", },
Al Viro2f4520d2008-08-25 15:17:44 -07003377 { },
3378};
3379
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003380static __net_init int sysctl_route_net_init(struct net *net)
3381{
3382 struct ctl_table *tbl;
3383
3384 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003385 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003386 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3387 if (tbl == NULL)
3388 goto err_dup;
3389 }
3390 tbl[0].extra1 = net;
3391
3392 net->ipv4.route_hdr =
3393 register_net_sysctl_table(net, ipv4_route_path, tbl);
3394 if (net->ipv4.route_hdr == NULL)
3395 goto err_reg;
3396 return 0;
3397
3398err_reg:
3399 if (tbl != ipv4_route_flush_table)
3400 kfree(tbl);
3401err_dup:
3402 return -ENOMEM;
3403}
3404
3405static __net_exit void sysctl_route_net_exit(struct net *net)
3406{
3407 struct ctl_table *tbl;
3408
3409 tbl = net->ipv4.route_hdr->ctl_table_arg;
3410 unregister_net_sysctl_table(net->ipv4.route_hdr);
3411 BUG_ON(tbl == ipv4_route_flush_table);
3412 kfree(tbl);
3413}
3414
3415static __net_initdata struct pernet_operations sysctl_route_ops = {
3416 .init = sysctl_route_net_init,
3417 .exit = sysctl_route_net_exit,
3418};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003419#endif
3420
Neil Horman3ee94372010-05-08 01:57:52 -07003421static __net_init int rt_genid_init(struct net *net)
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003422{
Neil Horman3ee94372010-05-08 01:57:52 -07003423 get_random_bytes(&net->ipv4.rt_genid,
3424 sizeof(net->ipv4.rt_genid));
David S. Miller436c3b62011-03-24 17:42:21 -07003425 get_random_bytes(&net->ipv4.dev_addr_genid,
3426 sizeof(net->ipv4.dev_addr_genid));
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003427 return 0;
3428}
3429
Neil Horman3ee94372010-05-08 01:57:52 -07003430static __net_initdata struct pernet_operations rt_genid_ops = {
3431 .init = rt_genid_init,
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003432};
3433
3434
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area.  ip_rt_init() allocates it with room for
 * 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003438
3439static __initdata unsigned long rhash_entries;
3440static int __init set_rhash_entries(char *str)
3441{
3442 if (!str)
3443 return 0;
3444 rhash_entries = simple_strtoul(str, &str, 0);
3445 return 1;
3446}
3447__setup("rhash_entries=", set_rhash_entries);
3448
3449int __init ip_rt_init(void)
3450{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003451 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003452
Patrick McHardyc7066f72011-01-14 13:36:42 +01003453#ifdef CONFIG_IP_ROUTE_CLASSID
Ingo Molnar0dcec8c2009-02-25 14:07:33 +01003454 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003455 if (!ip_rt_acct)
3456 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003457#endif
3458
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003459 ipv4_dst_ops.kmem_cachep =
3460 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003461 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003462
David S. Miller14e50e52007-05-24 18:17:54 -07003463 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3464
Eric Dumazetfc66f952010-10-08 06:37:34 +00003465 if (dst_entries_init(&ipv4_dst_ops) < 0)
3466 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3467
3468 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3469 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3470
Eric Dumazet424c4b72005-07-05 14:58:19 -07003471 rt_hash_table = (struct rt_hash_bucket *)
3472 alloc_large_system_hash("IP route cache",
3473 sizeof(struct rt_hash_bucket),
3474 rhash_entries,
Jan Beulich44813742009-09-21 17:03:05 -07003475 (totalram_pages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003476 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003477 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003478 &rt_hash_log,
3479 &rt_hash_mask,
Anton Blanchardc9503e02009-04-27 05:42:24 -07003480 rhash_entries ? 0 : 512 * 1024);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003481 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3482 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003483
3484 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3485 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3486
Linus Torvalds1da177e2005-04-16 15:20:36 -07003487 devinet_init();
3488 ip_fib_init();
3489
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003490 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3491 expires_ljiffies = jiffies;
3492 schedule_delayed_work(&expires_work,
3493 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3494
Denis V. Lunev73b38712008-02-28 20:51:18 -08003495 if (ip_rt_proc_init())
Joe Perches058bd4d2012-03-11 18:36:11 +00003496 pr_err("Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003497#ifdef CONFIG_XFRM
3498 xfrm_init();
Neil Hormana33bc5c2009-07-30 18:52:15 -07003499 xfrm4_init(ip_rt_max_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003500#endif
Greg Rosec7ac8672011-06-10 01:27:09 +00003501 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
Thomas Graf63f34442007-03-22 11:55:17 -07003502
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003503#ifdef CONFIG_SYSCTL
3504 register_pernet_subsys(&sysctl_route_ops);
3505#endif
Neil Horman3ee94372010-05-08 01:57:52 -07003506 register_pernet_subsys(&rt_genid_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003507 return rc;
3508}
3509
Al Viroa1bc6eb2008-07-30 06:32:52 -04003510#ifdef CONFIG_SYSCTL
Al Viroeeb61f72008-07-27 08:59:33 +01003511/*
3512 * We really need to sanitize the damn ipv4 init order, then all
3513 * this nonsense will go away.
3514 */
3515void __init ip_static_sysctl_init(void)
3516{
Al Viro2f4520d2008-08-25 15:17:44 -07003517 register_sysctl_paths(ipv4_path, ipv4_skeleton);
Al Viroeeb61f72008-07-27 08:59:33 +01003518}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003519#endif