blob: 171f483b21d535f9b8fa2735fc578036ff518910 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090023 * Bradford Johnson: Fix faulty handling of some frames when
Linus Torvalds1da177e2005-04-16 15:20:36 -070024 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090034 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
Linus Torvalds1da177e2005-04-16 15:20:36 -070037 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070050#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -040053#include <linux/highmem.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090054#include <linux/slab.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070055
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
Patrick McHardycfacb052006-01-08 22:36:54 -080070#include <net/xfrm.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075#include <net/checksum.h>
76#include <net/inetpeer.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070077#include <linux/igmp.h>
78#include <linux/netfilter_ipv4.h>
79#include <linux/netfilter_bridge.h>
80#include <linux/mroute.h>
81#include <linux/netlink.h>
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -070082#include <linux/tcp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083
/*
 * Global integer initialised to IPDEFTTL and exported to modules.
 * NOTE(review): by its name this is the sysctl-tunable default TTL for
 * outgoing packets; the sysctl table is not visible in this part of the
 * file - confirm against the sysctl registration.
 */
Brian Haleyab32ea52006-09-22 14:15:41 -070084int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
David S. Miller323e1262010-12-12 21:55:08 -080085EXPORT_SYMBOL(sysctl_ip_default_ttl);
Linus Torvalds1da177e2005-04-16 15:20:36 -070086
87/* Generate a checksum for an outgoing IP datagram. */
88__inline__ void ip_send_check(struct iphdr *iph)
89{
90 iph->check = 0;
91 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92}
Eric Dumazet4bc2f182010-07-09 21:22:10 +000093EXPORT_SYMBOL(ip_send_check);
Linus Torvalds1da177e2005-04-16 15:20:36 -070094
Herbert Xuc439cb22008-01-11 19:14:00 -080095int __ip_local_out(struct sk_buff *skb)
96{
97 struct iphdr *iph = ip_hdr(skb);
98
99 iph->tot_len = htons(skb->len);
100 ip_send_check(iph);
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100101 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 skb_dst(skb)->dev, dst_output);
Herbert Xuc439cb22008-01-11 19:14:00 -0800103}
104
/*
 * Send a locally generated packet: run __ip_local_out() and, if it
 * returns 1, hand the packet to dst_output(). Any other value from
 * __ip_local_out() is returned to the caller unchanged.
 */
int ip_local_out(struct sk_buff *skb)
{
	int verdict = __ip_local_out(skb);

	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
116
Linus Torvalds1da177e2005-04-16 15:20:36 -0700117/* dev_loopback_xmit for use with netfilter. */
118static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119{
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -0700120 skb_reset_mac_header(newskb);
Arnaldo Carvalho de Melobbe735e2007-03-10 22:16:10 -0300121 __skb_pull(newskb, skb_network_offset(newskb));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122 newskb->pkt_type = PACKET_LOOPBACK;
123 newskb->ip_summed = CHECKSUM_UNNECESSARY;
Eric Dumazetadf30902009-06-02 05:19:30 +0000124 WARN_ON(!skb_dst(newskb));
Eric Dumazete30b38c2010-04-15 09:13:03 +0000125 netif_rx_ni(newskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700126 return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131 int ttl = inet->uc_ttl;
132
133 if (ttl < 0)
David S. Miller323e1262010-12-12 21:55:08 -0800134 ttl = ip4_dst_hoplimit(dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135 return ttl;
136}
137
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900138/*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700139 * Add an ip header to a skbuff and send it out.
140 *
141 */
142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
Al Viro13d8eaa02006-09-26 22:27:30 -0700143 __be32 saddr, __be32 daddr, struct ip_options *opt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144{
145 struct inet_sock *inet = inet_sk(sk);
Eric Dumazet511c3f92009-06-02 05:14:27 +0000146 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147 struct iphdr *iph;
148
149 /* Build the IP header. */
Arnaldo Carvalho de Melo8856dfa2007-03-10 19:40:39 -0300150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
151 skb_reset_network_header(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700152 iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153 iph->version = 4;
154 iph->ihl = 5;
155 iph->tos = inet->tos;
Changli Gaod8d1f302010-06-10 23:31:35 -0700156 if (ip_dont_fragment(sk, &rt->dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700157 iph->frag_off = htons(IP_DF);
158 else
159 iph->frag_off = 0;
Changli Gaod8d1f302010-06-10 23:31:35 -0700160 iph->ttl = ip_select_ttl(inet, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161 iph->daddr = rt->rt_dst;
162 iph->saddr = rt->rt_src;
163 iph->protocol = sk->sk_protocol;
Changli Gaod8d1f302010-06-10 23:31:35 -0700164 ip_select_ident(iph, &rt->dst, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165
166 if (opt && opt->optlen) {
167 iph->ihl += opt->optlen>>2;
168 ip_options_build(skb, opt, daddr, rt, 0);
169 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700170
171 skb->priority = sk->sk_priority;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800172 skb->mark = sk->sk_mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700173
174 /* Send it out. */
Herbert Xuc439cb22008-01-11 19:14:00 -0800175 return ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700176}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -0700177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
Linus Torvalds1da177e2005-04-16 15:20:36 -0700179static inline int ip_finish_output2(struct sk_buff *skb)
180{
Eric Dumazetadf30902009-06-02 05:19:30 +0000181 struct dst_entry *dst = skb_dst(skb);
Mitsuru Chinen80787eb2007-04-30 00:48:20 -0700182 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183 struct net_device *dev = dst->dev;
Chuck Leverc2636b42007-10-23 21:07:32 -0700184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700185
Neil Hormanedf391f2009-04-27 02:45:02 -0700186 if (rt->rt_type == RTN_MULTICAST) {
187 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
188 } else if (rt->rt_type == RTN_BROADCAST)
189 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
Mitsuru Chinen80787eb2007-04-30 00:48:20 -0700190
Linus Torvalds1da177e2005-04-16 15:20:36 -0700191 /* Be paranoid, rather than too clever. */
Stephen Hemminger3b04ddd2007-10-09 01:40:57 -0700192 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700193 struct sk_buff *skb2;
194
195 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
196 if (skb2 == NULL) {
197 kfree_skb(skb);
198 return -ENOMEM;
199 }
200 if (skb->sk)
201 skb_set_owner_w(skb2, skb->sk);
202 kfree_skb(skb);
203 skb = skb2;
204 }
205
Stephen Hemminger3644f0c2006-12-07 15:08:17 -0800206 if (dst->hh)
207 return neigh_hh_output(dst->hh, skb);
208 else if (dst->neighbour)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700209 return dst->neighbour->output(skb);
210
211 if (net_ratelimit())
212 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213 kfree_skb(skb);
214 return -EINVAL;
215}
216
John Heffner628a5c52007-04-20 15:53:27 -0700217static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218{
219 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220
221 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
Eric Dumazetadf30902009-06-02 05:19:30 +0000222 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
John Heffner628a5c52007-04-20 15:53:27 -0700223}
224
Patrick McHardy861d0482007-10-15 01:48:39 -0700225static int ip_finish_output(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700226{
Patrick McHardy5c901da2006-01-06 23:05:36 -0800227#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228 /* Policy lookup after SNAT yielded a new policy */
Eric Dumazetadf30902009-06-02 05:19:30 +0000229 if (skb_dst(skb)->xfrm != NULL) {
Patrick McHardy48d5cad2006-02-15 15:10:22 -0800230 IPCB(skb)->flags |= IPSKB_REROUTED;
231 return dst_output(skb);
232 }
Patrick McHardy5c901da2006-01-06 23:05:36 -0800233#endif
John Heffner628a5c52007-04-20 15:53:27 -0700234 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
Patrick McHardy1bd9bef2006-01-05 12:20:59 -0800235 return ip_fragment(skb, ip_finish_output2);
236 else
237 return ip_finish_output2(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700238}
239
240int ip_mc_output(struct sk_buff *skb)
241{
242 struct sock *sk = skb->sk;
Eric Dumazet511c3f92009-06-02 05:14:27 +0000243 struct rtable *rt = skb_rtable(skb);
Changli Gaod8d1f302010-06-10 23:31:35 -0700244 struct net_device *dev = rt->dst.dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700245
246 /*
247 * If the indicated interface is up and running, send the packet.
248 */
Neil Hormanedf391f2009-04-27 02:45:02 -0700249 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250
251 skb->dev = dev;
252 skb->protocol = htons(ETH_P_IP);
253
254 /*
255 * Multicasts are looped back for other local users
256 */
257
258 if (rt->rt_flags&RTCF_MULTICAST) {
Octavian Purdila7ad68482010-01-06 20:37:01 -0800259 if (sk_mc_loop(sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260#ifdef CONFIG_IP_MROUTE
261 /* Small optimization: do not loopback not local frames,
262 which returned after forwarding; they will be dropped
263 by ip_mr_input in any case.
264 Note, that local frames are looped back to be delivered
265 to local recipients.
266
267 This check is duplicated in ip_mr_input at the moment.
268 */
Joe Perches9d4fb272009-11-23 10:41:23 -0800269 &&
270 ((rt->rt_flags & RTCF_LOCAL) ||
271 !(IPCB(skb)->flags & IPSKB_FORWARDED))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272#endif
Joe Perches9d4fb272009-11-23 10:41:23 -0800273 ) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700274 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
275 if (newskb)
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100276 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
277 newskb, NULL, newskb->dev,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278 ip_dev_loopback_xmit);
279 }
280
281 /* Multicasts with ttl 0 must not go beyond the host */
282
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700283 if (ip_hdr(skb)->ttl == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 kfree_skb(skb);
285 return 0;
286 }
287 }
288
289 if (rt->rt_flags&RTCF_BROADCAST) {
290 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
291 if (newskb)
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100292 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
293 NULL, newskb->dev, ip_dev_loopback_xmit);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700294 }
295
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100296 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
297 skb->dev, ip_finish_output,
Patrick McHardy48d5cad2006-02-15 15:10:22 -0800298 !(IPCB(skb)->flags & IPSKB_REROUTED));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700299}
300
301int ip_output(struct sk_buff *skb)
302{
Eric Dumazetadf30902009-06-02 05:19:30 +0000303 struct net_device *dev = skb_dst(skb)->dev;
Patrick McHardy1bd9bef2006-01-05 12:20:59 -0800304
Neil Hormanedf391f2009-04-27 02:45:02 -0700305 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700306
Patrick McHardy1bd9bef2006-01-05 12:20:59 -0800307 skb->dev = dev;
308 skb->protocol = htons(ETH_P_IP);
309
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100310 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900311 ip_finish_output,
Patrick McHardy48d5cad2006-02-15 15:10:22 -0800312 !(IPCB(skb)->flags & IPSKB_REROUTED));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700313}
314
Shan Wei4e15ed42010-04-15 16:43:08 +0000315int ip_queue_xmit(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700316{
David S. Millere89862f2007-01-26 01:04:55 -0800317 struct sock *sk = skb->sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700318 struct inet_sock *inet = inet_sk(sk);
319 struct ip_options *opt = inet->opt;
320 struct rtable *rt;
321 struct iphdr *iph;
Eric Dumazetab6e3fe2010-05-10 11:31:49 +0000322 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700323
324 /* Skip all of this if the packet is already routed,
325 * f.e. by something like SCTP.
326 */
Eric Dumazetab6e3fe2010-05-10 11:31:49 +0000327 rcu_read_lock();
Eric Dumazet511c3f92009-06-02 05:14:27 +0000328 rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329 if (rt != NULL)
330 goto packet_routed;
331
332 /* Make sure we can route this packet. */
333 rt = (struct rtable *)__sk_dst_check(sk, 0);
334 if (rt == NULL) {
Al Viro3ca3c682006-09-27 18:28:07 -0700335 __be32 daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336
337 /* Use correct destination address if we have options. */
Eric Dumazetc720c7e82009-10-15 06:30:45 +0000338 daddr = inet->inet_daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700339 if(opt && opt->srr)
340 daddr = opt->faddr;
341
342 {
343 struct flowi fl = { .oif = sk->sk_bound_dev_if,
Atis Elsts914a9ab2009-10-01 15:16:49 -0700344 .mark = sk->sk_mark,
Changli Gao58116622010-11-12 18:43:55 +0000345 .fl4_dst = daddr,
346 .fl4_src = inet->inet_saddr,
347 .fl4_tos = RT_CONN_FLAGS(sk),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 .proto = sk->sk_protocol,
KOVACS Krisztian86b08d82008-10-01 07:44:42 -0700349 .flags = inet_sk_flowi_flags(sk),
Changli Gao58116622010-11-12 18:43:55 +0000350 .fl_ip_sport = inet->inet_sport,
351 .fl_ip_dport = inet->inet_dport };
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352
353 /* If this fails, retransmit mechanism of transport layer will
354 * keep trying until route appears or the connection times
355 * itself out.
356 */
Venkat Yekkiralabeb8d132006-08-04 23:12:42 -0700357 security_sk_classify_flow(sk, &fl);
David S. Millerb23dd4f2011-03-02 14:31:35 -0800358 rt = ip_route_output_flow(sock_net(sk), &fl, sk);
359 if (IS_ERR(rt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700360 goto no_route;
361 }
Changli Gaod8d1f302010-06-10 23:31:35 -0700362 sk_setup_caps(sk, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700363 }
Changli Gaod8d1f302010-06-10 23:31:35 -0700364 skb_dst_set_noref(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700365
366packet_routed:
367 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
368 goto no_route;
369
370 /* OK, we know where to send it, allocate and build IP header. */
Arnaldo Carvalho de Melo8856dfa2007-03-10 19:40:39 -0300371 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
372 skb_reset_network_header(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700373 iph = ip_hdr(skb);
Al Viro714e85b2006-11-14 20:51:49 -0800374 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
Changli Gaod8d1f302010-06-10 23:31:35 -0700375 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700376 iph->frag_off = htons(IP_DF);
377 else
378 iph->frag_off = 0;
Changli Gaod8d1f302010-06-10 23:31:35 -0700379 iph->ttl = ip_select_ttl(inet, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380 iph->protocol = sk->sk_protocol;
381 iph->saddr = rt->rt_src;
382 iph->daddr = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383 /* Transport layer set skb->h.foo itself. */
384
385 if (opt && opt->optlen) {
386 iph->ihl += opt->optlen >> 2;
Eric Dumazetc720c7e82009-10-15 06:30:45 +0000387 ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700388 }
389
Changli Gaod8d1f302010-06-10 23:31:35 -0700390 ip_select_ident_more(iph, &rt->dst, sk,
Herbert Xu79671682006-06-22 02:40:14 -0700391 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700392
Linus Torvalds1da177e2005-04-16 15:20:36 -0700393 skb->priority = sk->sk_priority;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800394 skb->mark = sk->sk_mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700395
Eric Dumazetab6e3fe2010-05-10 11:31:49 +0000396 res = ip_local_out(skb);
397 rcu_read_unlock();
398 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700399
400no_route:
Eric Dumazetab6e3fe2010-05-10 11:31:49 +0000401 rcu_read_unlock();
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700402 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700403 kfree_skb(skb);
404 return -EHOSTUNREACH;
405}
Eric Dumazet4bc2f182010-07-09 21:22:10 +0000406EXPORT_SYMBOL(ip_queue_xmit);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700407
408
409static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
410{
411 to->pkt_type = from->pkt_type;
412 to->priority = from->priority;
413 to->protocol = from->protocol;
Eric Dumazetadf30902009-06-02 05:19:30 +0000414 skb_dst_drop(to);
Eric Dumazetfe76cda2010-07-01 23:48:22 +0000415 skb_dst_copy(to, from);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700416 to->dev = from->dev;
Thomas Graf82e91ff2006-11-09 15:19:14 -0800417 to->mark = from->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700418
419 /* Copy the flags to each fragment. */
420 IPCB(to)->flags = IPCB(from)->flags;
421
422#ifdef CONFIG_NET_SCHED
423 to->tc_index = from->tc_index;
424#endif
Yasuyuki Kozakaie7ac05f2007-03-14 16:44:01 -0700425 nf_copy(to, from);
Jozsef Kadlecsikba9dda32007-07-07 22:21:23 -0700426#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
427 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
428 to->nf_trace = from->nf_trace;
429#endif
Julian Anastasovc98d80e2005-10-22 13:39:21 +0300430#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
431 to->ipvs_property = from->ipvs_property;
432#endif
James Morris984bc162006-06-09 00:29:17 -0700433 skb_copy_secmark(to, from);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700434}
435
436/*
437 * This IP datagram is too large to be sent in one piece. Break it up into
438 * smaller pieces (each of size equal to IP header plus
439 * a block of the data of the original IP data part) that will yet fit in a
440 * single device frame, and queue such a frame for sending.
441 */
442
Jianjun Kongd93191002008-11-03 00:23:42 -0800443int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700444{
445 struct iphdr *iph;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700446 int ptr;
447 struct net_device *dev;
448 struct sk_buff *skb2;
Changli Gaoc893b802010-07-31 13:25:08 +0000449 unsigned int mtu, hlen, left, len, ll_rs;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450 int offset;
Alexey Dobriyan76ab6082006-01-06 13:24:29 -0800451 __be16 not_last_frag;
Eric Dumazet511c3f92009-06-02 05:14:27 +0000452 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700453 int err = 0;
454
Changli Gaod8d1f302010-06-10 23:31:35 -0700455 dev = rt->dst.dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700456
457 /*
458 * Point into the IP datagram header.
459 */
460
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700461 iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700462
463 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700464 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700465 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
John Heffner628a5c52007-04-20 15:53:27 -0700466 htonl(ip_skb_dst_mtu(skb)));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700467 kfree_skb(skb);
468 return -EMSGSIZE;
469 }
470
471 /*
472 * Setup starting values.
473 */
474
475 hlen = iph->ihl * 4;
Changli Gaod8d1f302010-06-10 23:31:35 -0700476 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
Bart De Schuymer6c79bf02010-04-20 16:22:01 +0200477#ifdef CONFIG_BRIDGE_NETFILTER
478 if (skb->nf_bridge)
479 mtu -= nf_bridge_mtu_reduction(skb);
480#endif
Herbert Xu89cee8b2005-12-13 23:14:27 -0800481 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700482
483 /* When frag_list is given, use it. First, check its validity:
484 * some transformers could create wrong frag_list or break existing
485 * one, it is not prohibited. In this case fall back to copying.
486 *
487 * LATER: this step can be merged to real generation of fragments,
488 * we can switch to copy when see the first bad fragment.
489 */
David S. Miller21dc3302010-08-23 00:13:46 -0700490 if (skb_has_frag_list(skb)) {
Eric Dumazet3d130082010-09-21 08:47:45 +0000491 struct sk_buff *frag, *frag2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700492 int first_len = skb_pagelen(skb);
493
494 if (first_len - hlen > mtu ||
495 ((first_len - hlen) & 7) ||
496 (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
497 skb_cloned(skb))
498 goto slow_path;
499
David S. Millerd7fcf1a2009-06-09 00:19:37 -0700500 skb_walk_frags(skb, frag) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700501 /* Correct geometry. */
502 if (frag->len > mtu ||
503 ((frag->len & 7) && frag->next) ||
504 skb_headroom(frag) < hlen)
Eric Dumazet3d130082010-09-21 08:47:45 +0000505 goto slow_path_clean;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700506
507 /* Partially cloned skb? */
508 if (skb_shared(frag))
Eric Dumazet3d130082010-09-21 08:47:45 +0000509 goto slow_path_clean;
Herbert Xu2fdba6b2005-05-18 22:52:33 -0700510
511 BUG_ON(frag->sk);
512 if (skb->sk) {
Herbert Xu2fdba6b2005-05-18 22:52:33 -0700513 frag->sk = skb->sk;
514 frag->destructor = sock_wfree;
Herbert Xu2fdba6b2005-05-18 22:52:33 -0700515 }
Eric Dumazet3d130082010-09-21 08:47:45 +0000516 skb->truesize -= frag->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517 }
518
519 /* Everything is OK. Generate! */
520
521 err = 0;
522 offset = 0;
523 frag = skb_shinfo(skb)->frag_list;
David S. Millerd7fcf1a2009-06-09 00:19:37 -0700524 skb_frag_list_init(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700525 skb->data_len = first_len - skb_headlen(skb);
526 skb->len = first_len;
527 iph->tot_len = htons(first_len);
528 iph->frag_off = htons(IP_MF);
529 ip_send_check(iph);
530
531 for (;;) {
532 /* Prepare header of the next frame,
533 * before previous one went down. */
534 if (frag) {
535 frag->ip_summed = CHECKSUM_NONE;
Arnaldo Carvalho de Melobadff6d2007-03-13 13:06:52 -0300536 skb_reset_transport_header(frag);
Arnaldo Carvalho de Meloe2d1bca2007-04-10 20:46:21 -0700537 __skb_push(frag, hlen);
538 skb_reset_network_header(frag);
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -0700539 memcpy(skb_network_header(frag), iph, hlen);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700540 iph = ip_hdr(frag);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700541 iph->tot_len = htons(frag->len);
542 ip_copy_metadata(frag, skb);
543 if (offset == 0)
544 ip_options_fragment(frag);
545 offset += skb->len - hlen;
546 iph->frag_off = htons(offset>>3);
547 if (frag->next != NULL)
548 iph->frag_off |= htons(IP_MF);
549 /* Ready, complete checksum */
550 ip_send_check(iph);
551 }
552
553 err = output(skb);
554
Wei Dongdafee492006-08-02 13:41:21 -0700555 if (!err)
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700556 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700557 if (err || !frag)
558 break;
559
560 skb = frag;
561 frag = skb->next;
562 skb->next = NULL;
563 }
564
565 if (err == 0) {
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700566 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700567 return 0;
568 }
569
570 while (frag) {
571 skb = frag->next;
572 kfree_skb(frag);
573 frag = skb;
574 }
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700575 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700576 return err;
Eric Dumazet3d130082010-09-21 08:47:45 +0000577
578slow_path_clean:
579 skb_walk_frags(skb, frag2) {
580 if (frag2 == frag)
581 break;
582 frag2->sk = NULL;
583 frag2->destructor = NULL;
584 skb->truesize += frag2->truesize;
585 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700586 }
587
588slow_path:
589 left = skb->len - hlen; /* Space per frame */
George Kadianakis49085bd2010-07-06 11:44:12 +0000590 ptr = hlen; /* Where to start from */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700591
Linus Torvalds1da177e2005-04-16 15:20:36 -0700592 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
Stephen Hemminger9bcfcaf2006-08-29 17:48:57 -0700593 * we need to make room for the encapsulating header
594 */
Changli Gaoc893b802010-07-31 13:25:08 +0000595 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
Stephen Hemminger9bcfcaf2006-08-29 17:48:57 -0700596
Linus Torvalds1da177e2005-04-16 15:20:36 -0700597 /*
598 * Fragment the datagram.
599 */
600
601 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
602 not_last_frag = iph->frag_off & htons(IP_MF);
603
604 /*
605 * Keep copying data until we run out.
606 */
607
Stephen Hemminger132adf52007-03-08 20:44:43 -0800608 while (left > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700609 len = left;
610 /* IF: it doesn't fit, use 'mtu' - the data space left */
611 if (len > mtu)
612 len = mtu;
613 /* IF: we are not sending upto and including the packet end
614 then align the next start on an eight byte boundary */
615 if (len < left) {
616 len &= ~7;
617 }
618 /*
619 * Allocate buffer.
620 */
621
622 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
Patrick McHardy64ce2072005-08-09 20:50:53 -0700623 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700624 err = -ENOMEM;
625 goto fail;
626 }
627
628 /*
629 * Set up data on packet
630 */
631
632 ip_copy_metadata(skb2, skb);
633 skb_reserve(skb2, ll_rs);
634 skb_put(skb2, len + hlen);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -0700635 skb_reset_network_header(skb2);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -0700636 skb2->transport_header = skb2->network_header + hlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700637
638 /*
639 * Charge the memory for the fragment to any owner
640 * it might possess
641 */
642
643 if (skb->sk)
644 skb_set_owner_w(skb2, skb->sk);
645
646 /*
647 * Copy the packet header into the new buffer.
648 */
649
Arnaldo Carvalho de Melod626f622007-03-27 18:55:52 -0300650 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700651
652 /*
653 * Copy a block of the IP datagram.
654 */
Arnaldo Carvalho de Melobff9b612007-03-16 17:19:57 -0300655 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700656 BUG();
657 left -= len;
658
659 /*
660 * Fill in the new header fields.
661 */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700662 iph = ip_hdr(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700663 iph->frag_off = htons((offset >> 3));
664
665 /* ANK: dirty, but effective trick. Upgrade options only if
666 * the segment to be fragmented was THE FIRST (otherwise,
667 * options are already fixed) and make it ONCE
668 * on the initial skb, so that all the following fragments
669 * will inherit fixed options.
670 */
671 if (offset == 0)
672 ip_options_fragment(skb);
673
674 /*
675 * Added AC : If we are fragmenting a fragment that's not the
676 * last fragment then keep MF on each bit
677 */
678 if (left > 0 || not_last_frag)
679 iph->frag_off |= htons(IP_MF);
680 ptr += len;
681 offset += len;
682
683 /*
684 * Put this fragment into the sending queue.
685 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700686 iph->tot_len = htons(len + hlen);
687
688 ip_send_check(iph);
689
690 err = output(skb2);
691 if (err)
692 goto fail;
Wei Dongdafee492006-08-02 13:41:21 -0700693
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700694 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700695 }
696 kfree_skb(skb);
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700697 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700698 return err;
699
700fail:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900701 kfree_skb(skb);
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700702 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703 return err;
704}
Patrick McHardy2e2f7ae2006-04-04 13:42:35 -0700705EXPORT_SYMBOL(ip_fragment);
706
Linus Torvalds1da177e2005-04-16 15:20:36 -0700707int
708ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
709{
710 struct iovec *iov = from;
711
Patrick McHardy84fa7932006-08-29 16:44:56 -0700712 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700713 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
714 return -EFAULT;
715 } else {
Al Viro44bb9362006-11-14 21:36:14 -0800716 __wsum csum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700717 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
718 return -EFAULT;
719 skb->csum = csum_block_add(skb->csum, csum, odd);
720 }
721 return 0;
722}
Eric Dumazet4bc2f182010-07-09 21:22:10 +0000723EXPORT_SYMBOL(ip_generic_getfrag);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700724
Al Viro44bb9362006-11-14 21:36:14 -0800725static inline __wsum
Linus Torvalds1da177e2005-04-16 15:20:36 -0700726csum_page(struct page *page, int offset, int copy)
727{
728 char *kaddr;
Al Viro44bb9362006-11-14 21:36:14 -0800729 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700730 kaddr = kmap(page);
731 csum = csum_partial(kaddr + offset, copy, 0);
732 kunmap(page);
733 return csum;
734}
735
Adrian Bunk4b30b1c2005-11-29 16:27:20 -0800736static inline int ip_ufo_append_data(struct sock *sk,
Herbert Xu1470ddf2011-03-01 02:36:47 +0000737 struct sk_buff_head *queue,
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700738 int getfrag(void *from, char *to, int offset, int len,
739 int odd, struct sk_buff *skb),
740 void *from, int length, int hh_len, int fragheaderlen,
Jianjun Kongd93191002008-11-03 00:23:42 -0800741 int transhdrlen, int mtu, unsigned int flags)
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700742{
743 struct sk_buff *skb;
744 int err;
745
746 /* There is support for UDP fragmentation offload by network
747 * device, so create one single skb packet containing complete
748 * udp datagram
749 */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000750 if ((skb = skb_peek_tail(queue)) == NULL) {
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700751 skb = sock_alloc_send_skb(sk,
752 hh_len + fragheaderlen + transhdrlen + 20,
753 (flags & MSG_DONTWAIT), &err);
754
755 if (skb == NULL)
756 return err;
757
758 /* reserve space for Hardware header */
759 skb_reserve(skb, hh_len);
760
761 /* create space for UDP/IP header */
Jianjun Kongd93191002008-11-03 00:23:42 -0800762 skb_put(skb, fragheaderlen + transhdrlen);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700763
764 /* initialize network header pointer */
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -0700765 skb_reset_network_header(skb);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700766
767 /* initialize protocol header pointer */
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -0700768 skb->transport_header = skb->network_header + fragheaderlen;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700769
Patrick McHardy84fa7932006-08-29 16:44:56 -0700770 skb->ip_summed = CHECKSUM_PARTIAL;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700771 skb->csum = 0;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700772
Kostya Bbe9164e2008-04-29 22:36:30 -0700773 /* specify the length of each IP datagram fragment */
Herbert Xu79671682006-06-22 02:40:14 -0700774 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
Herbert Xuf83ef8c2006-06-30 13:37:03 -0700775 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
Herbert Xu1470ddf2011-03-01 02:36:47 +0000776 __skb_queue_tail(queue, skb);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700777 }
Kostya Bbe9164e2008-04-29 22:36:30 -0700778
779 return skb_append_datato_frags(sk, skb, getfrag, from,
780 (length - transhdrlen));
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700781}
782
/*
 * __ip_append_data - append @length bytes, fetched through @getfrag from
 * @from, to the pending datagram held on @queue.
 *
 * @cork supplies the per-datagram state read and updated here: the IP
 * options (cork->opt), the fragment size used as mtu (cork->fragsize), the
 * route (cork->dst), the running byte count (cork->length), the tx_flags
 * for the next skb and, for the scatter-gather path, the partially filled
 * page (cork->page / cork->off).
 *
 * A non-zero @transhdrlen marks the first call for a datagram.
 *
 * Returns 0 on success.  Returns -EMSGSIZE (after ip_local_error()) when
 * the datagram would exceed 0xFFFF - fragheaderlen; that early return does
 * not pass through the error label.  Every other failure goes through the
 * error label, which removes the bytes not yet appended from cork->length,
 * bumps IPSTATS_MIB_OUTDISCARDS and returns err: -EFAULT when @getfrag
 * fails, -ENOBUFS or the sock_alloc_send_skb() error when an skb cannot be
 * allocated, -ENOMEM when a page cannot be allocated, -EMSGSIZE when the
 * skb has no free fragment slot left.
 */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000783static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
784 struct inet_cork *cork,
785 int getfrag(void *from, char *to, int offset,
786 int len, int odd, struct sk_buff *skb),
787 void *from, int length, int transhdrlen,
788 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700789{
790 struct inet_sock *inet = inet_sk(sk);
791 struct sk_buff *skb;
792
Herbert Xu07df5292011-03-01 23:00:58 -0800793 struct ip_options *opt = cork->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700794 int hh_len;
795 int exthdrlen;
796 int mtu;
797 int copy;
798 int err;
799 int offset = 0;
800 unsigned int maxfraglen, fragheaderlen;
801 int csummode = CHECKSUM_NONE;
Herbert Xu1470ddf2011-03-01 02:36:47 +0000802 struct rtable *rt = (struct rtable *)cork->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700803
	/* rt->dst.header_len is only accounted for when transhdrlen is
	 * non-zero, i.e. on the first call for a datagram. */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000804 exthdrlen = transhdrlen ? rt->dst.header_len : 0;
805 length += exthdrlen;
806 transhdrlen += exthdrlen;
Herbert Xu07df5292011-03-01 23:00:58 -0800807 mtu = cork->fragsize;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700808
Changli Gaod8d1f302010-06-10 23:31:35 -0700809 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700810
	/* fragheaderlen: IP header plus options.  maxfraglen: that header
	 * plus the largest payload that fits in mtu once rounded down to a
	 * multiple of 8 bytes. */
811 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
812 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
813
Herbert Xu1470ddf2011-03-01 02:36:47 +0000814 if (cork->length + length > 0xFFFF - fragheaderlen) {
Eric Dumazetc720c7e82009-10-15 06:30:45 +0000815 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
816 mtu-exthdrlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700817 return -EMSGSIZE;
818 }
819
820 /*
821 * transhdrlen > 0 means that this is the first fragment and we wish
822 * it won't be fragmented in the future.
823 */
824 if (transhdrlen &&
825 length + fragheaderlen <= mtu &&
Changli Gaod8d1f302010-06-10 23:31:35 -0700826 rt->dst.dev->features & NETIF_F_V4_CSUM &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700827 !exthdrlen)
Patrick McHardy84fa7932006-08-29 16:44:56 -0700828 csummode = CHECKSUM_PARTIAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700829
Herbert Xu1470ddf2011-03-01 02:36:47 +0000830 skb = skb_peek_tail(queue);
Herbert Xu26cde9f2010-06-15 01:52:25 +0000831
	/* Account for the whole request up front; the error label subtracts
	 * whatever is still left in length. */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000832 cork->length += length;
	/* UFO path: taken for UDP on a NETIF_F_UFO device when the request
	 * is larger than mtu or the tail skb is already GSO. */
Herbert Xu26cde9f2010-06-15 01:52:25 +0000833 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
Kostya Bbe9164e2008-04-29 22:36:30 -0700834 (sk->sk_protocol == IPPROTO_UDP) &&
Changli Gaod8d1f302010-06-10 23:31:35 -0700835 (rt->dst.dev->features & NETIF_F_UFO)) {
Herbert Xu1470ddf2011-03-01 02:36:47 +0000836 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
837 hh_len, fragheaderlen, transhdrlen,
838 mtu, flags);
Patrick McHardybaa829d2006-03-12 20:35:12 -0800839 if (err)
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700840 goto error;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700841 return 0;
842 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700843
844 /* So, what's going on in the loop below?
845 *
846 * We use calculated fragment length to generate chained skb,
847 * each of segments is IP fragment ready for sending to network after
848 * adding appropriate IP header.
849 */
850
	/* Empty queue: jump straight into the allocation branch of the loop. */
Herbert Xu26cde9f2010-06-15 01:52:25 +0000851 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700852 goto alloc_new_skb;
853
854 while (length > 0) {
855 /* Check if the remaining data fits into current packet. */
856 copy = mtu - skb->len;
857 if (copy < length)
858 copy = maxfraglen - skb->len;
859 if (copy <= 0) {
860 char *data;
861 unsigned int datalen;
862 unsigned int fraglen;
863 unsigned int fraggap;
864 unsigned int alloclen;
865 struct sk_buff *skb_prev;
866alloc_new_skb:
			/* fraggap: bytes the previous skb holds beyond
			 * maxfraglen; they are moved into the new skb below. */
867 skb_prev = skb;
868 if (skb_prev)
869 fraggap = skb_prev->len - maxfraglen;
870 else
871 fraggap = 0;
872
873 /*
874 * If remaining data exceeds the mtu,
875 * we know we need more fragment(s).
876 */
877 datalen = length + fraggap;
878 if (datalen > mtu - fragheaderlen)
879 datalen = maxfraglen - fragheaderlen;
880 fraglen = datalen + fragheaderlen;
881
			/* With MSG_MORE on a device without scatter-gather,
			 * allocate a full mtu so later appends can be copied
			 * into the linear area. */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900882 if ((flags & MSG_MORE) &&
Changli Gaod8d1f302010-06-10 23:31:35 -0700883 !(rt->dst.dev->features&NETIF_F_SG))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700884 alloclen = mtu;
885 else
Eric Dumazet59104f02010-09-20 20:16:27 +0000886 alloclen = fraglen;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700887
888 /* The last fragment gets additional space at tail.
889 * Note, with MSG_MORE we overallocate on fragments,
890 * because we have no idea what fragment will be
891 * the last.
892 */
Eric Dumazet59104f02010-09-20 20:16:27 +0000893 if (datalen == length + fraggap) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700894 alloclen += rt->dst.trailer_len;
Eric Dumazet59104f02010-09-20 20:16:27 +0000895 /* make sure mtu is not reached */
896 if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
897 datalen -= ALIGN(rt->dst.trailer_len, 8);
898 }
			/* The first skb is allocated with sock_alloc_send_skb().
			 * Later ones use sock_wmalloc(), and only while
			 * sk_wmem_alloc is at most 2 * sk_sndbuf. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700899 if (transhdrlen) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900900 skb = sock_alloc_send_skb(sk,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700901 alloclen + hh_len + 15,
902 (flags & MSG_DONTWAIT), &err);
903 } else {
904 skb = NULL;
905 if (atomic_read(&sk->sk_wmem_alloc) <=
906 2 * sk->sk_sndbuf)
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900907 skb = sock_wmalloc(sk,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700908 alloclen + hh_len + 15, 1,
909 sk->sk_allocation);
910 if (unlikely(skb == NULL))
911 err = -ENOBUFS;
Patrick Ohly51f31ca2009-02-12 05:03:39 +0000912 else
913 /* only the initial fragment is
914 time stamped */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000915 cork->tx_flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700916 }
917 if (skb == NULL)
918 goto error;
919
920 /*
921 * Fill in the control structures
922 */
923 skb->ip_summed = csummode;
924 skb->csum = 0;
925 skb_reserve(skb, hh_len);
Herbert Xu1470ddf2011-03-01 02:36:47 +0000926 skb_shinfo(skb)->tx_flags = cork->tx_flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700927
928 /*
929 * Find where to start putting bytes.
930 */
931 data = skb_put(skb, fraglen);
Arnaldo Carvalho de Meloc14d2452007-03-11 22:39:41 -0300932 skb_set_network_header(skb, exthdrlen);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -0700933 skb->transport_header = (skb->network_header +
934 fragheaderlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700935 data += fragheaderlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700936
			/* Move the fraggap bytes out of the previous skb,
			 * fix both checksums and trim the previous skb back
			 * to maxfraglen. */
937 if (fraggap) {
938 skb->csum = skb_copy_and_csum_bits(
939 skb_prev, maxfraglen,
940 data + transhdrlen, fraggap, 0);
941 skb_prev->csum = csum_sub(skb_prev->csum,
942 skb->csum);
943 data += fraggap;
Herbert Xue9fa4f72006-08-13 20:12:58 -0700944 pskb_trim_unique(skb_prev, maxfraglen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700945 }
946
			/* The new skb is not queued yet, so it is freed here
			 * if fetching the caller's data fails. */
947 copy = datalen - transhdrlen - fraggap;
948 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
949 err = -EFAULT;
950 kfree_skb(skb);
951 goto error;
952 }
953
			/* Header lengths and the checksum mode apply to the
			 * first skb only; reset them for all later ones. */
954 offset += copy;
955 length -= datalen - fraggap;
956 transhdrlen = 0;
957 exthdrlen = 0;
958 csummode = CHECKSUM_NONE;
959
960 /*
961 * Put the packet on the pending queue.
962 */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000963 __skb_queue_tail(queue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700964 continue;
965 }
966
967 if (copy > length)
968 copy = length;
969
		/* No scatter-gather: copy into the skb's linear area, and
		 * trim back to the old length if the copy fails. */
Changli Gaod8d1f302010-06-10 23:31:35 -0700970 if (!(rt->dst.dev->features&NETIF_F_SG)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700971 unsigned int off;
972
973 off = skb->len;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900974 if (getfrag(from, skb_put(skb, copy),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700975 offset, copy, off, skb) < 0) {
976 __skb_trim(skb, off);
977 err = -EFAULT;
978 goto error;
979 }
980 } else {
			/* Scatter-gather: copy into page fragments.
			 * cork->page and cork->off remember the page that is
			 * only partly filled so the next append continues in
			 * it. */
981 int i = skb_shinfo(skb)->nr_frags;
982 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
Herbert Xu1470ddf2011-03-01 02:36:47 +0000983 struct page *page = cork->page;
984 int off = cork->off;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700985 unsigned int left;
986
987 if (page && (left = PAGE_SIZE - off) > 0) {
988 if (copy >= left)
989 copy = left;
				/* The cork page is not the skb's last
				 * fragment: attach it as a new fragment
				 * with its own page reference. */
990 if (page != frag->page) {
991 if (i == MAX_SKB_FRAGS) {
992 err = -EMSGSIZE;
993 goto error;
994 }
995 get_page(page);
Herbert Xu1470ddf2011-03-01 02:36:47 +0000996 skb_fill_page_desc(skb, i, page, off, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700997 frag = &skb_shinfo(skb)->frags[i];
998 }
999 } else if (i < MAX_SKB_FRAGS) {
1000 if (copy > PAGE_SIZE)
1001 copy = PAGE_SIZE;
1002 page = alloc_pages(sk->sk_allocation, 0);
1003 if (page == NULL) {
1004 err = -ENOMEM;
1005 goto error;
1006 }
Herbert Xu1470ddf2011-03-01 02:36:47 +00001007 cork->page = page;
1008 cork->off = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001009
1010 skb_fill_page_desc(skb, i, page, 0, 0);
1011 frag = &skb_shinfo(skb)->frags[i];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001012 } else {
1013 err = -EMSGSIZE;
1014 goto error;
1015 }
1016 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1017 err = -EFAULT;
1018 goto error;
1019 }
			/* Page data grows the fragment, the skb length
			 * fields and the socket's write-memory accounting. */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001020 cork->off += copy;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001021 frag->size += copy;
1022 skb->len += copy;
1023 skb->data_len += copy;
Herbert Xuf945fa72008-01-22 22:39:26 -08001024 skb->truesize += copy;
1025 atomic_add(copy, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001026 }
1027 offset += copy;
1028 length -= copy;
1029 }
1030
1031 return 0;
1032
1033error:
	/* length now holds only the bytes that were not appended. */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001034 cork->length -= length;
Pavel Emelyanov5e38e272008-07-16 20:19:49 -07001035 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001036 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001037}
1038
Herbert Xu1470ddf2011-03-01 02:36:47 +00001039static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1040 struct ipcm_cookie *ipc, struct rtable **rtp)
1041{
1042 struct inet_sock *inet = inet_sk(sk);
1043 struct ip_options *opt;
1044 struct rtable *rt;
1045
1046 /*
1047 * setup for corking.
1048 */
1049 opt = ipc->opt;
1050 if (opt) {
1051 if (cork->opt == NULL) {
1052 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1053 sk->sk_allocation);
1054 if (unlikely(cork->opt == NULL))
1055 return -ENOBUFS;
1056 }
1057 memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1058 cork->flags |= IPCORK_OPT;
1059 cork->addr = ipc->addr;
1060 }
1061 rt = *rtp;
1062 if (unlikely(!rt))
1063 return -EFAULT;
1064 /*
1065 * We steal reference to this route, caller should not release it
1066 */
1067 *rtp = NULL;
1068 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1069 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1070 cork->dst = &rt->dst;
1071 cork->length = 0;
1072 cork->tx_flags = ipc->tx_flags;
1073 cork->page = NULL;
1074 cork->off = 0;
1075
1076 return 0;
1077}
1078
1079/*
1080 * ip_append_data() and ip_append_page() can make one large IP datagram
1081 * from many pieces of data. Each pieces will be holded on the socket
1082 * until ip_push_pending_frames() is called. Each piece can be a page
1083 * or non-page data.
1084 *
1085 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1086 * this interface potentially.
1087 *
1088 * LATER: length must be adjusted by pad at tail, when it is required.
1089 */
1090int ip_append_data(struct sock *sk,
1091 int getfrag(void *from, char *to, int offset, int len,
1092 int odd, struct sk_buff *skb),
1093 void *from, int length, int transhdrlen,
1094 struct ipcm_cookie *ipc, struct rtable **rtp,
1095 unsigned int flags)
1096{
1097 struct inet_sock *inet = inet_sk(sk);
1098 int err;
1099
1100 if (flags&MSG_PROBE)
1101 return 0;
1102
1103 if (skb_queue_empty(&sk->sk_write_queue)) {
1104 err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1105 if (err)
1106 return err;
1107 } else {
1108 transhdrlen = 0;
1109 }
1110
1111 return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1112 from, length, transhdrlen, flags);
1113}
1114
/*
 * ip_append_page - append @size bytes of @page, starting at @offset, to
 * the datagram already pending on sk->sk_write_queue.
 *
 * The page is attached by reference as an skb page fragment (get_page()),
 * or merged into the last fragment when skb_can_coalesce() allows it; the
 * data itself is not copied.
 *
 * Returns 0 on success or for MSG_PROBE, -EPERM when inet->hdrincl is set,
 * -EINVAL when nothing is pending, -EOPNOTSUPP when the output device
 * lacks NETIF_F_SG, -EMSGSIZE when the datagram would exceed
 * 0xFFFF - fragheaderlen or the skb has no free fragment slot, -ENOBUFS
 * when a new skb cannot be allocated.  Failures inside the loop go through
 * the error label, which removes the unappended bytes from
 * inet->cork.length and bumps IPSTATS_MIB_OUTDISCARDS.
 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001115ssize_t ip_append_page(struct sock *sk, struct page *page,
1116 int offset, size_t size, int flags)
1117{
1118 struct inet_sock *inet = inet_sk(sk);
1119 struct sk_buff *skb;
1120 struct rtable *rt;
1121 struct ip_options *opt = NULL;
1122 int hh_len;
1123 int mtu;
1124 int len;
1125 int err;
1126 unsigned int maxfraglen, fragheaderlen, fraggap;
1127
1128 if (inet->hdrincl)
1129 return -EPERM;
1130
1131 if (flags&MSG_PROBE)
1132 return 0;
1133
	/* Pages can only extend a datagram that has already been started. */
1134 if (skb_queue_empty(&sk->sk_write_queue))
1135 return -EINVAL;
1136
YOSHIFUJI Hideakic8cdaf92008-03-10 04:30:37 -04001137 rt = (struct rtable *)inet->cork.dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001138 if (inet->cork.flags & IPCORK_OPT)
1139 opt = inet->cork.opt;
1140
Changli Gaod8d1f302010-06-10 23:31:35 -07001141 if (!(rt->dst.dev->features&NETIF_F_SG))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001142 return -EOPNOTSUPP;
1143
Changli Gaod8d1f302010-06-10 23:31:35 -07001144 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001145 mtu = inet->cork.fragsize;
1146
	/* fragheaderlen: IP header plus options.  maxfraglen: that header
	 * plus the largest payload that fits in mtu once rounded down to a
	 * multiple of 8 bytes. */
1147 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1148 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1149
1150 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
Eric Dumazetc720c7e82009-10-15 06:30:45 +00001151 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001152 return -EMSGSIZE;
1153 }
1154
1155 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1156 return -EINVAL;
1157
	/* Account for the whole request up front; the error label subtracts
	 * whatever is still left in size. */
1158 inet->cork.length += size;
	/* If the page pushes the tail skb past mtu on a UDP socket whose
	 * device has NETIF_F_UFO, mark the tail skb as GSO. */
Herbert Xu26cde9f2010-06-15 01:52:25 +00001159 if ((size + skb->len > mtu) &&
1160 (sk->sk_protocol == IPPROTO_UDP) &&
Changli Gaod8d1f302010-06-10 23:31:35 -07001161 (rt->dst.dev->features & NETIF_F_UFO)) {
Herbert Xu79671682006-06-22 02:40:14 -07001162 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
Herbert Xuf83ef8c2006-06-30 13:37:03 -07001163 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
Herbert Xu79671682006-06-22 02:40:14 -07001164 }
Ananda Rajue89e9cf2005-10-18 15:46:41 -07001165
Linus Torvalds1da177e2005-04-16 15:20:36 -07001166
1167 while (size > 0) {
1168 int i;
1169
		/* A GSO skb takes all remaining bytes in one go. */
Herbert Xu89114af2006-07-08 13:34:32 -07001170 if (skb_is_gso(skb))
Ananda Rajue89e9cf2005-10-18 15:46:41 -07001171 len = size;
1172 else {
1173
1174 /* Check if the remaining data fits into current packet. */
1175 len = mtu - skb->len;
1176 if (len < size)
1177 len = maxfraglen - skb->len;
1178 }
		/* No room left in the tail skb: start a new one. */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001179 if (len <= 0) {
1180 struct sk_buff *skb_prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001181 int alloclen;
1182
			/* fraggap: bytes the previous skb holds beyond
			 * maxfraglen; they are moved into the new skb below. */
1183 skb_prev = skb;
Jayachandran C0d0d2bb2005-10-13 11:43:02 -07001184 fraggap = skb_prev->len - maxfraglen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001185
			/* The linear area only holds the headers and the
			 * fraggap bytes; the payload is attached as pages. */
1186 alloclen = fragheaderlen + hh_len + fraggap + 15;
1187 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1188 if (unlikely(!skb)) {
1189 err = -ENOBUFS;
1190 goto error;
1191 }
1192
1193 /*
1194 * Fill in the control structures
1195 */
1196 skb->ip_summed = CHECKSUM_NONE;
1197 skb->csum = 0;
1198 skb_reserve(skb, hh_len);
1199
1200 /*
1201 * Find where to start putting bytes.
1202 */
Arnaldo Carvalho de Melo967b05f2007-03-13 13:51:52 -03001203 skb_put(skb, fragheaderlen + fraggap);
Arnaldo Carvalho de Melo2ca9e6f2007-03-10 19:15:25 -03001204 skb_reset_network_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001205 skb->transport_header = (skb->network_header +
1206 fragheaderlen);
			/* Move the fraggap bytes out of the previous skb,
			 * fix both checksums and trim the previous skb back
			 * to maxfraglen. */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001207 if (fraggap) {
Arnaldo Carvalho de Melo967b05f2007-03-13 13:51:52 -03001208 skb->csum = skb_copy_and_csum_bits(skb_prev,
1209 maxfraglen,
Arnaldo Carvalho de Melo9c702202007-04-25 18:04:18 -07001210 skb_transport_header(skb),
Arnaldo Carvalho de Melo967b05f2007-03-13 13:51:52 -03001211 fraggap, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001212 skb_prev->csum = csum_sub(skb_prev->csum,
1213 skb->csum);
Herbert Xue9fa4f72006-08-13 20:12:58 -07001214 pskb_trim_unique(skb_prev, maxfraglen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001215 }
1216
1217 /*
1218 * Put the packet on the pending queue.
1219 */
1220 __skb_queue_tail(&sk->sk_write_queue, skb);
1221 continue;
1222 }
1223
1224 i = skb_shinfo(skb)->nr_frags;
1225 if (len > size)
1226 len = size;
		/* Extend the last fragment when the page data can be
		 * coalesced with it, otherwise take a page reference and use
		 * a new fragment slot. */
1227 if (skb_can_coalesce(skb, i, page, offset)) {
1228 skb_shinfo(skb)->frags[i-1].size += len;
1229 } else if (i < MAX_SKB_FRAGS) {
1230 get_page(page);
1231 skb_fill_page_desc(skb, i, page, offset, len);
1232 } else {
1233 err = -EMSGSIZE;
1234 goto error;
1235 }
1236
		/* The checksum is accumulated in software only for
		 * CHECKSUM_NONE skbs; skb->len (before it is bumped) is the
		 * block offset of the new data. */
1237 if (skb->ip_summed == CHECKSUM_NONE) {
Al Viro44bb9362006-11-14 21:36:14 -08001238 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001239 csum = csum_page(page, offset, len);
1240 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1241 }
1242
		/* Page data grows the skb length fields and the socket's
		 * write-memory accounting. */
1243 skb->len += len;
1244 skb->data_len += len;
David S. Miller1e34a112008-01-22 23:44:31 -08001245 skb->truesize += len;
1246 atomic_add(len, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001247 offset += len;
1248 size -= len;
1249 }
1250 return 0;
1251
1252error:
	/* size now holds only the bytes that were not appended. */
1253 inet->cork.length -= size;
Pavel Emelyanov5e38e272008-07-16 20:19:49 -07001254 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001255 return err;
1256}
1257
Herbert Xu1470ddf2011-03-01 02:36:47 +00001258static void ip_cork_release(struct inet_cork *cork)
Pavel Emelyanov429f08e2007-11-05 21:03:24 -08001259{
Herbert Xu1470ddf2011-03-01 02:36:47 +00001260 cork->flags &= ~IPCORK_OPT;
1261 kfree(cork->opt);
1262 cork->opt = NULL;
1263 dst_release(cork->dst);
1264 cork->dst = NULL;
Pavel Emelyanov429f08e2007-11-05 21:03:24 -08001265}
1266
Linus Torvalds1da177e2005-04-16 15:20:36 -07001267/*
1268 * Combined all pending IP fragments on the socket as one IP datagram
1269 * and push them out.
1270 */
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001271struct sk_buff *__ip_make_skb(struct sock *sk,
1272 struct sk_buff_head *queue,
1273 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001274{
1275 struct sk_buff *skb, *tmp_skb;
1276 struct sk_buff **tail_skb;
1277 struct inet_sock *inet = inet_sk(sk);
Pavel Emelyanov0388b002008-07-14 23:00:43 -07001278 struct net *net = sock_net(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001279 struct ip_options *opt = NULL;
Herbert Xu1470ddf2011-03-01 02:36:47 +00001280 struct rtable *rt = (struct rtable *)cork->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001281 struct iphdr *iph;
Alexey Dobriyan76ab6082006-01-06 13:24:29 -08001282 __be16 df = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283 __u8 ttl;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001284
Herbert Xu1470ddf2011-03-01 02:36:47 +00001285 if ((skb = __skb_dequeue(queue)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001286 goto out;
1287 tail_skb = &(skb_shinfo(skb)->frag_list);
1288
1289 /* move skb->data to ip header from ext header */
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001290 if (skb->data < skb_network_header(skb))
Arnaldo Carvalho de Melobbe735e2007-03-10 22:16:10 -03001291 __skb_pull(skb, skb_network_offset(skb));
Herbert Xu1470ddf2011-03-01 02:36:47 +00001292 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
Arnaldo Carvalho de Melocfe1fc72007-03-16 17:26:39 -03001293 __skb_pull(tmp_skb, skb_network_header_len(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001294 *tail_skb = tmp_skb;
1295 tail_skb = &(tmp_skb->next);
1296 skb->len += tmp_skb->len;
1297 skb->data_len += tmp_skb->len;
1298 skb->truesize += tmp_skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001299 tmp_skb->destructor = NULL;
1300 tmp_skb->sk = NULL;
1301 }
1302
1303 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1304 * to fragment the frame generated here. No matter, what transforms
1305 * how transforms change size of the packet, it will come out.
1306 */
John Heffner628a5c52007-04-20 15:53:27 -07001307 if (inet->pmtudisc < IP_PMTUDISC_DO)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001308 skb->local_df = 1;
1309
1310 /* DF bit is set when we want to see DF on outgoing frames.
1311 * If local_df is set too, we still allow to fragment this frame
1312 * locally. */
John Heffner628a5c52007-04-20 15:53:27 -07001313 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
Changli Gaod8d1f302010-06-10 23:31:35 -07001314 (skb->len <= dst_mtu(&rt->dst) &&
1315 ip_dont_fragment(sk, &rt->dst)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001316 df = htons(IP_DF);
1317
Herbert Xu1470ddf2011-03-01 02:36:47 +00001318 if (cork->flags & IPCORK_OPT)
1319 opt = cork->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001320
1321 if (rt->rt_type == RTN_MULTICAST)
1322 ttl = inet->mc_ttl;
1323 else
Changli Gaod8d1f302010-06-10 23:31:35 -07001324 ttl = ip_select_ttl(inet, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001325
1326 iph = (struct iphdr *)skb->data;
1327 iph->version = 4;
1328 iph->ihl = 5;
1329 if (opt) {
1330 iph->ihl += opt->optlen>>2;
Herbert Xu1470ddf2011-03-01 02:36:47 +00001331 ip_options_build(skb, opt, cork->addr, rt, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001332 }
1333 iph->tos = inet->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001334 iph->frag_off = df;
Changli Gaod8d1f302010-06-10 23:31:35 -07001335 ip_select_ident(iph, &rt->dst, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001336 iph->ttl = ttl;
1337 iph->protocol = sk->sk_protocol;
1338 iph->saddr = rt->rt_src;
1339 iph->daddr = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001340
1341 skb->priority = sk->sk_priority;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -08001342 skb->mark = sk->sk_mark;
Eric Dumazeta21bba92008-11-24 16:07:50 -08001343 /*
1344 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1345 * on dst refcount
1346 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001347 cork->dst = NULL;
Changli Gaod8d1f302010-06-10 23:31:35 -07001348 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001349
David L Stevens96793b42007-09-17 09:57:33 -07001350 if (iph->protocol == IPPROTO_ICMP)
Pavel Emelyanov0388b002008-07-14 23:00:43 -07001351 icmp_out_count(net, ((struct icmphdr *)
David L Stevens96793b42007-09-17 09:57:33 -07001352 skb_transport_header(skb))->type);
1353
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001354 ip_cork_release(cork);
1355out:
1356 return skb;
1357}
1358
1359int ip_send_skb(struct sk_buff *skb)
1360{
1361 struct net *net = sock_net(skb->sk);
1362 int err;
1363
Herbert Xuc439cb22008-01-11 19:14:00 -08001364 err = ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001365 if (err) {
1366 if (err > 0)
Eric Dumazet6ce9e7b2009-09-02 18:05:33 -07001367 err = net_xmit_errno(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368 if (err)
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001369 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001370 }
1371
Linus Torvalds1da177e2005-04-16 15:20:36 -07001372 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001373}
1374
/*
 * ip_push_pending_frames - build the pending datagram with ip_finish_skb()
 * and send it.
 *
 * Returns 0 when nothing was pending, otherwise the result of
 * ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip_finish_skb(sk);

	/* Netfilter gets the whole, not yet fragmented, skb. */
	return skb ? ip_send_skb(skb) : 0;
}
1386
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387/*
1388 * Throw away all pending data on the socket.
1389 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001390static void __ip_flush_pending_frames(struct sock *sk,
1391 struct sk_buff_head *queue,
1392 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001393{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001394 struct sk_buff *skb;
1395
Herbert Xu1470ddf2011-03-01 02:36:47 +00001396 while ((skb = __skb_dequeue_tail(queue)) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001397 kfree_skb(skb);
1398
Herbert Xu1470ddf2011-03-01 02:36:47 +00001399 ip_cork_release(cork);
1400}
1401
1402void ip_flush_pending_frames(struct sock *sk)
1403{
1404 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405}
1406
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001407struct sk_buff *ip_make_skb(struct sock *sk,
1408 int getfrag(void *from, char *to, int offset,
1409 int len, int odd, struct sk_buff *skb),
1410 void *from, int length, int transhdrlen,
1411 struct ipcm_cookie *ipc, struct rtable **rtp,
1412 unsigned int flags)
1413{
1414 struct inet_cork cork = {};
1415 struct sk_buff_head queue;
1416 int err;
1417
1418 if (flags & MSG_PROBE)
1419 return NULL;
1420
1421 __skb_queue_head_init(&queue);
1422
1423 err = ip_setup_cork(sk, &cork, ipc, rtp);
1424 if (err)
1425 return ERR_PTR(err);
1426
1427 err = __ip_append_data(sk, &queue, &cork, getfrag,
1428 from, length, transhdrlen, flags);
1429 if (err) {
1430 __ip_flush_pending_frames(sk, &queue, &cork);
1431 return ERR_PTR(err);
1432 }
1433
1434 return __ip_make_skb(sk, &queue, &cork);
1435}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436
1437/*
1438 * Fetch data from kernel space and fill in checksum if needed.
1439 */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001440static int ip_reply_glue_bits(void *dptr, char *to, int offset,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001441 int len, int odd, struct sk_buff *skb)
1442{
Al Viro50842052006-11-14 21:36:34 -08001443 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444
1445 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1446 skb->csum = csum_block_add(skb->csum, csum, odd);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001447 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001448}
1449
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001450/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001451 * Generic function to send a packet as reply to another packet.
1452 * Used to send TCP resets so far. ICMP should use this function too.
1453 *
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001454 * Should run single threaded per socket because it uses the sock
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455 * structure to pass arguments.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001456 */
1457void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1458 unsigned int len)
1459{
1460 struct inet_sock *inet = inet_sk(sk);
1461 struct {
1462 struct ip_options opt;
1463 char data[40];
1464 } replyopts;
1465 struct ipcm_cookie ipc;
Al Viro3ca3c682006-09-27 18:28:07 -07001466 __be32 daddr;
Eric Dumazet511c3f92009-06-02 05:14:27 +00001467 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001468
1469 if (ip_options_echo(&replyopts.opt, skb))
1470 return;
1471
1472 daddr = ipc.addr = rt->rt_src;
1473 ipc.opt = NULL;
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001474 ipc.tx_flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001475
1476 if (replyopts.opt.optlen) {
1477 ipc.opt = &replyopts.opt;
1478
1479 if (ipc.opt->srr)
1480 daddr = replyopts.opt.faddr;
1481 }
1482
1483 {
Patrick McHardyf0e48db2007-06-04 21:32:46 -07001484 struct flowi fl = { .oif = arg->bound_dev_if,
Changli Gao58116622010-11-12 18:43:55 +00001485 .fl4_dst = daddr,
1486 .fl4_src = rt->rt_spec_dst,
1487 .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
1488 .fl_ip_sport = tcp_hdr(skb)->dest,
1489 .fl_ip_dport = tcp_hdr(skb)->source,
KOVACS Krisztian86b08d82008-10-01 07:44:42 -07001490 .proto = sk->sk_protocol,
1491 .flags = ip_reply_arg_flowi_flags(arg) };
Venkat Yekkiralabeb8d132006-08-04 23:12:42 -07001492 security_skb_classify_flow(skb, &fl);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001493 rt = ip_route_output_key(sock_net(sk), &fl);
1494 if (IS_ERR(rt))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001495 return;
1496 }
1497
1498 /* And let IP do all the hard work.
1499
1500 This chunk is not reenterable, hence spinlock.
1501 Note that it uses the fact, that this function is called
1502 with locally disabled BH and that sk cannot be already spinlocked.
1503 */
1504 bh_lock_sock(sk);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001505 inet->tos = ip_hdr(skb)->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001506 sk->sk_priority = skb->priority;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001507 sk->sk_protocol = ip_hdr(skb)->protocol;
Patrick McHardyf0e48db2007-06-04 21:32:46 -07001508 sk->sk_bound_dev_if = arg->bound_dev_if;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001509 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
Eric Dumazet2e77d892008-11-24 15:52:46 -08001510 &ipc, &rt, MSG_DONTWAIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1512 if (arg->csumoffset >= 0)
Arnaldo Carvalho de Melo9c702202007-04-25 18:04:18 -07001513 *((__sum16 *)skb_transport_header(skb) +
1514 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1515 arg->csum));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001516 skb->ip_summed = CHECKSUM_NONE;
1517 ip_push_pending_frames(sk);
1518 }
1519
1520 bh_unlock_sock(sk);
1521
1522 ip_rt_put(rt);
1523}
1524
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525void __init ip_init(void)
1526{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527 ip_rt_init();
1528 inet_initpeers();
1529
1530#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1531 igmp_mc_proc_init();
1532#endif
1533}