alistair23-linux/net/core/datagram.c
Herbert Xu 3305b80c21 [IP]: Simplify and consolidate MSG_PEEK error handling
When a packet is obtained from skb_recv_datagram with MSG_PEEK enabled
it is left on the socket receive queue.  This means that when we detect
a checksum error we have to be careful when trying to free the packet
as someone could have dequeued it in the time being.

Currently this delicate logic is duplicated three times between UDPv4,
UDPv6 and RAWv6.  This patch moves them into a one place and simplifies
the code somewhat.

This is based on a suggestion by Eric Dumazet.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-01-03 13:10:41 -08:00

509 lines
12 KiB
C

/*
* SUCS NET3:
*
* Generic datagram handling routines. These are generic for all
* protocols. Possibly a generic IP version on top of these would
* make sense. Not tonight however 8-).
* This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
* NetROM layer all have identical poll code and mostly
* identical recvmsg() code. So we share it here. The poll was
* shared before but buried in udp.c so I moved it.
*
* Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old
* udp.c code)
*
* Fixes:
* Alan Cox : NULL return from skb_peek_copy()
* understood
* Alan Cox : Rewrote skb_read_datagram to avoid the
* skb_peek_copy stuff.
* Alan Cox : Added support for SOCK_SEQPACKET.
* IPX can no longer use the SO_TYPE hack
* but AX.25 now works right, and SPX is
* feasible.
* Alan Cox : Fixed write poll of non IP protocol
* crash.
* Florian La Roche: Changed for my new skbuff handling.
* Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
* Linus Torvalds : BSD semantic fixes.
* Alan Cox : Datagram iovec handling
* Darryl Miles : Fixed non-blocking SOCK_STREAM.
* Alan Cox : POSIXisms
* Pete Wyckoff : Unconnected accept() fix.
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
/*
* Is a socket 'connection oriented' ?
*/
static inline int connection_based(struct sock *sk)
{
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
/*
* Wait for a packet..
*/
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
int error;
DEFINE_WAIT(wait);
prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
/* Socket errors? */
error = sock_error(sk);
if (error)
goto out_err;
if (!skb_queue_empty(&sk->sk_receive_queue))
goto out;
/* Socket shut down? */
if (sk->sk_shutdown & RCV_SHUTDOWN)
goto out_noerr;
/* Sequenced packets can come disconnected.
* If so we report the problem
*/
error = -ENOTCONN;
if (connection_based(sk) &&
!(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
goto out_err;
/* handle signals */
if (signal_pending(current))
goto interrupted;
error = 0;
*timeo_p = schedule_timeout(*timeo_p);
out:
finish_wait(sk->sk_sleep, &wait);
return error;
interrupted:
error = sock_intr_errno(*timeo_p);
out_err:
*err = error;
goto out;
out_noerr:
*err = 0;
error = 1;
goto out;
}
/**
* skb_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
* @noblock: blocking operation?
* @err: error code returned
*
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
* and possible races. This replaces identical code in packet, raw and
* udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
* the long standing peek and read race for datagram sockets. If you
* alter this routine remember it must be re-entrant.
*
* This function will lock the socket if a skb is returned, so the caller
* needs to unlock the socket in that case (usually by calling
* skb_free_datagram)
*
* * It does not lock socket since today. This function is
* * free of race conditions. This measure should/can improve
* * significantly datagram socket latencies at high loads,
* * when data copying to user space takes lots of time.
* * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
* * 8) Great win.)
* * --ANK (980729)
*
* The order of the tests when we find no data waiting are specified
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
int noblock, int *err)
{
struct sk_buff *skb;
long timeo;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
*/
int error = sock_error(sk);
if (error)
goto no_packet;
timeo = sock_rcvtimeo(sk, noblock);
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
*
* Look at current nfs client by the way...
* However, this function was corrent in any case. 8)
*/
if (flags & MSG_PEEK) {
unsigned long cpu_flags;
spin_lock_irqsave(&sk->sk_receive_queue.lock,
cpu_flags);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
atomic_inc(&skb->users);
spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
cpu_flags);
} else
skb = skb_dequeue(&sk->sk_receive_queue);
if (skb)
return skb;
/* User doesn't want to wait */
error = -EAGAIN;
if (!timeo)
goto no_packet;
} while (!wait_for_packet(sk, err, &timeo));
return NULL;
no_packet:
*err = error;
return NULL;
}
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
}
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
* @skb: datagram skbuff
* @flags: MSG_ flags
*
* This function frees a datagram skbuff that was received by
* skb_recv_datagram. The flags argument must match the one
* used for skb_recv_datagram.
*
* If the MSG_PEEK flag is set, and the packet is still on the
* receive queue of the socket, it will be taken off the queue
* before it is freed.
*
* This function currently only disables BH when acquiring the
* sk_receive_queue lock. Therefore it must not be used in a
* context where that lock is acquired in an IRQ context.
*/
void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
if (flags & MSG_PEEK) {
spin_lock_bh(&sk->sk_receive_queue.lock);
if (skb == skb_peek(&sk->sk_receive_queue)) {
__skb_unlink(skb, &sk->sk_receive_queue);
atomic_dec(&skb->users);
}
spin_unlock_bh(&sk->sk_receive_queue.lock);
}
kfree_skb(skb);
}
EXPORT_SYMBOL(skb_kill_datagram);
/**
* skb_copy_datagram_iovec - Copy a datagram to an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: io vector to copy to
* @len: amount of data to copy from buffer to iovec
*
* Note: the iovec is modified during the copy.
*/
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
struct iovec *to, int len)
{
int i, err, fraglen, end = 0;
struct sk_buff *next = skb_shinfo(skb)->frag_list;
if (!len)
return 0;
next_skb:
fraglen = skb_headlen(skb);
i = -1;
while (1) {
int start = end;
if ((end += fraglen) > offset) {
int copy = end - offset, o = offset - start;
if (copy > len)
copy = len;
if (i == -1)
err = memcpy_toiovec(to, skb->data + o, copy);
else {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
struct page *page = frag->page;
void *p = kmap(page) + frag->page_offset + o;
err = memcpy_toiovec(to, p, copy);
kunmap(page);
}
if (err)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
}
if (++i >= skb_shinfo(skb)->nr_frags)
break;
fraglen = skb_shinfo(skb)->frags[i].size;
}
if (next) {
skb = next;
BUG_ON(skb_shinfo(skb)->frag_list);
next = skb->next;
goto next_skb;
}
fault:
return -EFAULT;
}
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
u8 __user *to, int len,
unsigned int *csump)
{
int start = skb_headlen(skb);
int pos = 0;
int i, copy = start - offset;
/* Copy header. */
if (copy > 0) {
int err = 0;
if (copy > len)
copy = len;
*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
*csump, &err);
if (err)
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
to += copy;
pos = copy;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
BUG_TRAP(start <= offset + len);
end = start + skb_shinfo(skb)->frags[i].size;
if ((copy = end - offset) > 0) {
unsigned int csum2;
int err = 0;
u8 *vaddr;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
struct page *page = frag->page;
if (copy > len)
copy = len;
vaddr = kmap(page);
csum2 = csum_and_copy_to_user(vaddr +
frag->page_offset +
offset - start,
to, copy, 0, &err);
kunmap(page);
if (err)
goto fault;
*csump = csum_block_add(*csump, csum2, pos);
if (!(len -= copy))
return 0;
offset += copy;
to += copy;
pos += copy;
}
start = end;
}
if (skb_shinfo(skb)->frag_list) {
struct sk_buff *list = skb_shinfo(skb)->frag_list;
for (; list; list=list->next) {
int end;
BUG_TRAP(start <= offset + len);
end = start + list->len;
if ((copy = end - offset) > 0) {
unsigned int csum2 = 0;
if (copy > len)
copy = len;
if (skb_copy_and_csum_datagram(list,
offset - start,
to, copy,
&csum2))
goto fault;
*csump = csum_block_add(*csump, csum2, pos);
if ((len -= copy) == 0)
return 0;
offset += copy;
to += copy;
pos += copy;
}
start = end;
}
}
if (!len)
return 0;
fault:
return -EFAULT;
}
unsigned int __skb_checksum_complete(struct sk_buff *skb)
{
unsigned int sum;
sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_HW))
netdev_rx_csum_fault(skb->dev);
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);
/**
* skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
* @skb: skbuff
* @hlen: hardware length
* @iov: io vector
*
* Caller _must_ check that skb will fit to this iovec.
*
* Returns: 0 - success.
* -EINVAL - checksum failure.
* -EFAULT - fault during copy. Beware, in this case iovec
* can be modified!
*/
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
int hlen, struct iovec *iov)
{
unsigned int csum;
int chunk = skb->len - hlen;
/* Skip filled elements.
* Pretty silly, look at memcpy_toiovec, though 8)
*/
while (!iov->iov_len)
iov++;
if (iov->iov_len < chunk) {
if (__skb_checksum_complete(skb))
goto csum_error;
if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
goto fault;
} else {
csum = csum_partial(skb->data, hlen, skb->csum);
if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
chunk, &csum))
goto fault;
if ((unsigned short)csum_fold(csum))
goto csum_error;
if (unlikely(skb->ip_summed == CHECKSUM_HW))
netdev_rx_csum_fault(skb->dev);
iov->iov_len -= chunk;
iov->iov_base += chunk;
}
return 0;
csum_error:
return -EINVAL;
fault:
return -EFAULT;
}
/**
* datagram_poll - generic datagram poll
* @file: file struct
* @sock: socket
* @wait: poll table
*
* Datagram poll: Again totally generic. This also handles
* sequenced packet sockets providing the socket receive queue
* is only ever holding data ready to receive.
*
* Note: when you _don't_ use this routine for this protocol,
* and you use a different write policy from sock_writeable()
* then please supply your own write_space callback.
*/
unsigned int datagram_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
struct sock *sk = sock->sk;
unsigned int mask;
poll_wait(file, sk->sk_sleep, wait);
mask = 0;
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
mask |= POLLERR;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= POLLHUP;
/* readable? */
if (!skb_queue_empty(&sk->sk_receive_queue) ||
(sk->sk_shutdown & RCV_SHUTDOWN))
mask |= POLLIN | POLLRDNORM;
/* Connection-based need to check for termination and startup */
if (connection_based(sk)) {
if (sk->sk_state == TCP_CLOSE)
mask |= POLLHUP;
/* connection hasn't started yet? */
if (sk->sk_state == TCP_SYN_SENT)
return mask;
}
/* writable? */
if (sock_writeable(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
return mask;
}
EXPORT_SYMBOL(datagram_poll);
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
EXPORT_SYMBOL(skb_copy_datagram_iovec);
EXPORT_SYMBOL(skb_free_datagram);
EXPORT_SYMBOL(skb_recv_datagram);